Use disable argument (list) for serialization

This commit is contained in:
ines 2017-05-26 12:33:54 +02:00
parent faff1c23fb
commit 353f0ef8d7
2 changed files with 97 additions and 38 deletions

View File

@ -173,13 +173,13 @@ class Language(object):
flat_list.append(pipe)
self.pipeline = flat_list
def __call__(self, text, **disabled):
def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
is preserved.
text (unicode): The text to be processed.
**disabled: Elements of the pipeline that should not be run.
disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations.
EXAMPLE:
@ -190,7 +190,7 @@ class Language(object):
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
proc(doc)
return doc
@ -323,7 +323,7 @@ class Language(object):
except StopIteration:
pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
@ -331,7 +331,7 @@ class Language(object):
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
**disabled: Pipeline components to exclude.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
EXAMPLE:
@ -343,7 +343,7 @@ class Language(object):
docs = texts
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
if name in disable:
continue
if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
@ -353,12 +353,14 @@ class Language(object):
for doc in docs:
yield doc
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
def to_disk(self, path, disable=[]):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
**exclude: Named attributes to prevent from being saved.
disable (list): Nameds of pipeline components to disable and prevent
from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
@ -370,7 +372,7 @@ class Language(object):
raise IOError("Output path must be a directory")
props = {}
for name, value in self.__dict__.items():
if name in exclude:
if name in disable:
continue
if hasattr(value, 'to_disk'):
value.to_disk(path / name)
@ -379,13 +381,14 @@ class Language(object):
with (path / 'props.pickle').open('wb') as file_:
dill.dump(props, file_)
def from_disk(self, path, **exclude):
def from_disk(self, path, disable=[]):
"""Loads state from a directory. Modifies the object in place and
returns it.
returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
**exclude: Named attributes to prevent from being loaded.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object.
EXAMPLE:
@ -394,35 +397,36 @@ class Language(object):
"""
path = util.ensure_path(path)
for name in path.iterdir():
if name not in exclude and hasattr(self, str(name)):
if name not in disable and hasattr(self, str(name)):
getattr(self, name).from_disk(path / name)
with (path / 'props.pickle').open('rb') as file_:
bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude)
self.from_bytes(bytes_data, disable)
return self
def to_bytes(self, **exclude):
def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
disable (list): Nameds of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
props = dict(self.__dict__)
for key in exclude:
for key in disable:
if key in props:
props.pop(key)
return dill.dumps(props, -1)
def from_bytes(self, bytes_data, **exclude):
def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
props = dill.loads(bytes_data)
for key, value in props.items():
if key not in exclude:
if key not in disable:
setattr(self, key, value)
return self

View File

@ -73,15 +73,26 @@ p
+cell The text to be processed.
+row
+cell #[code **disabled]
+cell -
+cell Elements of the pipeline that should not be run.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Doc]
+cell A container for accessing the annotations.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old doc = nlp(u"I don't want parsed", parse=False)
+h(2, "pipe") Language.pipe
+tag method
@ -112,6 +123,13 @@ p
+cell int
+cell The number of texts to buffer.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell yields
+cell #[code Doc]
@ -227,8 +245,11 @@ p
+h(2, "to_disk") Language.to_disk
+tag method
+tag-new(2)
p Save the current state to a directory.
p
| Save the current state to a directory. If a model is loaded, this will
| #[strong include the model].
+aside-code("Example").
nlp.to_disk('/path/to/models')
@ -242,14 +263,21 @@ p Save the current state to a directory.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being saved.
+h(2, "from_disk") Language.from_disk
+tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it.
p
| Loads state from a directory. Modifies the object in place and returns
| it. If the saved #[code Language] object contains a model, the
| #[strong model will be loaded].
+aside-code("Example").
from spacy.language import Language
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
| #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell The modified #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy v2.0, the #[code save_to_directory] method has been
| renamed to #[code to_disk], to improve consistency across classes.
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_disk(disable=['tagger', 'ner'])
+code-old nlp = spacy.load('en', tagger=False, entity=False)
+h(2, "to_bytes") Language.to_bytes
+tag method
@ -283,9 +324,12 @@ p Serialize the current state to a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being serialized.
+footrow
+cell returns
@ -310,15 +354,26 @@ p Load state from a binary string.
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow
+cell returns
+cell #[code Language]
+cell The #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
+code-old nlp = English().from_bytes('en', tagger=False, entity=False)
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])