Use disable argument (list) for serialization

This commit is contained in:
ines 2017-05-26 12:33:54 +02:00
parent faff1c23fb
commit 353f0ef8d7
2 changed files with 97 additions and 38 deletions

View File

@ -173,13 +173,13 @@ class Language(object):
flat_list.append(pipe) flat_list.append(pipe)
self.pipeline = flat_list self.pipeline = flat_list
def __call__(self, text, **disabled): def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences, """'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string and can contain arbitrary whitespace. Alignment into the original string
is preserved. is preserved.
text (unicode): The text to be processed. text (unicode): The text to be processed.
**disabled: Elements of the pipeline that should not be run. disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations. RETURNS (Doc): A container for accessing the annotations.
EXAMPLE: EXAMPLE:
@ -190,7 +190,7 @@ class Language(object):
doc = self.make_doc(text) doc = self.make_doc(text)
for proc in self.pipeline: for proc in self.pipeline:
name = getattr(proc, 'name', None) name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]: if name in disable:
continue continue
proc(doc) proc(doc)
return doc return doc
@ -323,7 +323,7 @@ class Language(object):
except StopIteration: except StopIteration:
pass pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled): def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports """Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading. GIL-free multi-threading.
@ -331,7 +331,7 @@ class Language(object):
n_threads (int): The number of worker threads to use. If -1, OpenMP will n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2. decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer. batch_size (int): The number of texts to buffer.
**disabled: Pipeline components to exclude. disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text. YIELDS (Doc): Documents in the order of the original text.
EXAMPLE: EXAMPLE:
@ -343,7 +343,7 @@ class Language(object):
docs = texts docs = texts
for proc in self.pipeline: for proc in self.pipeline:
name = getattr(proc, 'name', None) name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]: if name in disable:
continue continue
if hasattr(proc, 'pipe'): if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
@ -353,12 +353,14 @@ class Language(object):
for doc in docs: for doc in docs:
yield doc yield doc
def to_disk(self, path, **exclude): def to_disk(self, path, disable=[]):
"""Save the current state to a directory. """Save the current state to a directory. If a model is loaded, this
will include the model.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be either strings or `Path`-like objects.
**exclude: Named attributes to prevent from being saved. disable (list): Names of pipeline components to disable and prevent
from being saved.
EXAMPLE: EXAMPLE:
>>> nlp.to_disk('/path/to/models') >>> nlp.to_disk('/path/to/models')
@ -370,7 +372,7 @@ class Language(object):
raise IOError("Output path must be a directory") raise IOError("Output path must be a directory")
props = {} props = {}
for name, value in self.__dict__.items(): for name, value in self.__dict__.items():
if name in exclude: if name in disable:
continue continue
if hasattr(value, 'to_disk'): if hasattr(value, 'to_disk'):
value.to_disk(path / name) value.to_disk(path / name)
@ -379,13 +381,14 @@ class Language(object):
with (path / 'props.pickle').open('wb') as file_: with (path / 'props.pickle').open('wb') as file_:
dill.dump(props, file_) dill.dump(props, file_)
def from_disk(self, path, **exclude): def from_disk(self, path, disable=[]):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects. strings or `Path`-like objects.
**exclude: Named attributes to prevent from being loaded. disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object. RETURNS (Language): The modified `Language` object.
EXAMPLE: EXAMPLE:
@ -394,35 +397,36 @@ class Language(object):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
for name in path.iterdir(): for name in path.iterdir():
if name not in exclude and hasattr(self, str(name)): if name not in disable and hasattr(self, str(name)):
getattr(self, name).from_disk(path / name) getattr(self, name).from_disk(path / name)
with (path / 'props.pickle').open('rb') as file_: with (path / 'props.pickle').open('rb') as file_:
bytes_data = file_.read() bytes_data = file_.read()
self.from_bytes(bytes_data, **exclude) self.from_bytes(bytes_data, disable)
return self return self
def to_bytes(self, **exclude): def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized. disable (list): Names of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object. RETURNS (bytes): The serialized form of the `Language` object.
""" """
props = dict(self.__dict__) props = dict(self.__dict__)
for key in exclude: for key in disable:
if key in props: if key in props:
props.pop(key) props.pop(key)
return dill.dumps(props, -1) return dill.dumps(props, -1)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string. """Load state from a binary string.
bytes_data (bytes): The data to load from. bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded. disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object. RETURNS (Language): The `Language` object.
""" """
props = dill.loads(bytes_data) props = dill.loads(bytes_data)
for key, value in props.items(): for key, value in props.items():
if key not in exclude: if key not in disable:
setattr(self, key, value) setattr(self, key, value)
return self return self

View File

@ -73,15 +73,26 @@ p
+cell The text to be processed. +cell The text to be processed.
+row +row
+cell #[code **disabled] +cell #[code disable]
+cell - +cell list
+cell Elements of the pipeline that should not be run. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell returns +cell returns
+cell #[code Doc] +cell #[code Doc]
+cell A container for accessing the annotations. +cell A container for accessing the annotations.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being run can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new doc = nlp(u"I don't want parsed", disable=['parser'])
+code-old doc = nlp(u"I don't want parsed", parse=False)
+h(2, "pipe") Language.pipe +h(2, "pipe") Language.pipe
+tag method +tag method
@ -112,6 +123,13 @@ p
+cell int +cell int
+cell The number of texts to buffer. +cell The number of texts to buffer.
+row
+cell #[code disable]
+cell list
+cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell yields +cell yields
+cell #[code Doc] +cell #[code Doc]
@ -227,8 +245,11 @@ p
+h(2, "to_disk") Language.to_disk +h(2, "to_disk") Language.to_disk
+tag method +tag method
+tag-new(2)
p Save the current state to a directory. p
| Save the current state to a directory. If a model is loaded, this will
| #[strong include the model].
+aside-code("Example"). +aside-code("Example").
nlp.to_disk('/path/to/models') nlp.to_disk('/path/to/models')
@ -242,14 +263,21 @@ p Save the current state to a directory.
| Paths may be either strings or #[code Path]-like objects. | Paths may be either strings or #[code Path]-like objects.
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being saved. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being saved.
+h(2, "from_disk") Language.from_disk +h(2, "from_disk") Language.from_disk
+tag method +tag method
+tag-new(2)
p Loads state from a directory. Modifies the object in place and returns it. p
| Loads state from a directory. Modifies the object in place and returns
| it. If the saved #[code Language] object contains a model, the
| #[strong model will be loaded].
+aside-code("Example"). +aside-code("Example").
from spacy.language import Language from spacy.language import Language
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
| #[code Path]-like objects. | #[code Path]-like objects.
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being loaded. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell The modified #[code Language] object. +cell The modified #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| As of spaCy v2.0, the #[code save_to_directory] method has been
| renamed to #[code to_disk], to improve consistency across classes.
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_disk('/path/to/models', disable=['tagger', 'ner'])
+code-old nlp = spacy.load('en', tagger=False, entity=False)
+h(2, "to_bytes") Language.to_bytes +h(2, "to_bytes") Language.to_bytes
+tag method +tag method
@ -283,9 +324,12 @@ p Serialize the current state to a binary string.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being serialized. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
| and prevent from being serialized.
+footrow +footrow
+cell returns +cell returns
@ -310,15 +354,26 @@ p Load state from a binary string.
+cell The data to load from. +cell The data to load from.
+row +row
+cell #[code **exclude] +cell #[code disable]
+cell - +cell list
+cell Named attributes to prevent from being loaded. +cell
| Names of pipeline components to
| #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+footrow +footrow
+cell returns +cell returns
+cell #[code Language] +cell #[code Language]
+cell The #[code Language] object. +cell The #[code Language] object.
+infobox("⚠️ Deprecation note")
.o-block
| Pipeline components to prevent from being loaded can now be added as
| a list to #[code disable], instead of specifying one keyword argument
| per component.
+code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
+code-old nlp = English().from_bytes(bytes, tagger=False, entity=False)
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])