Use disable argument (list) for serialization

2017-05-26 12:33:54 +02:00 · 2017-05-26 12:33:54 +02:00 · 353f0ef8d7
parent faff1c23fb
commit 353f0ef8d7
2 changed files with 97 additions and 38 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -173,13 +173,13 @@ class Language(object):
                flat_list.append(pipe)
        self.pipeline = flat_list

-    def __call__(self, text, **disabled):
+    def __call__(self, text, disable=[]):
        """'Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbtrary whitespace. Alignment into the original string
        is preserved.

        text (unicode): The text to be processed.
-        **disabled: Elements of the pipeline that should not be run.
+        disable (list): Names of the pipeline components to disable.
        RETURNS (Doc): A container for accessing the annotations.

        EXAMPLE:
@ -190,7 +190,7 @@ class Language(object):
        doc = self.make_doc(text)
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
-            if name in disabled and not disabled[name]:
+            if name in disable:
                continue
            proc(doc)
        return doc
@ -323,7 +323,7 @@ class Language(object):
            except StopIteration:
                pass

-    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
+    def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]):
        """Process texts as a stream, and yield `Doc` objects in order. Supports
        GIL-free multi-threading.

@ -331,7 +331,7 @@ class Language(object):
        n_threads (int): The number of worker threads to use. If -1, OpenMP will
            decide how many to use at run time. Default is 2.
        batch_size (int): The number of texts to buffer.
-        **disabled: Pipeline components to exclude.
+        disable (list): Names of the pipeline components to disable.
        YIELDS (Doc): Documents in the order of the original text.

        EXAMPLE:
@ -343,7 +343,7 @@ class Language(object):
        docs = texts
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
-            if name in disabled and not disabled[name]:
+            if name in disable:
                continue
            if hasattr(proc, 'pipe'):
                docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
@ -353,12 +353,14 @@ class Language(object):
        for doc in docs:
            yield doc

-    def to_disk(self, path, **exclude):
-        """Save the current state to a directory.
+    def to_disk(self, path, disable=[]):
+        """Save the current state to a directory.  If a model is loaded, this
+        will include the model.

        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or `Path`-like objects.
-        **exclude: Named attributes to prevent from being saved.
+        disable (list): Names of pipeline components to disable and prevent
+            from being saved.

        EXAMPLE:
            >>> nlp.to_disk('/path/to/models')
@ -370,7 +372,7 @@ class Language(object):
            raise IOError("Output path must be a directory")
        props = {}
        for name, value in self.__dict__.items():
-            if name in exclude:
+            if name in disable:
                continue
            if hasattr(value, 'to_disk'):
                value.to_disk(path / name)
@ -379,13 +381,14 @@ class Language(object):
        with (path / 'props.pickle').open('wb') as file_:
            dill.dump(props, file_)

-    def from_disk(self, path, **exclude):
+    def from_disk(self, path, disable=[]):
        """Loads state from a directory. Modifies the object in place and
-        returns it.
+        returns it. If the saved `Language` object contains a model, the
+        model will be loaded.

        path (unicode or Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
-        **exclude: Named attributes to prevent from being loaded.
+        disable (list): Names of the pipeline components to disable.
        RETURNS (Language): The modified `Language` object.

        EXAMPLE:
@ -394,35 +397,36 @@ class Language(object):
        """
        path = util.ensure_path(path)
        for name in path.iterdir():
-            if name not in exclude and hasattr(self, str(name)):
+            if name not in disable and hasattr(self, str(name)):
                getattr(self, name).from_disk(path / name)
        with (path / 'props.pickle').open('rb') as file_:
            bytes_data = file_.read()
-        self.from_bytes(bytes_data, **exclude)
+        self.from_bytes(bytes_data, disable)
        return self

-    def to_bytes(self, **exclude):
+    def to_bytes(self, disable=[]):
        """Serialize the current state to a binary string.

-        **exclude: Named attributes to prevent from being serialized.
+        disable (list): Names of pipeline components to disable and prevent
+            from being serialized.
        RETURNS (bytes): The serialized form of the `Language` object.
        """
        props = dict(self.__dict__)
-        for key in exclude:
+        for key in disable:
            if key in props:
                props.pop(key)
        return dill.dumps(props, -1)

-    def from_bytes(self, bytes_data, **exclude):
+    def from_bytes(self, bytes_data, disable=[]):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
-        **exclude: Named attributes to prevent from being loaded.
+        disable (list): Names of the pipeline components to disable.
        RETURNS (Language): The `Language` object.
        """
        props = dill.loads(bytes_data)
        for key, value in props.items():
-            if key not in exclude:
+            if key not in disable:
                setattr(self, key, value)
        return self

--- a/website/docs/api/language.jade
+++ b/website/docs/api/language.jade
@ -73,15 +73,26 @@ p
        +cell The text to be processed.

    +row
-        +cell #[code **disabled]
-        +cell -
-        +cell Elements of the pipeline that should not be run.
+        +cell #[code disable]
+        +cell list
+        +cell
+            |  Names of pipeline components to
+            |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable].

    +footrow
        +cell returns
        +cell #[code Doc]
        +cell A container for accessing the annotations.

+infobox("⚠️ Deprecation note")
+    .o-block
+        |  Pipeline components to prevent from being run can now be added as
+        |  a list to #[code disable], instead of specifying one keyword argument
+        |  per component.
+
+    +code-new doc = nlp(u"I don't want parsed", disable=['parser'])
+    +code-old doc = nlp(u"I don't want parsed", parse=False)
+
 +h(2, "pipe") Language.pipe
    +tag method

@ -112,6 +123,13 @@ p
        +cell int
        +cell The number of texts to buffer.

+    +row
+        +cell #[code disable]
+        +cell list
+        +cell
+            |  Names of pipeline components to
+            |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable].
+
    +footrow
        +cell yields
        +cell #[code Doc]
@ -227,8 +245,11 @@ p

 +h(2, "to_disk") Language.to_disk
    +tag method
+    +tag-new(2)

-p Save the current state to a directory.
+p
+    |  Save the current state to a directory. If a model is loaded, this will
+    |  #[strong include the model].

 +aside-code("Example").
    nlp.to_disk('/path/to/models')
@ -242,14 +263,21 @@ p Save the current state to a directory.
            |  Paths may be either strings or #[code Path]-like objects.

    +row
-        +cell #[code **exclude]
-        +cell -
-        +cell Named attributes to prevent from being saved.
+        +cell #[code disable]
+        +cell list
+        +cell
+            |  Names of pipeline components to
+            |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
+            |  and prevent from being saved.

 +h(2, "from_disk") Language.from_disk
    +tag method
+    +tag-new(2)

-p Loads state from a directory. Modifies the object in place and returns it.
+p
+    |  Loads state from a directory. Modifies the object in place and returns
+    |  it. If the saved #[code Language] object contains a model, the
+    |  #[strong model will be loaded].

 +aside-code("Example").
    from spacy.language import Language
@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it.
            |  #[code Path]-like objects.

    +row
-        +cell #[code **exclude]
-        +cell -
-        +cell Named attributes to prevent from being loaded.
+        +cell #[code disable]
+        +cell list
+        +cell
+            |  Names of pipeline components to
+            |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable].

    +footrow
        +cell returns
        +cell #[code Language]
        +cell The modified #[code Language] object.

+infobox("⚠️ Deprecation note")
+    .o-block
+        |  As of spaCy v2.0, the #[code save_to_directory] method has been
+        |  renamed to #[code to_disk], to improve consistency across classes.
+        |  Pipeline components to prevent from being loaded can now be added as
+        |  a list to #[code disable], instead of specifying one keyword argument
+        |  per component.
+
+    +code-new nlp = English().from_disk('/path/to/models', disable=['tagger', 'ner'])
+    +code-old nlp = spacy.load('en', tagger=False, entity=False)
+
 +h(2, "to_bytes") Language.to_bytes
    +tag method

@ -283,9 +324,12 @@ p Serialize the current state to a binary string.

 +table(["Name", "Type", "Description"])
    +row
-        +cell #[code **exclude]
-        +cell -
-        +cell Named attributes to prevent from being serialized.
+        +cell #[code disable]
+        +cell list
+        +cell
+            |  Names of pipeline components to
+            |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable]
+            |  and prevent from being serialized.

    +footrow
        +cell returns
@ -310,15 +354,26 @@ p Load state from a binary string.
        +cell The data to load from.

    +row
-        +cell #[code **exclude]
-        +cell -
-        +cell Named attributes to prevent from being loaded.
+        +cell #[code disable]
+        +cell list
+        +cell
+            |  Names of pipeline components to
+            |  #[+a("/docs/usage/language-processing-pipeline#disabling") disable].

    +footrow
        +cell returns
        +cell #[code Language]
        +cell The #[code Language] object.

+infobox("⚠️ Deprecation note")
+    .o-block
+        |  Pipeline components to prevent from being loaded can now be added as
+        |  a list to #[code disable], instead of specifying one keyword argument
+        |  per component.
+
+    +code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner'])
+    +code-old nlp = English().from_bytes(bytes, tagger=False, entity=False)
+
 +h(2, "attributes") Attributes

 +table(["Name", "Type", "Description"])