From 353f0ef8d750b0b96867e1e3f4922389ab8329bb Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:33:54 +0200 Subject: [PATCH] Use disable argument (list) for serialization --- spacy/language.py | 46 ++++++++++-------- website/docs/api/language.jade | 89 +++++++++++++++++++++++++++------- 2 files changed, 97 insertions(+), 38 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index b20bb4617..39e60c017 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -173,13 +173,13 @@ class Language(object): flat_list.append(pipe) self.pipeline = flat_list - def __call__(self, text, **disabled): + def __call__(self, text, disable=[]): """'Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. - **disabled: Elements of the pipeline that should not be run. + disable (list): Names of the pipeline components to disable. RETURNS (Doc): A container for accessing the annotations. EXAMPLE: @@ -190,7 +190,7 @@ class Language(object): doc = self.make_doc(text) for proc in self.pipeline: name = getattr(proc, 'name', None) - if name in disabled and not disabled[name]: + if name in disable: continue proc(doc) return doc @@ -323,7 +323,7 @@ class Language(object): except StopIteration: pass - def pipe(self, texts, n_threads=2, batch_size=1000, **disabled): + def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]): """Process texts as a stream, and yield `Doc` objects in order. Supports GIL-free multi-threading. @@ -331,7 +331,7 @@ class Language(object): n_threads (int): The number of worker threads to use. If -1, OpenMP will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. - **disabled: Pipeline components to exclude. + disable (list): Names of the pipeline components to disable. YIELDS (Doc): Documents in the order of the original text. 
EXAMPLE: @@ -343,7 +343,7 @@ class Language(object): docs = texts for proc in self.pipeline: name = getattr(proc, 'name', None) - if name in disabled and not disabled[name]: + if name in disable: continue if hasattr(proc, 'pipe'): docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) @@ -353,12 +353,14 @@ class Language(object): for doc in docs: yield doc - def to_disk(self, path, **exclude): - """Save the current state to a directory. + def to_disk(self, path, disable=[]): + """Save the current state to a directory. If a model is loaded, this + will include the model. path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. - **exclude: Named attributes to prevent from being saved. + disable (list): Names of pipeline components to disable and prevent + from being saved. EXAMPLE: >>> nlp.to_disk('/path/to/models') @@ -370,7 +372,7 @@ class Language(object): raise IOError("Output path must be a directory") props = {} for name, value in self.__dict__.items(): - if name in exclude: + if name in disable: continue if hasattr(value, 'to_disk'): value.to_disk(path / name) @@ -379,13 +381,14 @@ class Language(object): with (path / 'props.pickle').open('wb') as file_: dill.dump(props, file_) - def from_disk(self, path, **exclude): + def from_disk(self, path, disable=[]): + """Loads state from a directory. Modifies the object in place and - returns it. + returns it. If the saved `Language` object contains a model, the + model will be loaded. path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. - **exclude: Named attributes to prevent from being loaded. + disable (list): Names of the pipeline components to disable. RETURNS (Language): The modified `Language` object. 
EXAMPLE: @@ -394,35 +397,36 @@ class Language(object): """ path = util.ensure_path(path) for name in path.iterdir(): - if name not in exclude and hasattr(self, str(name)): + if name not in disable and hasattr(self, str(name)): getattr(self, name).from_disk(path / name) with (path / 'props.pickle').open('rb') as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **exclude) + self.from_bytes(bytes_data, disable) return self - def to_bytes(self, **exclude): + def to_bytes(self, disable=[]): """Serialize the current state to a binary string. - **exclude: Named attributes to prevent from being serialized. + disable (list): Names of pipeline components to disable and prevent + from being serialized. RETURNS (bytes): The serialized form of the `Language` object. """ props = dict(self.__dict__) - for key in exclude: + for key in disable: if key in props: props.pop(key) return dill.dumps(props, -1) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, disable=[]): """Load state from a binary string. bytes_data (bytes): The data to load from. - **exclude: Named attributes to prevent from being loaded. + disable (list): Names of the pipeline components to disable. RETURNS (Language): The `Language` object. """ props = dill.loads(bytes_data) for key, value in props.items(): - if key not in exclude: + if key not in disable: setattr(self, key, value) return self diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 455165bca..a22bee5f1 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -73,15 +73,26 @@ p +cell The text to be processed. +row - +cell #[code **disabled] - +cell - - +cell Elements of the pipeline that should not be run. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Doc] +cell A container for accessing the annotations. 
++infobox("⚠️ Deprecation note") + .o-block + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old doc = nlp(u"I don't want parsed", parse=False) + +h(2, "pipe") Language.pipe +tag method @@ -112,6 +123,13 @@ p +cell int +cell The number of texts to buffer. + +row + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + +footrow +cell yields +cell #[code Doc] @@ -227,8 +245,11 @@ p +h(2, "to_disk") Language.to_disk +tag method + +tag-new(2) -p Save the current state to a directory. +p + | Save the current state to a directory. If a model is loaded, this will + | #[strong include the model]. +aside-code("Example"). nlp.to_disk('/path/to/models') @@ -242,14 +263,21 @@ p Save the current state to a directory. | Paths may be either strings or #[code Path]-like objects. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being saved. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | and prevent from being saved. +h(2, "from_disk") Language.from_disk +tag method + +tag-new(2) -p Loads state from a directory. Modifies the object in place and returns it. +p + | Loads state from a directory. Modifies the object in place and returns + | it. If the saved #[code Language] object contains a model, the + | #[strong model will be loaded]. +aside-code("Example"). from spacy.language import Language @@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it. | #[code Path]-like objects. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. 
+ +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Language] +cell The modified #[code Language] object. ++infobox("⚠️ Deprecation note") + .o-block + | As of spaCy v2.0, the #[code save_to_directory] method has been + | renamed to #[code to_disk], to improve consistency across classes. + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new nlp = English().from_disk('/path/to/models', disable=['tagger', 'ner']) + +code-old nlp = spacy.load('en', tagger=False, entity=False) + +h(2, "to_bytes") Language.to_bytes +tag method @@ -283,9 +324,12 @@ p Serialize the current state to a binary string. +table(["Name", "Type", "Description"]) +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being serialized. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | and prevent from being serialized. +footrow +cell returns @@ -310,15 +354,26 @@ p Load state from a binary string. +cell The data to load from. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Language] +cell The #[code Language] object. ++infobox("⚠️ Deprecation note") + .o-block + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. 
+ + +code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner']) + +code-old nlp = English().from_bytes(bytes, tagger=False, entity=False) + +h(2, "attributes") Attributes +table(["Name", "Type", "Description"])