From bbd8acd4bfccf144938fc6d1595db47aa2d46bde Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 9 Aug 2020 14:46:13 +0200
Subject: [PATCH] Add docstrings for parser and NER. Simplify some arguments

---
 spacy/pipeline/dep_parser.pyx | 35 +++++++++++++++++++++++++++++---
 spacy/pipeline/ner.pyx        | 38 ++++++++++++++++++++++++++---------
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index f85b5626a..072c334e2 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
     default_config={
         "moves": None,
         "update_with_oracle_cut_size": 100,
-        "multitasks": [],
         "learn_tokens": False,
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
@@ -51,17 +50,47 @@ def make_parser(
     model: Model,
     moves: Optional[list],
     update_with_oracle_cut_size: int,
-    multitasks: Iterable,
     learn_tokens: bool,
     min_action_freq: int
 ):
+    """Create a transition-based DependencyParser component. The dependency parser
+    jointly learns sentence segmentation and labelled dependency parsing, and can
+    optionally learn to merge tokens that had been over-segmented by the tokenizer.
+
+    The parser uses a variant of the non-monotonic arc-eager transition system
+    described by Honnibal and Johnson (2014), with the addition of a "break"
+    transition to perform the sentence segmentation. Nivre's pseudo-projective
+    dependency transformation is used to allow the parser to predict
+    non-projective parses.
+
+    The parser is trained using an imitation learning objective: the parser
+    follows the actions predicted by the current weights, and at each state,
+    determines which actions are compatible with the optimal parse that could
+    be reached from the current state. The weights are updated such that the
+    scores assigned to the set of optimal actions are increased, while scores
+    assigned to other actions are decreased. Note that more than one action
+    may be optimal for a given state.
+
+    update_with_oracle_cut_size (int):
+        During training, cut long sequences into shorter segments by creating
+        intermediate states based on the gold-standard history. The model is
+        not very sensitive to this parameter, so you usually won't need to change
+        it. 100 is a good default.
+    learn_tokens (bool): Whether to learn to merge subtokens that are split
+        relative to the gold standard. Experimental.
+    min_action_freq (int): The minimum frequency of labelled actions to retain.
+        Rarer labelled actions have their label backed off to "dep". While this
+        primarily affects the label accuracy, it can also affect the attachment
+        structure, as the labels are used to represent the pseudo-projectivity
+        transformation.
+
+ """ return DependencyParser( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, - multitasks=multitasks, + multitasks=[], learn_tokens=learn_tokens, min_action_freq=min_action_freq ) diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index d13152a4f..a3bc3d920 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] default_config={ "moves": None, "update_with_oracle_cut_size": 100, - "multitasks": [], - "learn_tokens": False, - "min_action_freq": 30, "model": DEFAULT_NER_MODEL, }, scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], @@ -50,19 +47,40 @@ def make_ner( model: Model, moves: Optional[list], update_with_oracle_cut_size: int, - multitasks: Iterable, - learn_tokens: bool, - min_action_freq: int ): + """Create a transition-based EntityRecognizer component. The entity recognizer + identifies non-overlapping labelled spans of tokens. + + The transition-based algorithm used encodes certain assumptions that are + effective for "traditional" named entity recognition tasks, but may not be + a good fit for every span identification problem. Specifically, the loss + function optimizes for whole entity accuracy, so if your inter-annotator + agreement on boundary tokens is low, the component will likely perform poorly + on your problem. The transition-based algorithm also assumes that the most + decisive information about your entities will be close to their initial tokens. + If your entities are long and characterised by tokens in their middle, the + component will likely do poorly on your task. + + model (Model): The model for the transition-based parser. The model needs + to have a specific substructure of named components --- see the + spacy.ml.tb_framework.TransitionModel for details. + moves (list[str]): A list of transition names. Inferred from the data if not + provided. + update_with_oracle_cut_size (int): + During training, cut long sequences into shorter segments by creating + intermediate states based on the gold-standard history. The model is + not very sensitive to this parameter, so you usually won't need to change + it. 100 is a good default. + """ return EntityRecognizer( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, - multitasks=multitasks, - learn_tokens=learn_tokens, - min_action_freq=min_action_freq + multitasks=[], + min_action_freq=1, + learn_tokens=False, ) @@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser): TransitionSystem = BiluoPushDown def add_multitask_objective(self, mt_component): + """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + """Setup multi-task objective components. Experimental and internal.""" # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels))