Add docstrings for parser and NER. Simplify some arguments

2020-08-09 14:46:13 +02:00 · 2020-08-09 14:46:13 +02:00 · bbd8acd4bf
parent 39a3d64c01
commit bbd8acd4bf
2 changed files with 61 additions and 12 deletions
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
    default_config={
        "moves": None,
        "update_with_oracle_cut_size": 100,
-        "multitasks": [],
        "learn_tokens": False,
        "min_action_freq": 30,
        "model": DEFAULT_PARSER_MODEL,
@ -51,17 +50,47 @@ def make_parser(
    model: Model,
    moves: Optional[list],
    update_with_oracle_cut_size: int,
-    multitasks: Iterable,
    learn_tokens: bool,
    min_action_freq: int
 ):
+    """Create a transition-based DependencyParser component. The dependency parser
+    jointly learns sentence segmentation and labelled dependency parsing, and can
+    optionally learn to merge tokens that had been over-segmented by the tokenizer.
+
+    The parser uses a variant of the non-monotonic arc-eager transition-system
+    described by Honnibal and Johnson (2014), with the addition of a "break"
+    transition to perform the sentence segmentation. Nivre's pseudo-projective
+    dependency transformation is used to allow the parser to predict
+    non-projective parses.
+
+    The parser is trained using an imitation learning objective. The parser follows
+    the actions predicted by the current weights, and at each state, determines
+    which actions are compatible with the optimal parse that could be reached
+    from the current state. The weights are updated such that the scores
+    assigned to the set of optimal actions are increased, while scores assigned
+    to other actions are decreased. Note that more than one action may be
+    optimal for a given state.
+
+    update_with_oracle_cut_size (int):
+        During training, cut long sequences into shorter segments by creating
+        intermediate states based on the gold-standard history. The model is
+        not very sensitive to this parameter, so you usually won't need to change
+        it. 100 is a good default.
+    learn_tokens (bool): Whether to learn to merge subtokens that are split
+        relative to the gold standard. Experimental.
+    min_action_freq (int): The minimum frequency of labelled actions to retain.
+        Rarer labelled actions have their label backed-off to "dep". While this
+        primarily affects the label accuracy, it can also affect the attachment
+        structure, as the labels are used to represent the pseudo-projectivity
+        transformation.
+    """
    return DependencyParser(
        nlp.vocab,
        model,
        name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
-        multitasks=multitasks,
+        multitasks=[],
        learn_tokens=learn_tokens,
        min_action_freq=min_action_freq
    )
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
    default_config={
        "moves": None,
        "update_with_oracle_cut_size": 100,
-        "multitasks": [],
-        "learn_tokens": False,
-        "min_action_freq": 30,
        "model": DEFAULT_NER_MODEL,
    },
    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
@ -50,19 +47,40 @@ def make_ner(
    model: Model,
    moves: Optional[list],
    update_with_oracle_cut_size: int,
-    multitasks: Iterable,
-    learn_tokens: bool,
-    min_action_freq: int
 ):
+    """Create a transition-based EntityRecognizer component. The entity recognizer
+    identifies non-overlapping labelled spans of tokens.
+    
+    The transition-based algorithm used encodes certain assumptions that are
+    effective for "traditional" named entity recognition tasks, but may not be
+    a good fit for every span identification problem. Specifically, the loss
+    function optimizes for whole entity accuracy, so if your inter-annotator
+    agreement on boundary tokens is low, the component will likely perform poorly
+    on your problem. The transition-based algorithm also assumes that the most
+    decisive information about your entities will be close to their initial tokens.
+    If your entities are long and characterised by tokens in their middle, the
+    component will likely do poorly on your task.
+
+    model (Model): The model for the transition-based parser. The model needs
+        to have a specific substructure of named components --- see the
+        spacy.ml.tb_framework.TransitionModel for details.
+    moves (list[str]): A list of transition names. Inferred from the data if not
+        provided.
+    update_with_oracle_cut_size (int):
+        During training, cut long sequences into shorter segments by creating
+        intermediate states based on the gold-standard history. The model is
+        not very sensitive to this parameter, so you usually won't need to change
+        it. 100 is a good default.
+    """
    return EntityRecognizer(
        nlp.vocab,
        model,
        name,
        moves=moves,
        update_with_oracle_cut_size=update_with_oracle_cut_size,
-        multitasks=multitasks,
-        learn_tokens=learn_tokens,
-        min_action_freq=min_action_freq
+        multitasks=[],
+        min_action_freq=1,
+        learn_tokens=False,
    )


@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser):
    TransitionSystem = BiluoPushDown

    def add_multitask_objective(self, mt_component):
+        """Register another component as a multi-task objective. Experimental."""
        self._multitasks.append(mt_component)

    def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+        """Set up multi-task objective components. Experimental and internal."""
        # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
        for labeller in self._multitasks:
            labeller.model.set_dim("nO", len(self.labels))