Add docstrings for parser and NER. Simplify some arguments

This commit is contained in:
Matthew Honnibal 2020-08-09 14:46:13 +02:00
parent 39a3d64c01
commit bbd8acd4bf
2 changed files with 61 additions and 12 deletions

View File

@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"multitasks": [],
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
@ -51,17 +50,47 @@ def make_parser(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
multitasks: Iterable,
learn_tokens: bool,
min_action_freq: int
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
optionally learn to merge tokens that had been over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using an imitation learning objective. The parser follows
the actions predicted by the current weights, and at each state, determines
which actions are compatible with the optimal parse that could be reached
from the current state. The weights such that the scores assigned to the
set of optimal actions is increased, while scores assigned to other
actions are decreased. Note that more than one action may be optimal for
a given state.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=multitasks,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
)

View File

@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"multitasks": [],
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_NER_MODEL,
},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
@ -50,19 +47,40 @@ def make_ner(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
multitasks: Iterable,
learn_tokens: bool,
min_action_freq: int
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
function optimizes for whole entity accuracy, so if your inter-annotator
agreement on boundary tokens is low, the component will likely perform poorly
on your problem. The transition-based algorithm also assumes that the most
decisive information about your entities will be close to their initial tokens.
If your entities are long and characterised by tokens in their middle, the
component will likely do poorly on your task.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
"""
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=multitasks,
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
multitasks=[],
min_action_freq=1,
learn_tokens=False,
)
@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser):
TransitionSystem = BiluoPushDown
def add_multitask_objective(self, mt_component):
"""Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
"""Setup multi-task objective components. Experimental and internal."""
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks:
labeller.model.set_dim("nO", len(self.labels))