mirror of https://github.com/explosion/spaCy.git
Add docstrings for parser and NER. Simplify some arguments
parent 39a3d64c01
commit bbd8acd4bf
spacy/pipeline/dep_parser.pyx
@@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
     default_config={
         "moves": None,
         "update_with_oracle_cut_size": 100,
-        "multitasks": [],
         "learn_tokens": False,
         "min_action_freq": 30,
         "model": DEFAULT_PARSER_MODEL,
@@ -51,17 +50,47 @@ def make_parser(
     model: Model,
     moves: Optional[list],
     update_with_oracle_cut_size: int,
-    multitasks: Iterable,
     learn_tokens: bool,
     min_action_freq: int
 ):
+    """Create a transition-based DependencyParser component. The dependency parser
+    jointly learns sentence segmentation and labelled dependency parsing, and can
+    optionally learn to merge tokens that had been over-segmented by the tokenizer.
+
+    The parser uses a variant of the non-monotonic arc-eager transition system
+    described by Honnibal and Johnson (2014), with the addition of a "break"
+    transition to perform the sentence segmentation. Nivre's pseudo-projective
+    dependency transformation is used to allow the parser to predict
+    non-projective parses.
+
+    The parser is trained using an imitation learning objective. The parser follows
+    the actions predicted by the current weights, and at each state, determines
+    which actions are compatible with the optimal parse that could be reached
+    from the current state. The weights are updated such that the scores assigned
+    to the set of optimal actions are increased, while scores assigned to other
+    actions are decreased. Note that more than one action may be optimal for
+    a given state.
+
+    update_with_oracle_cut_size (int):
+        During training, cut long sequences into shorter segments by creating
+        intermediate states based on the gold-standard history. The model is
+        not very sensitive to this parameter, so you usually won't need to change
+        it. 100 is a good default.
+    learn_tokens (bool): Whether to learn to merge subtokens that are split
+        relative to the gold standard. Experimental.
+    min_action_freq (int): The minimum frequency of labelled actions to retain.
+        Rarer labelled actions have their label backed-off to "dep". While this
+        primarily affects the label accuracy, it can also affect the attachment
+        structure, as the labels are used to represent the pseudo-projectivity
+        transformation.
+    """
     return DependencyParser(
         nlp.vocab,
         model,
         name,
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
-        multitasks=multitasks,
+        multitasks=[],
         learn_tokens=learn_tokens,
         min_action_freq=min_action_freq
     )
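
With this change, the parser factory's remaining settings are still overridable
through the config when the component is added to a pipeline. A minimal sketch of
that usage, assuming spaCy v3's add_pipe API; the override values shown simply
restate the defaults from the diff above:

    import spacy

    nlp = spacy.blank("en")
    # "multitasks" is no longer part of the factory config after this commit;
    # only the keys left in default_config can be overridden here.
    parser = nlp.add_pipe(
        "parser",
        config={"learn_tokens": False, "min_action_freq": 30},
    )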
spacy/pipeline/ner.pyx
@@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
     default_config={
         "moves": None,
         "update_with_oracle_cut_size": 100,
-        "multitasks": [],
-        "learn_tokens": False,
-        "min_action_freq": 30,
         "model": DEFAULT_NER_MODEL,
     },
     scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
@@ -50,19 +47,40 @@ def make_ner(
     model: Model,
     moves: Optional[list],
     update_with_oracle_cut_size: int,
-    multitasks: Iterable,
-    learn_tokens: bool,
-    min_action_freq: int
 ):
+    """Create a transition-based EntityRecognizer component. The entity recognizer
+    identifies non-overlapping labelled spans of tokens.
+
+    The transition-based algorithm used encodes certain assumptions that are
+    effective for "traditional" named entity recognition tasks, but may not be
+    a good fit for every span identification problem. Specifically, the loss
+    function optimizes for whole entity accuracy, so if your inter-annotator
+    agreement on boundary tokens is low, the component will likely perform poorly
+    on your problem. The transition-based algorithm also assumes that the most
+    decisive information about your entities will be close to their initial tokens.
+    If your entities are long and characterised by tokens in their middle, the
+    component will likely do poorly on your task.
+
+    model (Model): The model for the transition-based parser. The model needs
+        to have a specific substructure of named components --- see the
+        spacy.ml.tb_framework.TransitionModel for details.
+    moves (list[str]): A list of transition names. Inferred from the data if not
+        provided.
+    update_with_oracle_cut_size (int):
+        During training, cut long sequences into shorter segments by creating
+        intermediate states based on the gold-standard history. The model is
+        not very sensitive to this parameter, so you usually won't need to change
+        it. 100 is a good default.
+    """
     return EntityRecognizer(
         nlp.vocab,
         model,
         name,
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
-        multitasks=multitasks,
-        learn_tokens=learn_tokens,
-        min_action_freq=min_action_freq
+        multitasks=[],
+        min_action_freq=1,
+        learn_tokens=False,
     )
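
After this change, the "ner" factory exposes only "moves",
"update_with_oracle_cut_size" and "model" in its config; the removed settings are
pinned internally (multitasks=[], min_action_freq=1, learn_tokens=False). A
minimal sketch, assuming spaCy v3's add_pipe and add_label APIs:

    import spacy

    nlp = spacy.blank("en")
    # Overriding "multitasks" or "min_action_freq" here would now raise a
    # config validation error, since they were dropped from default_config.
    ner = nlp.add_pipe("ner", config={"update_with_oracle_cut_size": 100})
    ner.add_label("PERSON")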
@@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser):
     TransitionSystem = BiluoPushDown

     def add_multitask_objective(self, mt_component):
+        """Register another component as a multi-task objective. Experimental."""
         self._multitasks.append(mt_component)

     def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+        """Set up multi-task objective components. Experimental and internal."""
         # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
         for labeller in self._multitasks:
             labeller.model.set_dim("nO", len(self.labels))
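
The set_dim call in init_multitask_objectives pins each labeller's output width to
the number of NER labels. A minimal sketch of what that does at the Thinc level;
the Linear layer is an illustrative stand-in, not the actual multi-task labeller:

    from thinc.api import Linear

    layer = Linear()           # output dim "nO" starts out unset
    layer.set_dim("nO", 4)     # pin it, e.g. to len(self.labels)
    assert layer.get_dim("nO") == 4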