mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format
This commit is contained in:
parent
2a4d56e730
commit
e68459296d
|
@ -7,8 +7,6 @@ import typer
|
|||
|
||||
from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
|
||||
from .. import util
|
||||
from ..lang.en import English
|
||||
from ..util import dot_to_object
|
||||
|
||||
|
||||
@debug_cli.command("model")
|
||||
|
@ -130,8 +128,8 @@ def _sentences():
|
|||
]
|
||||
|
||||
|
||||
def _get_docs():
|
||||
nlp = English()
|
||||
def _get_docs(lang: str = "en"):
|
||||
nlp = util.get_lang_class(lang)()
|
||||
return list(nlp.pipe(_sentences()))
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional, List, Dict
|
||||
from timeit import default_timer as timer
|
||||
from wasabi import Printer
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from typing import Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import tqdm
|
||||
import re
|
||||
import shutil
|
||||
import requests
|
||||
|
|
|
@ -1,14 +1,8 @@
|
|||
from .corpus import Corpus
|
||||
from .example import Example
|
||||
from .align import Alignment
|
||||
|
||||
from .iob_utils import iob_to_biluo, biluo_to_iob
|
||||
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||
from .iob_utils import spans_from_biluo_tags
|
||||
from .iob_utils import tags_to_entities
|
||||
|
||||
from .gold_io import docs_to_json
|
||||
from .gold_io import read_json_file
|
||||
|
||||
|
||||
from .batchers import minibatch_by_padded_size, minibatch_by_words
|
||||
from .corpus import Corpus # noqa: F401
|
||||
from .example import Example # noqa: F401
|
||||
from .align import Alignment # noqa: F401
|
||||
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
|
||||
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401
|
||||
from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401
|
||||
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
||||
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
||||
|
|
|
@ -3,7 +3,6 @@ from typing import Optional, Any
|
|||
from functools import partial
|
||||
import itertools
|
||||
|
||||
from .example import Example
|
||||
from ..util import registry, minibatch
|
||||
|
||||
|
||||
|
@ -41,16 +40,13 @@ def configure_minibatch_by_words(
|
|||
) -> BatcherT:
|
||||
optionals = {"get_length": get_length} if get_length is not None else {}
|
||||
return partial(
|
||||
minibatch_by_words,
|
||||
size=size,
|
||||
discard_oversize=discard_oversize,
|
||||
**optionals
|
||||
minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
|
||||
)
|
||||
|
||||
|
||||
@registry.batchers("batch_by_sequence.v1")
|
||||
def configure_minibatch(size: Sizing, get_length=None) -> BatcherT:
|
||||
optionals = ({"get_length": get_length} if get_length is not None else {})
|
||||
optionals = {"get_length": get_length} if get_length is not None else {}
|
||||
return partial(minibatch, size=size, **optionals)
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .iob2docs import iob2docs # noqa: F401
|
||||
from .conll_ner2docs import conll_ner2docs # noqa: F401
|
||||
from .json2docs import json2docs
|
||||
from .json2docs import json2docs # noqa: F401
|
||||
from .conllu2docs import conllu2docs # noqa: F401
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable, Tuple
|
||||
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
|
||||
from pathlib import Path
|
||||
import random
|
||||
|
||||
from .. import util
|
||||
from .example import Example
|
||||
|
@ -25,7 +24,7 @@ class Corpus:
|
|||
|
||||
path (Path): The directory or filename to read from.
|
||||
gold_preproc (bool): Whether to set up the Example object with gold-standard
|
||||
sentences and tokens for the predictions. Gold preprocessing helps
|
||||
sentences and tokens for the predictions. Gold preprocessing helps
|
||||
the annotations align to the tokenization, and may result in sequences
|
||||
of more consistent length. However, it may reduce run-time accuracy due
|
||||
to train/test skew. Defaults to False.
|
||||
|
@ -39,7 +38,12 @@ class Corpus:
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self, path, *, limit: int = 0, gold_preproc: bool = False, max_length: bool = False,
|
||||
self,
|
||||
path,
|
||||
*,
|
||||
limit: int = 0,
|
||||
gold_preproc: bool = False,
|
||||
max_length: bool = False,
|
||||
) -> None:
|
||||
self.path = util.ensure_path(path)
|
||||
self.gold_preproc = gold_preproc
|
||||
|
|
|
@ -80,7 +80,7 @@ def _get_transition_table(
|
|||
B_start, B_end = (0, n_labels)
|
||||
I_start, I_end = (B_end, B_end + n_labels)
|
||||
L_start, L_end = (I_end, I_end + n_labels)
|
||||
U_start, _ = (L_end, L_end + n_labels)
|
||||
U_start, _ = (L_end, L_end + n_labels) # noqa: F841
|
||||
# Using ranges allows us to set specific cells, which is necessary to express
|
||||
# that only actions of the same label are valid continuations.
|
||||
B_range = numpy.arange(B_start, B_end)
|
||||
|
|
|
@ -17,9 +17,7 @@ MatcherPatternType = List[Dict[Union[int, str], Any]]
|
|||
AttributeRulerPatternType = Dict[str, Union[MatcherPatternType, Dict, int]]
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"attribute_ruler",
|
||||
)
|
||||
@Language.factory("attribute_ruler")
|
||||
def make_attribute_ruler(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
|
@ -58,7 +56,7 @@ class AttributeRuler(Pipe):
|
|||
self.vocab = vocab
|
||||
self.matcher = Matcher(self.vocab)
|
||||
self.attrs = []
|
||||
self._attrs_unnormed = [] # store for reference
|
||||
self._attrs_unnormed = [] # store for reference
|
||||
self.indices = []
|
||||
|
||||
if pattern_dicts:
|
||||
|
|
|
@ -1,17 +1,23 @@
|
|||
from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type
|
||||
from typing import Iterable, TypeVar
|
||||
from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field, ValidationError, validator
|
||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
||||
from pydantic import root_validator
|
||||
from collections import defaultdict
|
||||
from thinc.api import Optimizer
|
||||
from pathlib import Path
|
||||
|
||||
from .attrs import NAMES
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# This lets us add type hints for mypy etc. without causing circular imports
|
||||
from .language import Language # noqa: F401
|
||||
from .gold import Example # noqa: F401
|
||||
|
||||
|
||||
ItemT = TypeVar("ItemT")
|
||||
Batcher = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
|
||||
Reader = Callable[["Language", str], Iterable["Example"]]
|
||||
|
||||
|
||||
def validate(schema: Type[BaseModel], obj: Dict[str, Any]) -> List[str]:
|
||||
|
@ -183,7 +189,6 @@ class ModelMetaSchema(BaseModel):
|
|||
# check that against this schema in the test suite to make sure it's always
|
||||
# up to date.
|
||||
|
||||
Reader = Callable[["Language", str], Iterable["Example"]]
|
||||
|
||||
class ConfigSchemaTraining(BaseModel):
|
||||
# fmt: off
|
||||
|
@ -209,7 +214,6 @@ class ConfigSchemaTraining(BaseModel):
|
|||
extra = "forbid"
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
#eval_batch_size: StrictInt = Field(..., title="Evaluation batch size")
|
||||
|
||||
class ConfigSchemaNlp(BaseModel):
|
||||
# fmt: off
|
||||
|
|
|
@ -291,6 +291,6 @@ def test_span_boundaries(doc):
|
|||
for i in range(start, end):
|
||||
assert span[i - start] == doc[i]
|
||||
with pytest.raises(IndexError):
|
||||
_ = span[-5]
|
||||
span[-5]
|
||||
with pytest.raises(IndexError):
|
||||
_ = span[5]
|
||||
span[5]
|
||||
|
|
|
@ -29,9 +29,7 @@ def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
|||
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||||
nlp = Chinese(
|
||||
meta={
|
||||
"tokenizer": {
|
||||
"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine",}
|
||||
}
|
||||
"tokenizer": {"config": {"segmenter": "pkuseg", "pkuseg_model": "medicine"}}
|
||||
}
|
||||
)
|
||||
zh_tokenizer_serialize(nlp.tokenizer)
|
||||
|
|
|
@ -21,7 +21,7 @@ re_pattern5 = "B*A*B"
|
|||
longest1 = "A A A A A"
|
||||
longest2 = "A A A A A"
|
||||
longest3 = "A A"
|
||||
longest4 = "B A A A A A B" # "FIRST" would be "B B"
|
||||
longest4 = "B A A A A A B" # "FIRST" would be "B B"
|
||||
longest5 = "B B A A A A A B"
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc
|
||||
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree
|
||||
from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle
|
||||
from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree, is_nonproj_arc
|
||||
from spacy.pipeline._parser_internals import nonproj
|
||||
|
||||
from ..util import get_doc
|
||||
|
|
|
@ -75,19 +75,18 @@ def test_attributeruler_init(nlp, pattern_dicts):
|
|||
|
||||
def test_attributeruler_init_patterns(nlp, pattern_dicts):
|
||||
# initialize with patterns
|
||||
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||
|
||||
nlp.add_pipe("attribute_ruler", config={"pattern_dicts": pattern_dicts})
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
assert doc[3].lemma_ == "cat"
|
||||
assert doc[3].morph_ == "Case=Nom|Number=Sing"
|
||||
|
||||
nlp.remove_pipe("attribute_ruler")
|
||||
|
||||
# initialize with patterns from asset
|
||||
a = nlp.add_pipe("attribute_ruler", config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}})
|
||||
|
||||
nlp.add_pipe(
|
||||
"attribute_ruler",
|
||||
config={"pattern_dicts": {"@assets": "attribute_ruler_patterns"}},
|
||||
)
|
||||
doc = nlp("This is a test.")
|
||||
assert doc[2].lemma_ == "the"
|
||||
assert doc[2].morph_ == "Case=Nom|Number=Plur"
|
||||
|
|
|
@ -117,12 +117,15 @@ def test_kb_default(nlp):
|
|||
assert len(entity_linker.kb) == 0
|
||||
assert entity_linker.kb.get_size_entities() == 0
|
||||
assert entity_linker.kb.get_size_aliases() == 0
|
||||
assert entity_linker.kb.entity_vector_length == 64 # default value from pipeline.entity_linker
|
||||
# default value from pipeline.entity_linker
|
||||
assert entity_linker.kb.entity_vector_length == 64
|
||||
|
||||
|
||||
def test_kb_custom_length(nlp):
|
||||
"""Test that the default (empty) KB can be configured with a custom entity length"""
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={"kb": {"entity_vector_length": 35}})
|
||||
entity_linker = nlp.add_pipe(
|
||||
"entity_linker", config={"kb": {"entity_vector_length": 35}}
|
||||
)
|
||||
assert len(entity_linker.kb) == 0
|
||||
assert entity_linker.kb.get_size_entities() == 0
|
||||
assert entity_linker.kb.get_size_aliases() == 0
|
||||
|
|
|
@ -117,9 +117,7 @@ def test_overfitting_IO():
|
|||
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||
|
||||
# Test scoring
|
||||
scores = nlp.evaluate(
|
||||
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
|
||||
)
|
||||
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
|
||||
assert scores["cats_f"] == 1.0
|
||||
assert scores["cats_score"] == 1.0
|
||||
assert "cats_score_desc" in scores
|
||||
|
|
|
@ -88,14 +88,9 @@ def my_parser():
|
|||
width=321,
|
||||
rows=5432,
|
||||
also_embed_subwords=True,
|
||||
also_use_static_vectors=False
|
||||
also_use_static_vectors=False,
|
||||
),
|
||||
MaxoutWindowEncoder(
|
||||
width=321,
|
||||
window_size=3,
|
||||
maxout_pieces=4,
|
||||
depth=2
|
||||
)
|
||||
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
|
||||
)
|
||||
parser = build_tb_parser_model(
|
||||
tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import spacy
|
||||
import pytest
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc, DocBin
|
||||
|
||||
|
|
|
@ -711,16 +711,18 @@ def test_alignment_different_texts():
|
|||
with pytest.raises(ValueError):
|
||||
Alignment.from_strings(other_tokens, spacy_tokens)
|
||||
|
||||
|
||||
def test_retokenized_docs(doc):
|
||||
a = doc.to_array(["TAG"])
|
||||
doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
|
||||
doc2 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a)
|
||||
example = Example(doc1, doc2)
|
||||
|
||||
assert example.get_aligned("ORTH", as_string=True) == ['Sarah', "'s", 'sister', 'flew', 'to', 'Silicon', 'Valley', 'via', 'London', '.']
|
||||
|
||||
# fmt: off
|
||||
expected1 = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
|
||||
expected2 = [None, "sister", "flew", "to", None, "via", "London", "."]
|
||||
# fmt: on
|
||||
assert example.get_aligned("ORTH", as_string=True) == expected1
|
||||
with doc1.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc1[0:2])
|
||||
retokenizer.merge(doc1[5:7])
|
||||
|
||||
assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
|
||||
assert example.get_aligned("ORTH", as_string=True) == expected2
|
||||
|
|
|
@ -24,6 +24,7 @@ def get_textcat_kwargs():
|
|||
"nO": 7,
|
||||
}
|
||||
|
||||
|
||||
def get_textcat_cnn_kwargs():
|
||||
return {
|
||||
"tok2vec": test_tok2vec(),
|
||||
|
@ -31,6 +32,7 @@ def get_textcat_cnn_kwargs():
|
|||
"nO": 13,
|
||||
}
|
||||
|
||||
|
||||
def get_all_params(model):
|
||||
params = []
|
||||
for node in model.walk():
|
||||
|
@ -59,17 +61,11 @@ def get_tok2vec_kwargs():
|
|||
# This actually creates models, so seems best to put it in a function.
|
||||
return {
|
||||
"embed": MultiHashEmbed(
|
||||
width=32,
|
||||
rows=500,
|
||||
also_embed_subwords=True,
|
||||
also_use_static_vectors=False
|
||||
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
|
||||
),
|
||||
"encode": MaxoutWindowEncoder(
|
||||
width=32,
|
||||
depth=2,
|
||||
maxout_pieces=2,
|
||||
window_size=1,
|
||||
)
|
||||
width=32, depth=2, maxout_pieces=2, window_size=1,
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -19,14 +19,9 @@ def test_empty_doc():
|
|||
width=width,
|
||||
rows=embed_size,
|
||||
also_use_static_vectors=False,
|
||||
also_embed_subwords=True
|
||||
also_embed_subwords=True,
|
||||
),
|
||||
MaxoutWindowEncoder(
|
||||
width=width,
|
||||
depth=4,
|
||||
window_size=1,
|
||||
maxout_pieces=3
|
||||
)
|
||||
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
|
||||
)
|
||||
tok2vec.initialize()
|
||||
vectors, backprop = tok2vec.begin_update([doc])
|
||||
|
@ -44,14 +39,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
|
|||
width=width,
|
||||
rows=embed_size,
|
||||
also_use_static_vectors=False,
|
||||
also_embed_subwords=True
|
||||
also_embed_subwords=True,
|
||||
),
|
||||
MaxoutWindowEncoder(
|
||||
width=width,
|
||||
depth=4,
|
||||
window_size=1,
|
||||
maxout_pieces=3,
|
||||
)
|
||||
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
|
||||
)
|
||||
tok2vec.initialize()
|
||||
vectors, backprop = tok2vec.begin_update(batch)
|
||||
|
|
|
@ -85,27 +85,24 @@ def test_util_dot_section():
|
|||
"""
|
||||
nlp_config = Config().from_str(cfg_string)
|
||||
en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True)
|
||||
|
||||
default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
|
||||
default_config["nlp"]["lang"] = "nl"
|
||||
nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True)
|
||||
|
||||
# Test that creation went OK
|
||||
assert isinstance(en_nlp, English)
|
||||
assert isinstance(nl_nlp, Dutch)
|
||||
assert nl_nlp.pipe_names == []
|
||||
assert en_nlp.pipe_names == ["textcat"]
|
||||
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False # not exclusive_classes
|
||||
|
||||
# not exclusive_classes
|
||||
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
|
||||
# Test that default values got overwritten
|
||||
assert not en_config["nlp"]["load_vocab_data"]
|
||||
assert nl_config["nlp"]["load_vocab_data"] # default value True
|
||||
|
||||
# Test proper functioning of 'dot_to_object'
|
||||
with pytest.raises(KeyError):
|
||||
obj = dot_to_object(en_config, "nlp.pipeline.tagger")
|
||||
dot_to_object(en_config, "nlp.pipeline.tagger")
|
||||
with pytest.raises(KeyError):
|
||||
obj = dot_to_object(en_config, "nlp.unknownattribute")
|
||||
dot_to_object(en_config, "nlp.unknownattribute")
|
||||
assert not dot_to_object(en_config, "nlp.load_vocab_data")
|
||||
assert dot_to_object(nl_config, "nlp.load_vocab_data")
|
||||
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple
|
||||
from typing import Iterator, Type, Pattern, Sequence, TYPE_CHECKING
|
||||
from typing import Iterator, Type, Pattern, TYPE_CHECKING
|
||||
from types import ModuleType
|
||||
import os
|
||||
import importlib
|
||||
|
@ -764,7 +764,6 @@ def normalize_slice(
|
|||
return start, stop
|
||||
|
||||
|
||||
|
||||
def filter_spans(spans: Iterable["Span"]) -> List["Span"]:
|
||||
"""Filter a sequence of spans and remove duplicates or overlaps. Useful for
|
||||
creating named entities (where one token can only be part of one entity) or
|
||||
|
@ -1113,6 +1112,3 @@ def minibatch(items, size):
|
|||
if len(batch) == 0:
|
||||
break
|
||||
yield list(batch)
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue