diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 4acda30ba..67f97f632 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -88,12 +88,21 @@ def convert( msg.info("Auto-detected sentence-per-line NER format") converter = converter_autodetect else: - msg.warn("Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert") + msg.warn( + "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert" + ) if converter not in CONVERTERS: msg.fail("Can't find converter for {}".format(converter), exits=1) # Use converter function to convert data func = CONVERTERS[converter] - data = func(input_data, n_sents=n_sents, seg_sents=seg_sents, use_morphology=morphology, lang=lang, model=model) + data = func( + input_data, + n_sents=n_sents, + seg_sents=seg_sents, + use_morphology=morphology, + lang=lang, + model=model, + ) if output_dir != "-": # Export data to a file suffix = ".{}".format(file_type) @@ -104,7 +113,9 @@ def convert( srsly.write_jsonl(output_file, data) elif file_type == "msg": srsly.write_msgpack(output_file, data) - msg.good("Generated output file ({} documents): {}".format(len(data), output_file)) + msg.good( + "Generated output file ({} documents): {}".format(len(data), output_file) + ) else: # Print to stdout if file_type == "json": diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index 97ef8e0f4..a3a37d6c9 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -38,32 +38,50 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs doc_delimiter = "-DOCSTART- -X- O O" # check for existing delimiters, which should be preserved if "\n\n" in input_data and seg_sents: - msg.warn("Sentence boundaries found, automatic sentence segmentation with `-s` disabled.") + msg.warn( + "Sentence boundaries found, automatic sentence segmentation with " + "`-s` disabled." + ) seg_sents = False if doc_delimiter in input_data and n_sents: - msg.warn("Document delimiters found, automatic document segmentation with `-n` disabled.") + msg.warn( + "Document delimiters found, automatic document segmentation with " + "`-n` disabled." + ) n_sents = 0 # do document segmentation with existing sentences - if "\n\n" in input_data and not doc_delimiter in input_data and n_sents: + if "\n\n" in input_data and doc_delimiter not in input_data and n_sents: n_sents_info(msg, n_sents) input_data = segment_docs(input_data, n_sents, doc_delimiter) # do sentence segmentation with existing documents - if not "\n\n" in input_data and doc_delimiter in input_data and seg_sents: + if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents: input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg) # do both sentence segmentation and document segmentation according # to options - if not "\n\n" in input_data and not doc_delimiter in input_data: + if "\n\n" not in input_data and doc_delimiter not in input_data: # sentence segmentation required for document segmentation if n_sents > 0 and not seg_sents: - msg.warn("No sentence boundaries found to use with option `-n {}`. Use `-s` to automatically segment sentences or `-n 0` to disable.".format(n_sents)) + msg.warn( + "No sentence boundaries found to use with option `-n {}`. " + "Use `-s` to automatically segment sentences or `-n 0` " + "to disable.".format(n_sents) + ) else: n_sents_info(msg, n_sents) - input_data = segment_sents_and_docs(input_data, n_sents, doc_delimiter, model=model, msg=msg) + input_data = segment_sents_and_docs( + input_data, n_sents, doc_delimiter, model=model, msg=msg + ) # provide warnings for problematic data - if not "\n\n" in input_data: - msg.warn("No sentence boundaries found. Use `-s` to automatically segment sentences.") - if not doc_delimiter in input_data: - msg.warn("No document delimiters found. Use `-n` to automatically group sentences into documents.") + if "\n\n" not in input_data: + msg.warn( + "No sentence boundaries found. Use `-s` to automatically segment " + "sentences." + ) + if doc_delimiter not in input_data: + msg.warn( + "No document delimiters found. Use `-n` to automatically group " + "sentences into documents." + ) output_docs = [] for doc in input_data.strip().split(doc_delimiter): doc = doc.strip() @@ -78,8 +96,10 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs cols = list(zip(*[line.split() for line in lines])) if len(cols) < 2: raise ValueError( - "The token-per-line NER file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert" - ) + "The token-per-line NER file is not formatted correctly. " + "Try checking whitespace and delimiters. See " + "https://spacy.io/api/cli#convert" + ) words = cols[0] iob_ents = cols[-1] if len(cols) > 2: @@ -110,7 +130,10 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): msg.info("Segmenting sentences with parser from model '{}'.".format(model)) sentencizer = nlp.get_pipe("parser") if not sentencizer: - msg.info("Segmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)") + msg.info( + "Segmenting sentences with sentencizer. (Use `-b model` for " + "improved parser-based sentence segmentation.)" + ) nlp = MultiLanguage() sentencizer = nlp.create_pipe("sentencizer") lines = doc.strip().split("\n") @@ -132,7 +155,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): def segment_docs(input_data, n_sents, doc_delimiter): sent_delimiter = "\n\n" sents = input_data.split(sent_delimiter) - docs = [sents[i:i+n_sents] for i in range(0, len(sents), n_sents)] + docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)] input_data = "" for doc in docs: input_data += sent_delimiter + doc_delimiter @@ -143,4 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter): def n_sents_info(msg, n_sents): msg.info("Grouping every {} sentences into a document.".format(n_sents)) if n_sents == 1: - msg.warn("To generate better training data, you may want to group sentences into documents with `-n 10`.") + msg.warn( + "To generate better training data, you may want to group " + "sentences into documents with `-n 10`." + ) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 826609bc4..fabf2ae26 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -34,7 +34,7 @@ def read_iob(raw_sents): for line in raw_sents: if not line.strip(): continue - tokens = [t.split('|') for t in line.split()] + tokens = [t.split("|") for t in line.split()] if len(tokens[0]) == 3: words, pos, iob = zip(*tokens) elif len(tokens[0]) == 2: diff --git a/spacy/language.py b/spacy/language.py index 86acf0257..10381573d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -38,8 +38,8 @@ from . import about class BaseDefaults(object): @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): - lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups) - return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup) + rules, index, exc, lookup = util.get_lemma_tables(lookups) + return Lemmatizer(index, exc, rules, lookup) @classmethod def create_lookups(cls, nlp=None): diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 3c2127c31..471e2b7b5 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -89,10 +89,7 @@ TOKEN_PATTERN_SCHEMA = { "title": "Fine-grained part-of-speech tag", "$ref": "#/definitions/string_value", }, - "DEP": { - "title": "Dependency label", - "$ref": "#/definitions/string_value" - }, + "DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"}, "LEMMA": { "title": "Lemma (base form)", "$ref": "#/definitions/string_value", diff --git a/spacy/tests/lang/sr/test_еxceptions.py b/spacy/tests/lang/sr/test_еxceptions.py index 136c995ab..285e99996 100644 --- a/spacy/tests/lang/sr/test_еxceptions.py +++ b/spacy/tests/lang/sr/test_еxceptions.py @@ -6,8 +6,13 @@ import pytest @pytest.mark.parametrize( "text,norms,lemmas", - [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]), - ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])]) + [ + ("о.г.", ["ове године"], ["ова година"]), + ("чет.", ["четвртак"], ["четвртак"]), + ("гђа", ["госпођа"], ["госпођа"]), + ("ил'", ["или"], ["или"]), + ], +) def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas): tokens = sr_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index ccbc7c57e..df35a1be2 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -394,7 +394,7 @@ def test_attr_pipeline_checks(en_vocab): ([{"IS_PUNCT": True}], "."), ([{"IS_SPACE": True}], "\n"), ([{"IS_BRACKET": True}], "["), - ([{"IS_QUOTE": True}], "\""), + ([{"IS_QUOTE": True}], '"'), ([{"IS_LEFT_PUNCT": True}], "``"), ([{"IS_RIGHT_PUNCT": True}], "''"), ([{"IS_STOP": True}], "the"), @@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab): ) def test_matcher_schema_token_attributes(en_vocab, pattern, text): matcher = Matcher(en_vocab) - doc = Doc(en_vocab, words=text.split(' ')) + doc = Doc(en_vocab, words=text.split(" ")) matcher.add("Rule", None, pattern) assert len(matcher) == 1 matches = matcher(doc) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3ea9d1b0c..6dce649a9 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -49,8 +49,10 @@ def test_cli_converters_iob2json(): sent = converted[0]["paragraphs"][0]["sentences"][i] assert len(sent["tokens"]) == 8 tokens = sent["tokens"] + # fmt: off assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."] assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"] + # fmt: on def test_cli_converters_conll_ner2json(): @@ -113,8 +115,10 @@ def test_cli_converters_conll_ner2json(): sent = converted[0]["paragraphs"][0]["sentences"][i] assert len(sent["tokens"]) == 8 tokens = sent["tokens"] + # fmt: off assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."] assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"] + # fmt: on def test_pretrain_make_docs():