mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format [ci skip]
This commit is contained in:
parent bcd1b12f43
commit cd90752193
|
@ -88,12 +88,21 @@ def convert(
|
|||
msg.info("Auto-detected sentence-per-line NER format")
|
||||
converter = converter_autodetect
|
||||
else:
|
||||
msg.warn("Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert")
|
||||
msg.warn(
|
||||
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
data = func(input_data, n_sents=n_sents, seg_sents=seg_sents, use_morphology=morphology, lang=lang, model=model)
|
||||
data = func(
|
||||
input_data,
|
||||
n_sents=n_sents,
|
||||
seg_sents=seg_sents,
|
||||
use_morphology=morphology,
|
||||
lang=lang,
|
||||
model=model,
|
||||
)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
suffix = ".{}".format(file_type)
|
||||
|
@ -104,7 +113,9 @@ def convert(
|
|||
srsly.write_jsonl(output_file, data)
|
||||
elif file_type == "msg":
|
||||
srsly.write_msgpack(output_file, data)
|
||||
msg.good("Generated output file ({} documents): {}".format(len(data), output_file))
|
||||
msg.good(
|
||||
"Generated output file ({} documents): {}".format(len(data), output_file)
|
||||
)
|
||||
else:
|
||||
# Print to stdout
|
||||
if file_type == "json":
|
||||
|
|
|
@ -38,32 +38,50 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
|
|||
doc_delimiter = "-DOCSTART- -X- O O"
|
||||
# check for existing delimiters, which should be preserved
|
||||
if "\n\n" in input_data and seg_sents:
|
||||
msg.warn("Sentence boundaries found, automatic sentence segmentation with `-s` disabled.")
|
||||
msg.warn(
|
||||
"Sentence boundaries found, automatic sentence segmentation with "
|
||||
"`-s` disabled."
|
||||
)
|
||||
seg_sents = False
|
||||
if doc_delimiter in input_data and n_sents:
|
||||
msg.warn("Document delimiters found, automatic document segmentation with `-n` disabled.")
|
||||
msg.warn(
|
||||
"Document delimiters found, automatic document segmentation with "
|
||||
"`-n` disabled."
|
||||
)
|
||||
n_sents = 0
|
||||
# do document segmentation with existing sentences
|
||||
if "\n\n" in input_data and not doc_delimiter in input_data and n_sents:
|
||||
if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
|
||||
n_sents_info(msg, n_sents)
|
||||
input_data = segment_docs(input_data, n_sents, doc_delimiter)
|
||||
# do sentence segmentation with existing documents
|
||||
if not "\n\n" in input_data and doc_delimiter in input_data and seg_sents:
|
||||
if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
|
||||
input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
|
||||
# do both sentence segmentation and document segmentation according
|
||||
# to options
|
||||
if not "\n\n" in input_data and not doc_delimiter in input_data:
|
||||
if "\n\n" not in input_data and doc_delimiter not in input_data:
|
||||
# sentence segmentation required for document segmentation
|
||||
if n_sents > 0 and not seg_sents:
|
||||
msg.warn("No sentence boundaries found to use with option `-n {}`. Use `-s` to automatically segment sentences or `-n 0` to disable.".format(n_sents))
|
||||
msg.warn(
|
||||
"No sentence boundaries found to use with option `-n {}`. "
|
||||
"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
"to disable.".format(n_sents)
|
||||
)
|
||||
else:
|
||||
n_sents_info(msg, n_sents)
|
||||
input_data = segment_sents_and_docs(input_data, n_sents, doc_delimiter, model=model, msg=msg)
|
||||
input_data = segment_sents_and_docs(
|
||||
input_data, n_sents, doc_delimiter, model=model, msg=msg
|
||||
)
|
||||
# provide warnings for problematic data
|
||||
if not "\n\n" in input_data:
|
||||
msg.warn("No sentence boundaries found. Use `-s` to automatically segment sentences.")
|
||||
if not doc_delimiter in input_data:
|
||||
msg.warn("No document delimiters found. Use `-n` to automatically group sentences into documents.")
|
||||
if "\n\n" not in input_data:
|
||||
msg.warn(
|
||||
"No sentence boundaries found. Use `-s` to automatically segment "
|
||||
"sentences."
|
||||
)
|
||||
if doc_delimiter not in input_data:
|
||||
msg.warn(
|
||||
"No document delimiters found. Use `-n` to automatically group "
|
||||
"sentences into documents."
|
||||
)
|
||||
output_docs = []
|
||||
for doc in input_data.strip().split(doc_delimiter):
|
||||
doc = doc.strip()
|
||||
|
@ -78,8 +96,10 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
|
|||
cols = list(zip(*[line.split() for line in lines]))
|
||||
if len(cols) < 2:
|
||||
raise ValueError(
|
||||
"The token-per-line NER file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
"The token-per-line NER file is not formatted correctly. "
|
||||
"Try checking whitespace and delimiters. See "
|
||||
"https://spacy.io/api/cli#convert"
|
||||
)
|
||||
words = cols[0]
|
||||
iob_ents = cols[-1]
|
||||
if len(cols) > 2:
|
||||
|
@ -110,7 +130,10 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
|
|||
msg.info("Segmenting sentences with parser from model '{}'.".format(model))
|
||||
sentencizer = nlp.get_pipe("parser")
|
||||
if not sentencizer:
|
||||
msg.info("Segmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)")
|
||||
msg.info(
|
||||
"Segmenting sentences with sentencizer. (Use `-b model` for "
|
||||
"improved parser-based sentence segmentation.)"
|
||||
)
|
||||
nlp = MultiLanguage()
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
lines = doc.strip().split("\n")
|
||||
|
@ -132,7 +155,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
|
|||
def segment_docs(input_data, n_sents, doc_delimiter):
|
||||
sent_delimiter = "\n\n"
|
||||
sents = input_data.split(sent_delimiter)
|
||||
docs = [sents[i:i+n_sents] for i in range(0, len(sents), n_sents)]
|
||||
docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
|
||||
input_data = ""
|
||||
for doc in docs:
|
||||
input_data += sent_delimiter + doc_delimiter
|
||||
|
@ -143,4 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
|
|||
def n_sents_info(msg, n_sents):
|
||||
msg.info("Grouping every {} sentences into a document.".format(n_sents))
|
||||
if n_sents == 1:
|
||||
msg.warn("To generate better training data, you may want to group sentences into documents with `-n 10`.")
|
||||
msg.warn(
|
||||
"To generate better training data, you may want to group "
|
||||
"sentences into documents with `-n 10`."
|
||||
)
|
||||
|
|
|
@ -34,7 +34,7 @@ def read_iob(raw_sents):
|
|||
for line in raw_sents:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [t.split('|') for t in line.split()]
|
||||
tokens = [t.split("|") for t in line.split()]
|
||||
if len(tokens[0]) == 3:
|
||||
words, pos, iob = zip(*tokens)
|
||||
elif len(tokens[0]) == 2:
|
||||
|
|
|
@ -38,8 +38,8 @@ from . import about
|
|||
class BaseDefaults(object):
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups)
|
||||
return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
|
||||
rules, index, exc, lookup = util.get_lemma_tables(lookups)
|
||||
return Lemmatizer(index, exc, rules, lookup)
|
||||
|
||||
@classmethod
|
||||
def create_lookups(cls, nlp=None):
|
||||
|
|
|
@ -89,10 +89,7 @@ TOKEN_PATTERN_SCHEMA = {
|
|||
"title": "Fine-grained part-of-speech tag",
|
||||
"$ref": "#/definitions/string_value",
|
||||
},
|
||||
"DEP": {
|
||||
"title": "Dependency label",
|
||||
"$ref": "#/definitions/string_value"
|
||||
},
|
||||
"DEP": {"title": "Dependency label", "$ref": "#/definitions/string_value"},
|
||||
"LEMMA": {
|
||||
"title": "Lemma (base form)",
|
||||
"$ref": "#/definitions/string_value",
|
||||
|
|
|
@ -6,8 +6,13 @@ import pytest
|
|||
|
||||
@pytest.mark.parametrize(
|
||||
"text,norms,lemmas",
|
||||
[("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
|
||||
("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
|
||||
[
|
||||
("о.г.", ["ове године"], ["ова година"]),
|
||||
("чет.", ["четвртак"], ["четвртак"]),
|
||||
("гђа", ["госпођа"], ["госпођа"]),
|
||||
("ил'", ["или"], ["или"]),
|
||||
],
|
||||
)
|
||||
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
|
||||
tokens = sr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -394,7 +394,7 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
([{"IS_PUNCT": True}], "."),
|
||||
([{"IS_SPACE": True}], "\n"),
|
||||
([{"IS_BRACKET": True}], "["),
|
||||
([{"IS_QUOTE": True}], "\""),
|
||||
([{"IS_QUOTE": True}], '"'),
|
||||
([{"IS_LEFT_PUNCT": True}], "``"),
|
||||
([{"IS_RIGHT_PUNCT": True}], "''"),
|
||||
([{"IS_STOP": True}], "the"),
|
||||
|
@ -405,7 +405,7 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
)
|
||||
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
|
||||
matcher = Matcher(en_vocab)
|
||||
doc = Doc(en_vocab, words=text.split(' '))
|
||||
doc = Doc(en_vocab, words=text.split(" "))
|
||||
matcher.add("Rule", None, pattern)
|
||||
assert len(matcher) == 1
|
||||
matches = matcher(doc)
|
||||
|
|
|
@ -49,8 +49,10 @@ def test_cli_converters_iob2json():
|
|||
sent = converted[0]["paragraphs"][0]["sentences"][i]
|
||||
assert len(sent["tokens"]) == 8
|
||||
tokens = sent["tokens"]
|
||||
# fmt: off
|
||||
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
|
||||
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
|
||||
# fmt: on
|
||||
|
||||
|
||||
def test_cli_converters_conll_ner2json():
|
||||
|
@ -113,8 +115,10 @@ def test_cli_converters_conll_ner2json():
|
|||
sent = converted[0]["paragraphs"][0]["sentences"][i]
|
||||
assert len(sent["tokens"]) == 8
|
||||
tokens = sent["tokens"]
|
||||
# fmt: off
|
||||
assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
|
||||
assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
|
||||
# fmt: on
|
||||
|
||||
|
||||
def test_pretrain_make_docs():
|
||||
|
|
Loading…
Reference in New Issue