mirror of https://github.com/explosion/spaCy.git
Update UD bin scripts
* Update imports for `bin/`
* Add all currently supported languages
* Update subtok merger for new Matcher validation
* Modify blinded check to look at tokens instead of lemmas (for corpora that have tokens but no lemmas, such as Telugu)
This commit is contained in:
parent
04d36d2471
commit
ede32c01e2
|
@ -7,14 +7,16 @@ import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
from spacy.cli.ud import conll17_ud_eval
|
import conll17_ud_eval
|
||||||
from spacy.cli.ud.ud_train import write_conllu
|
from ud_train import write_conllu
|
||||||
from spacy.lang.lex_attrs import word_shape
|
from spacy.lang.lex_attrs import word_shape
|
||||||
from spacy.util import get_lang_class
|
from spacy.util import get_lang_class
|
||||||
|
|
||||||
# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
|
# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
|
||||||
ALL_LANGUAGES = "ar, ca, da, de, el, en, es, fa, fi, fr, ga, he, hi, hr, hu, id, " \
|
ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
|
||||||
"it, ja, no, nl, pl, pt, ro, ru, sv, tr, ur, vi, zh"
|
"ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
|
||||||
|
"nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
|
||||||
|
"tr, tt, uk, ur, vi, zh")
|
||||||
|
|
||||||
# Non-parsing tasks that will be evaluated (works for default models)
|
# Non-parsing tasks that will be evaluated (works for default models)
|
||||||
EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
|
EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
|
||||||
|
@ -73,10 +75,10 @@ def _contains_blinded_text(stats_xml):
|
||||||
tree = ET.parse(stats_xml)
|
tree = ET.parse(stats_xml)
|
||||||
root = tree.getroot()
|
root = tree.getroot()
|
||||||
total_tokens = int(root.find('size/total/tokens').text)
|
total_tokens = int(root.find('size/total/tokens').text)
|
||||||
unique_lemmas = int(root.find('lemmas').get('unique'))
|
unique_forms = int(root.find('forms').get('unique'))
|
||||||
|
|
||||||
# assume the corpus is largely blinded when there are less than 1% unique tokens
|
# assume the corpus is largely blinded when there are less than 1% unique tokens
|
||||||
return (unique_lemmas / total_tokens) < 0.01
|
return (unique_forms / total_tokens) < 0.01
|
||||||
|
|
||||||
|
|
||||||
def fetch_all_treebanks(ud_dir, languages, corpus, best_per_language):
|
def fetch_all_treebanks(ud_dir, languages, corpus, best_per_language):
|
||||||
|
@ -262,22 +264,26 @@ def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_train
|
||||||
if not exclude_trained_models:
|
if not exclude_trained_models:
|
||||||
if 'de' in models:
|
if 'de' in models:
|
||||||
models['de'].append(load_model('de_core_news_sm'))
|
models['de'].append(load_model('de_core_news_sm'))
|
||||||
if 'es' in models:
|
models['de'].append(load_model('de_core_news_md'))
|
||||||
models['es'].append(load_model('es_core_news_sm'))
|
if 'el' in models:
|
||||||
models['es'].append(load_model('es_core_news_md'))
|
models['el'].append(load_model('el_core_news_sm'))
|
||||||
if 'pt' in models:
|
models['el'].append(load_model('el_core_news_md'))
|
||||||
models['pt'].append(load_model('pt_core_news_sm'))
|
|
||||||
if 'it' in models:
|
|
||||||
models['it'].append(load_model('it_core_news_sm'))
|
|
||||||
if 'nl' in models:
|
|
||||||
models['nl'].append(load_model('nl_core_news_sm'))
|
|
||||||
if 'en' in models:
|
if 'en' in models:
|
||||||
models['en'].append(load_model('en_core_web_sm'))
|
models['en'].append(load_model('en_core_web_sm'))
|
||||||
models['en'].append(load_model('en_core_web_md'))
|
models['en'].append(load_model('en_core_web_md'))
|
||||||
models['en'].append(load_model('en_core_web_lg'))
|
models['en'].append(load_model('en_core_web_lg'))
|
||||||
|
if 'es' in models:
|
||||||
|
models['es'].append(load_model('es_core_news_sm'))
|
||||||
|
models['es'].append(load_model('es_core_news_md'))
|
||||||
if 'fr' in models:
|
if 'fr' in models:
|
||||||
models['fr'].append(load_model('fr_core_news_sm'))
|
models['fr'].append(load_model('fr_core_news_sm'))
|
||||||
models['fr'].append(load_model('fr_core_news_md'))
|
models['fr'].append(load_model('fr_core_news_md'))
|
||||||
|
if 'it' in models:
|
||||||
|
models['it'].append(load_model('it_core_news_sm'))
|
||||||
|
if 'nl' in models:
|
||||||
|
models['nl'].append(load_model('nl_core_news_sm'))
|
||||||
|
if 'pt' in models:
|
||||||
|
models['pt'].append(load_model('pt_core_news_sm'))
|
||||||
|
|
||||||
with out_path.open(mode='w', encoding='utf-8') as out_file:
|
with out_path.open(mode='w', encoding='utf-8') as out_file:
|
||||||
run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
|
run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
|
||||||
|
|
|
@ -109,15 +109,13 @@ def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
|
matches = []
|
||||||
|
if doc.is_parsed:
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
for span in spans:
|
for span in spans:
|
||||||
retokenizer.merge(span)
|
retokenizer.merge(span)
|
||||||
# TODO: This shouldn't be necessary? Should be handled in merge
|
|
||||||
for word in doc:
|
|
||||||
if word.i == word.head.i:
|
|
||||||
word.dep_ = "ROOT"
|
|
||||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||||
|
|
|
@ -25,7 +25,7 @@ import itertools
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
import conll17_ud_eval
|
||||||
|
|
||||||
from spacy import lang
|
from spacy import lang
|
||||||
from spacy.lang import zh
|
from spacy.lang import zh
|
||||||
|
@ -214,6 +214,8 @@ def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
|
matches = []
|
||||||
|
if doc.is_parsed:
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
with doc.retokenize() as retokenizer:
|
with doc.retokenize() as retokenizer:
|
||||||
|
@ -298,9 +300,9 @@ def get_token_conllu(token, i):
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
|
||||||
Token.set_extension("begins_fused", default=False)
|
Token.set_extension("begins_fused", default=False, force=True)
|
||||||
Token.set_extension("inside_fused", default=False)
|
Token.set_extension("inside_fused", default=False, force=True)
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
|
|
Loading…
Reference in New Issue