diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index c87f40680..afc4491cb 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -1,18 +1,13 @@
 from __future__ import unicode_literals
 import plac
 import json
-from os import path
-import shutil
-import os
 import random
-import io
 import pathlib
 
 from spacy.tokens import Doc
 from spacy.syntax.nonproj import PseudoProjectivity
 from spacy.language import Language
 from spacy.gold import GoldParse
-from spacy.vocab import Vocab
 from spacy.tagger import Tagger
 from spacy.pipeline import DependencyParser, BeamDependencyParser
 from spacy.syntax.parser import get_templates
@@ -23,7 +18,6 @@
 import spacy.attrs
 import io
 
-
 def read_conllx(loc, n=0):
     with io.open(loc, 'r', encoding='utf8') as file_:
         text = file_.read()
@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
                 lines.pop(0)
             tokens = []
             for line in lines:
-                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+                id_, word, lemma, pos, tag, morph, head, dep, _1, \
+                    _2 = line.split('\t')
                 if '-' in id_ or '.' in id_:
                     continue
                 try:
@@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
         random.shuffle(train_sents)
         scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
         print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
-    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
+    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
     scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
     print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx
index ceebe2e59..d5d4bf353 100644
--- a/spacy/cfile.pyx
+++ b/spacy/cfile.pyx
@@ -1,4 +1,4 @@
-from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy
 
 
diff --git a/spacy/en/morph_rules.py b/spacy/en/morph_rules.py
index 2b8aae823..51a50736e 100644
--- a/spacy/en/morph_rules.py
+++ b/spacy/en/morph_rules.py
@@ -21,7 +21,6 @@ MORPH_RULES = {
         "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"},
         "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
-        "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"},
         "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
         "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
         "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
diff --git a/spacy/fi/tokenizer_exceptions.py b/spacy/fi/tokenizer_exceptions.py
index 52ea7428a..09775a2f4 100644
--- a/spacy/fi/tokenizer_exceptions.py
+++ b/spacy/fi/tokenizer_exceptions.py
@@ -193,9 +190,6 @@ TOKENIZER_EXCEPTIONS = {
     "vm.": [
         {ORTH: "vm.", LEMMA: "viimeksi mainittu"}
     ],
-    "siht.": [
-        {ORTH: "siht.", LEMMA: "sihteeri"}
-    ],
     "srk.": [
         {ORTH: "srk.", LEMMA: "seurakunta"}
     ]
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 358412fab..471018109 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -1,16 +1,12 @@
 # cython: profile=True
 from __future__ import unicode_literals, print_function
 
-import numpy
 import io
 import json
-import random
 import re
 import os
 from os import path
 
-from libc.string cimport memset
-
 import ujson as json
 
 from .syntax import nonproj
diff --git a/spacy/language.py b/spacy/language.py
index 573bb5a86..4542eae3b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,6 +1,5 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
-from warnings import warn
 import pathlib
 from contextlib import contextmanager
 import shutil
@@ -33,7 +32,6 @@
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
-from .pipeline import BeamDependencyParser, BeamEntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 5c52ae9d0..1883ae89a 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -2,13 +2,10 @@
 # cython: infer_types=True
 from __future__ import unicode_literals
 
-from os import path
-
 from .typedefs cimport attr_t
 from .typedefs cimport hash_t
 from .attrs cimport attr_id_t
-from .structs cimport TokenC, LexemeC
-from .lexeme cimport Lexeme
+from .structs cimport TokenC
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@@ -17,7 +14,7 @@
 from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t
-from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
+from .attrs cimport ID, ENT_TYPE
 from . import attrs
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 26405e988..e98ef1e92 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,12 +1,8 @@
 # cython: infer_types
 from __future__ import unicode_literals
 
-from os import path
-
 from libc.string cimport memset
 
-from .lemmatizer import Lemmatizer
-
 try:
     import ujson as json
 except ImportError:
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 59e1994a9..b2d622329 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -2,7 +2,6 @@ from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .vocab cimport Vocab
 from .tagger import Tagger
 
 # TODO: The disorganization here is pretty embarrassing. At least it's only
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 1f6b587c5..4a2ef082a 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -1,20 +1,16 @@
 import json
 import pathlib
 from collections import defaultdict
 
-from libc.string cimport memset
 
 from cymem.cymem cimport Pool
-from thinc.typedefs cimport atom_t, weight_t
+from thinc.typedefs cimport atom_t
 from thinc.extra.eg cimport Example
 from thinc.structs cimport ExampleC
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 
-from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
-from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .gold cimport GoldParse
 from .attrs cimport *
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 5a4eb844a..42f090cde 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,13 +1,10 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
-import re
 import pathlib
 
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 
-from cpython cimport Py_UNICODE_ISSPACE
-
 try:
     import ujson as json
diff --git a/spacy/util.py b/spacy/util.py
index 1f1cdbb6e..893ba87c1 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -8,11 +8,8 @@
 import os.path
 import pathlib
 import sys
 
-import six
 import textwrap
 
-from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
-
 try:
     basestring