diff --git a/.gitignore b/.gitignore index 14097dfcd..6afa40f38 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,6 @@ Desktop.ini # Other *.tgz + +# Pycharm project files +*.idea diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py index 1544e1d5e..4e5640f0d 100644 --- a/examples/vectors_fast_text.py +++ b/examples/vectors_fast_text.py @@ -29,7 +29,7 @@ def main(vectors_loc, lang=None): nr_row, nr_dim = header.split() nlp.vocab.reset_vectors(width=int(nr_dim)) for line in file_: - line = line.decode('utf8') + line = line.rstrip().decode('utf8') pieces = line.rsplit(' ', nr_dim) word = pieces[0] vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py index 49c8ef6ab..402dc3c91 100644 --- a/spacy/lang/en/norm_exceptions.py +++ b/spacy/lang/en/norm_exceptions.py @@ -459,6 +459,8 @@ _exc = { "disorganised": "disorganized", "distil": "distill", "distils": "distills", + "doin": "doing", + "doin'": "doing", "dramatisation": "dramatization", "dramatisations": "dramatizations", "dramatise": "dramatize", @@ -687,6 +689,8 @@ _exc = { "globalises": "globalizes", "globalising": "globalizing", "glueing ": "gluing ", + "goin": "going", + "goin'":"going", "goitre": "goiter", "goitres": "goiters", "gonorrhoea": "gonorrhea", @@ -733,6 +737,8 @@ _exc = { "harmonised": "harmonized", "harmonises": "harmonizes", "harmonising": "harmonizing", + "havin": "having", + "havin'": "having", "homoeopath": "homeopath", "homoeopathic": "homeopathic", "homoeopaths": "homeopaths", @@ -924,6 +930,8 @@ _exc = { "localised": "localized", "localises": "localizes", "localising": "localizing", + "lovin": "loving", + "lovin'": "loving", "louvre": "louver", "louvred": "louvered", "louvres": "louvers ", diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index e870307af..064b7ea59 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -387,6 +387,21 @@ for exc_data in [ {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"}, {ORTH: "lovin'", LEMMA: "love", NORM: "loving"}, {ORTH: "Lovin'", LEMMA: "love", NORM: "loving"}, + {ORTH: "lovin", LEMMA: "love", NORM: "loving"}, + {ORTH: "Lovin", LEMMA: "love", NORM: "loving"}, + {ORTH: "havin'", LEMMA: "have", NORM: "having"}, + {ORTH: "Havin'", LEMMA: "have", NORM: "having"}, + {ORTH: "havin", LEMMA: "have", NORM: "having"}, + {ORTH: "Havin", LEMMA: "have", NORM: "having"}, + {ORTH: "doin'", LEMMA: "do", NORM: "doing"}, + {ORTH: "Doin'", LEMMA: "do", NORM: "doing"}, + {ORTH: "doin", LEMMA: "do", NORM: "doing"}, + {ORTH: "Doin", LEMMA: "do", NORM: "doing"}, + {ORTH: "goin'", LEMMA: "go", NORM: "going"}, + {ORTH: "Goin'", LEMMA: "go", NORM: "going"}, + {ORTH: "goin", LEMMA: "go", NORM: "going"}, + {ORTH: "Goin", LEMMA: "go", NORM: "going"}, + {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"}, {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"}, diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 04cc013a4..3b67c5489 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -21,8 +21,25 @@ class JapaneseTokenizer(object): words = [x.surface for x in self.tokenizer.tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) + # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to + # allow serialization (see #1557) + def to_bytes(self, **exclude): + return b'' + + def from_bytes(self, bytes_data, **exclude): + return self + + def to_disk(self, path, **exclude): + return None + + def from_disk(self, path, **exclude): + return self + class JapaneseDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ja' + @classmethod def create_tokenizer(cls, nlp=None): return JapaneseTokenizer(cls, nlp) diff --git a/website/api/cli.jade b/website/api/cli.jade index b0913e60c..cb98ec279 100644 --- a/website/api/cli.jade +++ b/website/api/cli.jade @@ -257,7 +257,7 @@ p +row +cell #[code dev_data] +cell positional - +cell Location of JSON-formatted dev data (optional). + +cell Location of JSON-formatted development data for evaluation. +row +cell #[code --n-iter], #[code -n] diff --git a/website/api/span.jade b/website/api/span.jade index 266518076..dcfd49140 100644 --- a/website/api/span.jade +++ b/website/api/span.jade @@ -562,7 +562,7 @@ p +cell #[code orth_] +cell unicode +cell - | Verbatim text content (identical to #[code Span.text]). Existst + | Verbatim text content (identical to #[code Span.text]). Exists | mostly for consistency with the other attributes. +row