Merge branch 'master' of https://github.com/explosion/spaCy

2017-11-15 13:11:43 +01:00 · 2017-11-15 13:11:43 +01:00 · b797dca977
parent 86ddf692a1 9177c7d7aa
commit b797dca977
7 changed files with 46 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -97,3 +97,6 @@ Desktop.ini

 # Other
 *.tgz
+
+# Pycharm project files
+*.idea
--- a/examples/vectors_fast_text.py
+++ b/examples/vectors_fast_text.py
@ -29,7 +29,7 @@ def main(vectors_loc, lang=None):
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
-            line = line.decode('utf8')
+            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', nr_dim)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
--- a/spacy/lang/en/norm_exceptions.py
+++ b/spacy/lang/en/norm_exceptions.py
@ -459,6 +459,8 @@ _exc = {
    "disorganised": "disorganized",
    "distil": "distill",
    "distils": "distills",
+    "doin": "doing",
+    "doin'": "doing",
    "dramatisation": "dramatization",
    "dramatisations": "dramatizations",
    "dramatise": "dramatize",
@ -687,6 +689,8 @@ _exc = {
    "globalises": "globalizes",
    "globalising": "globalizing",
    "glueing ": "gluing ",
+    "goin": "going",
+    "goin'": "going",
    "goitre": "goiter",
    "goitres": "goiters",
    "gonorrhoea": "gonorrhea",
@ -733,6 +737,8 @@ _exc = {
    "harmonised": "harmonized",
    "harmonises": "harmonizes",
    "harmonising": "harmonizing",
+    "havin": "having",
+    "havin'": "having",
    "homoeopath": "homeopath",
    "homoeopathic": "homeopathic",
    "homoeopaths": "homeopaths",
@ -924,6 +930,8 @@ _exc = {
    "localised": "localized",
    "localises": "localizes",
    "localising": "localizing",
+    "lovin": "loving",
+    "lovin'": "loving",
    "louvre": "louver",
    "louvred": "louvered",
    "louvres": "louvers ",
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -387,6 +387,21 @@ for exc_data in [
    {ORTH: "O'clock", LEMMA: "o'clock", NORM: "o'clock"},
    {ORTH: "lovin'", LEMMA: "love", NORM: "loving"},
    {ORTH: "Lovin'", LEMMA: "love", NORM: "loving"},
+    {ORTH: "lovin", LEMMA: "love", NORM: "loving"},
+    {ORTH: "Lovin", LEMMA: "love", NORM: "loving"},
+    {ORTH: "havin'", LEMMA: "have", NORM: "having"},
+    {ORTH: "Havin'", LEMMA: "have", NORM: "having"},
+    {ORTH: "havin", LEMMA: "have", NORM: "having"},
+    {ORTH: "Havin", LEMMA: "have", NORM: "having"},
+    {ORTH: "doin'", LEMMA: "do", NORM: "doing"},
+    {ORTH: "Doin'", LEMMA: "do", NORM: "doing"},
+    {ORTH: "doin", LEMMA: "do", NORM: "doing"},
+    {ORTH: "Doin", LEMMA: "do", NORM: "doing"},
+    {ORTH: "goin'", LEMMA: "go", NORM: "going"},
+    {ORTH: "Goin'", LEMMA: "go", NORM: "going"},
+    {ORTH: "goin", LEMMA: "go", NORM: "going"},
+    {ORTH: "Goin", LEMMA: "go", NORM: "going"},
+

    {ORTH: "Mt.", LEMMA: "Mount", NORM: "Mount"},
    {ORTH: "Ak.", LEMMA: "Alaska", NORM: "Alaska"},
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@ -21,8 +21,25 @@ class JapaneseTokenizer(object):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

+    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
+    # allow serialization (see #1557)
+    def to_bytes(self, **exclude):
+        return b''
+
+    def from_bytes(self, bytes_data, **exclude):
+        return self
+
+    def to_disk(self, path, **exclude):
+        return None
+
+    def from_disk(self, path, **exclude):
+        return self
+

 class JapaneseDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'ja'
+
    @classmethod
    def create_tokenizer(cls, nlp=None):
        return JapaneseTokenizer(cls, nlp)
--- a/website/api/cli.jade
+++ b/website/api/cli.jade
@ -257,7 +257,7 @@ p
    +row
        +cell #[code dev_data]
        +cell positional
-        +cell Location of JSON-formatted dev data (optional).
+        +cell Location of JSON-formatted development data for evaluation.

    +row
        +cell #[code --n-iter], #[code -n]
--- a/website/api/span.jade
+++ b/website/api/span.jade
@ -562,7 +562,7 @@ p
        +cell #[code orth_]
        +cell unicode
        +cell
-            |  Verbatim text content (identical to #[code Span.text]). Existst
+            |  Verbatim text content (identical to #[code Span.text]). Exists
            |  mostly for consistency with the other attributes.

    +row