From 03fb498dbeb86e52b9b3e487ab8edfd836b53660 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Thu, 10 Mar 2016 13:01:34 +0100
Subject: [PATCH] introduce lang field for LexemeC to hold language id

put noun_chunk logic into iterators.py for each language separately
---
 bin/init_model.py                             |  4 +--
 setup.py                                      |  5 +--
 spacy/attrs.pxd                               | 20 +++++------
 spacy/attrs.pyx                               |  9 ++---
 .../{tokens/npchunks.pxd => de/iterators.pxd} |  0
 .../{tokens/npchunks.pyx => de/iterators.pyx} | 34 +++----------------
 spacy/en/iterators.pxd                        |  0
 spacy/en/iterators.pyx                        | 24 +++++++++++++
 spacy/language.py                             |  9 ++---
 spacy/lexeme.pxd                              |  6 +++-
 spacy/lexeme.pyx                              | 16 ++++++---
 spacy/orth.pyx                                |  6 ++--
 spacy/structs.pxd                             |  2 ++
 spacy/tokens/doc.pyx                          | 18 +++++++---
 spacy/tokens/token.pyx                        | 16 ++++++---
 spacy/vocab.pyx                               |  2 ++
 16 files changed, 101 insertions(+), 70 deletions(-)
 rename spacy/{tokens/npchunks.pxd => de/iterators.pxd} (100%)
 rename spacy/{tokens/npchunks.pyx => de/iterators.pyx} (53%)
 create mode 100644 spacy/en/iterators.pxd
 create mode 100644 spacy/en/iterators.pyx
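Usage sketch (not part of the diff below). The spacy.en.English entry point
and the example sentence are assumptions for illustration, and running this
requires installed English model data:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')

    # every lexeme now records which language created it: token.lang is
    # the integer symbol, token.lang_ the string form (here u'en')
    print(doc[0].lang_)

    # Doc.noun_chunks now dispatches on the language of each sentence's
    # root token instead of assuming English
    for np in doc.noun_chunks:
        print(np)
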
diff --git a/bin/init_model.py b/bin/init_model.py
index 19cfcdc25..5e62a7faf 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
     else:
         file_ = loc.open()
     for i, line in enumerate(file_):
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         freq = int(freq)
         counts.inc(i+1, freq)
         total += freq
@@ -121,7 +121,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
         file_ = loc.open()
     probs = {}
     for line in file_:
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         doc_freq = int(doc_freq)
         freq = int(freq)
         if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
diff --git a/setup.py b/setup.py
index de7d95d22..7449212b9 100644
--- a/setup.py
+++ b/setup.py
@@ -56,14 +56,15 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
-    'spacy.tokens.npchunks',
     'spacy.serialize.packer',
     'spacy.serialize.huffman',
     'spacy.serialize.bits',
     'spacy.cfile',
     'spacy.matcher',
     'spacy.syntax.ner',
-    'spacy.symbols']
+    'spacy.symbols',
+    'spacy.en.iterators',
+    'spacy.de.iterators']


 # By subclassing build_extensions we have the actual compiler that will be used
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 61a00ba1b..a878a49d8 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -14,12 +14,12 @@ cpdef enum attr_id_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
-
-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT
+
+    FLAG18 = 18
     FLAG19
     FLAG20
     FLAG21
@@ -85,11 +85,7 @@ cpdef enum attr_id_t:
     HEAD
     SPACY
     PROB
+
+    LANG

-# Move these up to FLAG14--FLAG18 once we finish the functionality and
-# are ready to regenerate the model
-#IS_BRACKET
-#IS_QUOTE
-#IS_LEFT_PUNCT
-#IS_RIGHT_PUNCT
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 146f3ab26..9a191beda 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -13,10 +13,10 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "FLAG18": FLAG18,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
@@ -83,6 +83,7 @@ IDS = {
     "HEAD": HEAD,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,
 }

 # ATTR IDs, in order of the symbol
diff --git a/spacy/tokens/npchunks.pxd b/spacy/de/iterators.pxd
similarity index 100%
rename from spacy/tokens/npchunks.pxd
rename to spacy/de/iterators.pxd
diff --git a/spacy/tokens/npchunks.pyx b/spacy/de/iterators.pyx
similarity index 53%
rename from spacy/tokens/npchunks.pyx
rename to spacy/de/iterators.pyx
index 0c5ca32a5..a6321bd57 100644
--- a/spacy/tokens/npchunks.pyx
+++ b/spacy/de/iterators.pyx
@@ -1,31 +1,9 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span

-from ..structs cimport TokenC
-from .doc cimport Doc
-from .span cimport Span
+from spacy.parts_of_speech cimport NOUN

-from ..parts_of_speech cimport NOUN, PROPN, PRON
-
-def english(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
-
-def german(Span sent):
+def noun_chunks(Span sent):
     # this function extracts spans headed by NOUNs starting from the left-most
     # syntactic dependent until the NOUN itself
     # for close apposition and measurement construction, the span is sometimes
@@ -48,7 +26,3 @@
             if rdep.pos == NOUN and rdep.dep == close_app:
                 rbracket = rdep.i+1
         yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
-
-
-
-
diff --git a/spacy/en/iterators.pxd b/spacy/en/iterators.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/en/iterators.pyx b/spacy/en/iterators.pyx
new file mode 100644
index 000000000..e4f0fe2a4
--- /dev/null
+++ b/spacy/en/iterators.pyx
@@ -0,0 +1,24 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+
+from spacy.parts_of_speech cimport NOUN
+
+def noun_chunks(Span sent):
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+    np_deps = [strings[label] for label in labels]
+    conj = strings['conj']
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+        elif word.pos == NOUN and word.dep == conj:
+            head = word+word.head
+            while head.dep == conj and head.head < 0:
+                head += head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+
diff --git a/spacy/language.py b/spacy/language.py
index 4df34d956..f186c2f2b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -69,6 +69,7 @@ class Language(object):
             attrs.SUFFIX: cls.suffix,
             attrs.CLUSTER: cls.cluster,
             attrs.PROB: lambda string: oov_prob,
+            attrs.LANG: lambda string: cls.lang,
             attrs.IS_ALPHA: orth.is_alpha,
             attrs.IS_ASCII: orth.is_ascii,
             attrs.IS_DIGIT: cls.is_digit,
@@ -77,10 +78,10 @@ class Language(object):
             attrs.IS_SPACE: cls.is_space,
             attrs.IS_TITLE: orth.is_title,
             attrs.IS_UPPER: orth.is_upper,
-            attrs.FLAG14: orth.is_bracket,
-            attrs.FLAG15: orth.is_quote,
-            attrs.FLAG16: orth.is_left_punct,
-            attrs.FLAG17: orth.is_right_punct,
+            attrs.IS_BRACKET: orth.is_bracket,
+            attrs.IS_QUOTE: orth.is_quote,
+            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
             attrs.LIKE_URL: orth.like_url,
             attrs.LIKE_NUM: orth.like_number,
             attrs.LIKE_EMAIL: orth.like_email,
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 6fc25efb6..12d4e3de3 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,6 +1,6 @@
 from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
 from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG

 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -41,6 +41,8 @@ cdef class Lexeme:
             lex.suffix = value
         elif name == CLUSTER:
             lex.cluster = value
+        elif name == LANG:
+            lex.lang = value

     @staticmethod
     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -67,6 +69,8 @@ cdef class Lexeme:
             return lex.length
         elif feat_name == CLUSTER:
             return lex.cluster
+        elif feat_name == LANG:
+            return lex.lang
         else:
             return 0
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 1aec4a018..4e0f2cf2e 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -18,10 +18,10 @@ import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport FLAG14 as IS_BRACKET
-from .attrs cimport FLAG15 as IS_QUOTE
-from .attrs cimport FLAG16 as IS_LEFT_PUNCT
-from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from .attrs cimport IS_BRACKET
+from .attrs cimport IS_QUOTE
+from .attrs cimport IS_LEFT_PUNCT
+from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV


@@ -123,6 +123,10 @@ cdef class Lexeme:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x

+    property lang:
+        def __get__(self): return self.c.lang
+        def __set__(self, int x): self.c.lang = x
+
     property prob:
         def __get__(self): return self.c.prob
         def __set__(self, float x): self.c.prob = x
@@ -147,6 +151,10 @@ cdef class Lexeme:
         def __get__(self): return self.vocab.strings[self.c.suffix]
         def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

+    property lang_:
+        def __get__(self): return self.vocab.strings[self.c.lang]
+        def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
+
     property flags:
         def __get__(self): return self.c.flags
         def __set__(self, flags_t x): self.c.flags = x
diff --git a/spacy/orth.pyx b/spacy/orth.pyx
index 418c3cfd4..0f30c1136 100644
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@@ -40,17 +40,17 @@ cpdef bint is_bracket(unicode string):


 cpdef bint is_quote(unicode string):
-    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯',"''",'``')
     return string in quotes


 cpdef bint is_left_punct(unicode string):
-    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
     return string in left_punct


 cpdef bint is_right_punct(unicode string):
-    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
     return string in right_punct


diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 733ce3022..f7e6b1ec7 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -9,6 +9,8 @@ cdef struct LexemeC:

     flags_t flags

+    attr_t lang
+
     attr_t id
     attr_t length

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index fa45c8b3e..887b1085f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -8,6 +8,7 @@ import struct
 cimport numpy as np
 import math
 import six
+import warnings

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@@ -23,7 +24,6 @@ from .token cimport Token
 from ..serialize.bits cimport BitArray

 from ..util import normalize_slice
-import npchunks


 DEF PADDING = 5
@@ -241,11 +241,21 @@ cdef class Doc:
                     "\npython -m spacy.en.download all\n"
                     "to install the data")

-        chunk_rules = {'en':npchunks.english, 'de':npchunks.german}
+        from spacy.en.iterators import noun_chunks as en_noun_chunks
+        from spacy.de.iterators import noun_chunks as de_noun_chunks
+
+        chunk_rules = {'en': en_noun_chunks,
+                       'de': de_noun_chunks,
+                      }

         for sent in self.sents:
-            lang = 'en' # todo: make dependent on language of root token
-            for chunk in chunk_rules.get(lang)(sent):
+            lang = sent.root.lang_
+            chunker = chunk_rules.get(lang)
+            if chunker is None:
+                warnings.warn("noun_chunks is not available for language %s." % lang)
+                continue
+
+            for chunk in chunker(sent):
                 yield chunk


diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 0ff574f1b..17d756b3e 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport FLAG14 as IS_BRACKET
-from ..attrs cimport FLAG15 as IS_QUOTE
-from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
-from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from ..attrs cimport IS_BRACKET
+from ..attrs cimport IS_QUOTE
+from ..attrs cimport IS_LEFT_PUNCT
+from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV

@@ -95,6 +95,10 @@ cdef class Token:
         def __get__(self):
             return self.c.lex.prob

+    property lang:
+        def __get__(self):
+            return self.c.lex.lang
+
     property idx:
         def __get__(self):
             return self.c.idx
@@ -310,6 +314,10 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

+    property lang_:
+        def __get__(self):
+            return self.vocab.strings[self.c.lex.lang]
+
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index f876bfefb..df8a4bbd5 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -246,6 +246,7 @@ cdef class Vocab:
             fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
             fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
+            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()

     def load_lexemes(self, loc):
@@ -278,6 +279,7 @@ cdef class Vocab:
             fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
             fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
             lexeme.vector = EMPTY_VEC
             py_str = self.strings[lexeme.orth]
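
Companion sketch for the German iterator (not part of the patch; the
spacy.de.German entry point is assumed, and no German model data ships
with this change). It illustrates the close apposition / measurement
construction described in the comments of spacy/de/iterators.pyx:

    from spacy.de import German

    nlp = German()
    doc = nlp(u'Ich trinke eine Tasse Tee.')

    # the sentence root's lang_ is u'de', so Doc.noun_chunks dispatches
    # to spacy.de.iterators.noun_chunks
    for np in doc.noun_chunks:
        print(np)

    # expected: u'eine Tasse Tee' is yielded as one chunk, extended over
    # the measurement construction instead of stopping at u'eine Tasse'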