From 03fb498dbeb86e52b9b3e487ab8edfd836b53660 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Thu, 10 Mar 2016 13:01:34 +0100
Subject: [PATCH] introduce lang field for LexemeC to hold language id

put noun_chunk logic into iterators.py for each language separately
---
 bin/init_model.py                             |  4 +--
 setup.py                                      |  5 +--
 spacy/attrs.pxd                               | 20 +++++------
 spacy/attrs.pyx                               |  9 ++---
 .../{tokens/npchunks.pxd => de/iterators.pxd} |  0
 .../{tokens/npchunks.pyx => de/iterators.pyx} | 34 +++----------------
 spacy/en/iterators.pxd                        |  0
 spacy/en/iterators.pyx                        | 24 +++++++++++++
 spacy/language.py                             |  9 ++---
 spacy/lexeme.pxd                              |  6 +++-
 spacy/lexeme.pyx                              | 16 ++++++---
 spacy/orth.pyx                                |  6 ++--
 spacy/structs.pxd                             |  2 ++
 spacy/tokens/doc.pyx                          | 18 +++++++---
 spacy/tokens/token.pyx                        | 16 ++++++---
 spacy/vocab.pyx                               |  2 ++
 16 files changed, 101 insertions(+), 70 deletions(-)
 rename spacy/{tokens/npchunks.pxd => de/iterators.pxd} (100%)
 rename spacy/{tokens/npchunks.pyx => de/iterators.pyx} (53%)
 create mode 100644 spacy/en/iterators.pxd
 create mode 100644 spacy/en/iterators.pyx
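Usage sketch (not part of the diff below). The spacy.en.English entry point
and the example sentence are assumptions for illustration, and running this
requires installed English model data:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')

    # every lexeme now records which language created it: token.lang is
    # the integer symbol, token.lang_ the string form (here u'en')
    print(doc[0].lang_)

    # Doc.noun_chunks now dispatches on the language of each sentence's
    # root token instead of assuming English
    for np in doc.noun_chunks:
        print(np)
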
diff --git a/bin/init_model.py b/bin/init_model.py
index 19cfcdc25..5e62a7faf 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -109,7 +109,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
     else:
         file_ = loc.open()
     for i, line in enumerate(file_):
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         freq = int(freq)
         counts.inc(i+1, freq)
         total += freq
@@ -121,7 +121,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
         file_ = loc.open()
     probs = {}
     for line in file_:
-        freq, doc_freq, key = line.split('\t', 2)
+        freq, doc_freq, key = line.rstrip().split('\t', 2)
         doc_freq = int(doc_freq)
         freq = int(freq)
         if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
diff --git a/setup.py b/setup.py
index de7d95d22..7449212b9 100644
--- a/setup.py
+++ b/setup.py
@@ -56,14 +56,15 @@ MOD_NAMES = [
     'spacy.tokens.doc',
     'spacy.tokens.span',
     'spacy.tokens.token',
-    'spacy.tokens.npchunks',
     'spacy.serialize.packer',
     'spacy.serialize.huffman',
     'spacy.serialize.bits',
     'spacy.cfile',
     'spacy.matcher',
     'spacy.syntax.ner',
-    'spacy.symbols']
+    'spacy.symbols',
+    'spacy.en.iterators',
+    'spacy.de.iterators']


 # By subclassing build_extensions we have the actual compiler that will be used
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 61a00ba1b..a878a49d8 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -14,12 +14,12 @@ cpdef enum attr_id_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
-
-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT
+
+    FLAG18 = 18
     FLAG19
     FLAG20
     FLAG21
@@ -85,11 +85,7 @@ cpdef enum attr_id_t:
     HEAD
     SPACY
     PROB
+
+    LANG

-# Move these up to FLAG14--FLAG18 once we finish the functionality and
-# are ready to regenerate the model
-#IS_BRACKET
-#IS_QUOTE
-#IS_LEFT_PUNCT
-#IS_RIGHT_PUNCT
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 146f3ab26..9a191beda 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -13,10 +13,10 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
     "FLAG18": FLAG18,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
@@ -83,6 +83,7 @@ IDS = {
     "HEAD": HEAD,
     "SPACY": SPACY,
     "PROB": PROB,
+    "LANG": LANG,
 }

 # ATTR IDs, in order of the symbol
diff --git a/spacy/tokens/npchunks.pxd b/spacy/de/iterators.pxd
similarity index 100%
rename from spacy/tokens/npchunks.pxd
rename to spacy/de/iterators.pxd
diff --git a/spacy/tokens/npchunks.pyx b/spacy/de/iterators.pyx
similarity index 53%
rename from spacy/tokens/npchunks.pyx
rename to spacy/de/iterators.pyx
index 0c5ca32a5..a6321bd57 100644
--- a/spacy/tokens/npchunks.pyx
+++ b/spacy/de/iterators.pyx
@@ -1,31 +1,9 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span

-from ..structs cimport TokenC
-from .doc cimport Doc
-from .span cimport Span
+from spacy.parts_of_speech cimport NOUN

-from ..parts_of_speech cimport NOUN, PROPN, PRON
-
-def english(Span sent):
-    cdef const TokenC* word
-    strings = sent.doc.vocab.strings
-    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
-    np_deps = [strings[label] for label in labels]
-    conj = strings['conj']
-    np_label = strings['NP']
-    for i in range(sent.start, sent.end):
-        word = &sent.doc.c[i]
-        if word.pos == NOUN and word.dep in np_deps:
-            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-        elif word.pos == NOUN and word.dep == conj:
-            head = word+word.head
-            while head.dep == conj and head.head < 0:
-                head += head.head
-            # If the head is an NP, and we're coordinated to it, we're an NP
-            if head.dep in np_deps:
-                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
-
-
-def german(Span sent):
+def noun_chunks(Span sent):
     # this function extracts spans headed by NOUNs starting from the left-most
     # syntactic dependent until the NOUN itself
     # for close apposition and measurement construction, the span is sometimes
@@ -48,7 +26,3 @@
             if rdep.pos == NOUN and rdep.dep == close_app:
                 rbracket = rdep.i+1
         yield Span(sent.doc, word.l_edge, rbracket, label=np_label)
-
-
-
-
diff --git a/spacy/en/iterators.pxd b/spacy/en/iterators.pxd
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/en/iterators.pyx b/spacy/en/iterators.pyx
new file mode 100644
index 000000000..e4f0fe2a4
--- /dev/null
+++ b/spacy/en/iterators.pyx
@@ -0,0 +1,24 @@
+from spacy.structs cimport TokenC
+from spacy.tokens.span cimport Span
+
+from spacy.parts_of_speech cimport NOUN
+
+def noun_chunks(Span sent):
+    cdef const TokenC* word
+    strings = sent.doc.vocab.strings
+    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'root']
+    np_deps = [strings[label] for label in labels]
+    conj = strings['conj']
+    np_label = strings['NP']
+    for i in range(sent.start, sent.end):
+        word = &sent.doc.c[i]
+        if word.pos == NOUN and word.dep in np_deps:
+            yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+        elif word.pos == NOUN and word.dep == conj:
+            head = word+word.head
+            while head.dep == conj and head.head < 0:
+                head += head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                yield Span(sent.doc, word.l_edge, i+1, label=np_label)
+
diff --git a/spacy/language.py b/spacy/language.py
index 4df34d956..f186c2f2b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -69,6 +69,7 @@ class Language(object):
             attrs.SUFFIX: cls.suffix,
             attrs.CLUSTER: cls.cluster,
             attrs.PROB: lambda string: oov_prob,
+            attrs.LANG: lambda string: cls.lang,
             attrs.IS_ALPHA: orth.is_alpha,
             attrs.IS_ASCII: orth.is_ascii,
             attrs.IS_DIGIT: cls.is_digit,
@@ -77,10 +78,10 @@ class Language(object):
             attrs.IS_SPACE: cls.is_space,
             attrs.IS_TITLE: orth.is_title,
             attrs.IS_UPPER: orth.is_upper,
-            attrs.FLAG14: orth.is_bracket,
-            attrs.FLAG15: orth.is_quote,
-            attrs.FLAG16: orth.is_left_punct,
-            attrs.FLAG17: orth.is_right_punct,
+            attrs.IS_BRACKET: orth.is_bracket,
+            attrs.IS_QUOTE: orth.is_quote,
+            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
             attrs.LIKE_URL: orth.like_url,
             attrs.LIKE_NUM: orth.like_number,
             attrs.LIKE_EMAIL: orth.like_email,
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 6fc25efb6..12d4e3de3 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,6 +1,6 @@
 from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t
 from .attrs cimport attr_id_t
-from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG

 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -41,6 +41,8 @@ cdef class Lexeme:
             lex.suffix = value
         elif name == CLUSTER:
             lex.cluster = value
+        elif name == LANG:
+            lex.lang = value

     @staticmethod
     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
@@ -67,6 +69,8 @@ cdef class Lexeme:
             return lex.length
         elif feat_name == CLUSTER:
             return lex.cluster
+        elif feat_name == LANG:
+            return lex.lang
         else:
             return 0
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 1aec4a018..4e0f2cf2e 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -18,10 +18,10 @@ import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport FLAG14 as IS_BRACKET
-from .attrs cimport FLAG15 as IS_QUOTE
-from .attrs cimport FLAG16 as IS_LEFT_PUNCT
-from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from .attrs cimport IS_BRACKET
+from .attrs cimport IS_QUOTE
+from .attrs cimport IS_LEFT_PUNCT
+from .attrs cimport IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV


@@ -123,6 +123,10 @@ cdef class Lexeme:
         def __get__(self): return self.c.cluster
         def __set__(self, int x): self.c.cluster = x

+    property lang:
+        def __get__(self): return self.c.lang
+        def __set__(self, int x): self.c.lang = x
+
     property prob:
         def __get__(self): return self.c.prob
         def __set__(self, float x): self.c.prob = x
@@ -147,6 +151,10 @@ cdef class Lexeme:
         def __get__(self): return self.vocab.strings[self.c.suffix]
         def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

+    property lang_:
+        def __get__(self): return self.vocab.strings[self.c.lang]
+        def __set__(self, unicode x): self.c.lang = self.vocab.strings[x]
+
     property flags:
         def __get__(self): return self.c.flags
         def __set__(self, flags_t x): self.c.flags = x
diff --git a/spacy/orth.pyx b/spacy/orth.pyx
index 418c3cfd4..0f30c1136 100644
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@@ -40,17 +40,17 @@ cpdef bint is_bracket(unicode string):


 cpdef bint is_quote(unicode string):
-    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯')
+    quotes = ('"',"'",'`','«','»','‘','’','‚','‛','“','”','„','‟','‹','›','❮','❯',"''",'``')
     return string in quotes


 cpdef bint is_left_punct(unicode string):
-    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮')
+    left_punct = ('(','[','{','<','"',"'",'«','‘','‚','‛','“','„','‟','‹','❮','``')
     return string in left_punct


 cpdef bint is_right_punct(unicode string):
-    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯')
+    right_punct = (')',']','}','>','"',"'",'»','’','”','›','❯',"''")
     return string in right_punct


diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 733ce3022..f7e6b1ec7 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -9,6 +9,8 @@ cdef struct LexemeC:

     flags_t flags

+    attr_t lang
+
     attr_t id
     attr_t length

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index fa45c8b3e..887b1085f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -8,6 +8,7 @@ import struct
 cimport numpy as np
 import math
 import six
+import warnings

 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
@@ -23,7 +24,6 @@ from .token cimport Token
 from ..serialize.bits cimport BitArray

 from ..util import normalize_slice
-import npchunks


 DEF PADDING = 5
@@ -241,11 +241,21 @@ cdef class Doc:
                     "\npython -m spacy.en.download all\n"
                     "to install the data")

-        chunk_rules = {'en':npchunks.english, 'de':npchunks.german}
+        from spacy.en.iterators import noun_chunks as en_noun_chunks
+        from spacy.de.iterators import noun_chunks as de_noun_chunks
+
+        chunk_rules = {'en': en_noun_chunks,
+                       'de': de_noun_chunks,
+                      }

         for sent in self.sents:
-            lang = 'en' # todo: make dependent on language of root token
-            for chunk in chunk_rules.get(lang)(sent):
+            lang = sent.root.lang_
+            chunker = chunk_rules.get(lang)
+            if chunker is None:
+                warnings.warn("noun_chunks is not available for language %s." % lang)
+                continue
+
+            for chunk in chunker(sent):
                 yield chunk


diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 0ff574f1b..17d756b3e 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -18,10 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport FLAG14 as IS_BRACKET
-from ..attrs cimport FLAG15 as IS_QUOTE
-from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
-from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
+from ..attrs cimport IS_BRACKET
+from ..attrs cimport IS_QUOTE
+from ..attrs cimport IS_LEFT_PUNCT
+from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV

@@ -95,6 +95,10 @@ cdef class Token:
         def __get__(self):
             return self.c.lex.prob

+    property lang:
+        def __get__(self):
+            return self.c.lex.lang
+
     property idx:
         def __get__(self):
             return self.c.idx
@@ -310,6 +314,10 @@ cdef class Token:
         def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

+    property lang_:
+        def __get__(self):
+            return self.vocab.strings[self.c.lex.lang]
+
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index f876bfefb..df8a4bbd5 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -246,6 +246,7 @@ cdef class Vocab:
             fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
             fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
+            fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1)
         fp.close()

     def load_lexemes(self, loc):
@@ -278,6 +279,7 @@ cdef class Vocab:
             fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
             fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
+            fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang))
             lexeme.vector = EMPTY_VEC
             py_str = self.strings[lexeme.orth]
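
Companion sketch for the German iterator (not part of the patch; the
spacy.de.German entry point is assumed, and no German model data ships
with this change). It illustrates the close apposition / measurement
construction described in the comments of spacy/de/iterators.pyx:

    from spacy.de import German

    nlp = German()
    doc = nlp(u'Ich trinke eine Tasse Tee.')

    # the sentence root's lang_ is u'de', so Doc.noun_chunks dispatches
    # to spacy.de.iterators.noun_chunks
    for np in doc.noun_chunks:
        print(np)

    # expected: u'eine Tasse Tee' is yielded as one chunk, extended over
    # the measurement construction instead of stopping at u'eine Tasse'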