diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py new file mode 100644 index 000000000..1cebec7ba --- /dev/null +++ b/spacy/en/__init__.py @@ -0,0 +1,44 @@ +from __future__ import unicode_literals +from os import path + +from .. import orth +from ..vocab import Vocab +from ..tokenizer import Tokenizer +from ..syntax.parser import GreedyParser +from ..tokens import Tokens +from ..morphology import Morphologizer +from .lemmatizer import Lemmatizer +from .pos import EnPosTagger +from .attrs import get_flags + + +def get_lex_props(string): + return {'flags': get_flags(string), 'dense': 1} + + +class English(object): + def __init__(self, data_dir=None, pos_tag=True, parse=False): + if data_dir is None: + data_dir = path.join(path.dirname(__file__), 'data') + self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props) + self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir) + if pos_tag: + self.pos_tagger = EnPosTagger(data_dir, + Morphologizer.from_dir( + self.vocab.strings, + Lemmatizer(path.join(data_dir, 'wordnet')), + data_dir)) + else: + self.pos_tagger = None + if parse: + self.parser = GreedyParser(data_dir) + else: + self.parser = None + + def __call__(self, text, pos_tag=True, parse=True): + tokens = self.tokenizer.tokenize(text) + if self.pos_tagger and pos_tag: + self.pos_tagger(tokens) + if self.parser and parse: + self.parser.parse(tokens) + return tokens diff --git a/spacy/en/attrs.pxd b/spacy/en/attrs.pxd index 3454e3368..a24aa4ebf 100644 --- a/spacy/en/attrs.pxd +++ b/spacy/en/attrs.pxd @@ -1,13 +1,13 @@ -from ..lexeme cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7 -from ..lexeme cimport FLAG8, FLAG9 -from ..lexeme cimport ID as _ID -from ..lexeme cimport SIC as _SIC -from ..lexeme cimport SHAPE as _SHAPE -from ..lexeme cimport DENSE as _DENSE -from ..lexeme cimport SHAPE as _SHAPE -from ..lexeme cimport PREFIX as _PREFIX -from ..lexeme cimport SUFFIX as _SUFFIX -from ..lexeme cimport LEMMA as _LEMMA +from ..typedefs 
cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7 +from ..typedefs cimport FLAG8, FLAG9 +from ..typedefs cimport ID as _ID +from ..typedefs cimport SIC as _SIC +from ..typedefs cimport SHAPE as _SHAPE +from ..typedefs cimport DENSE as _DENSE +from ..typedefs cimport SHAPE as _SHAPE +from ..typedefs cimport PREFIX as _PREFIX +from ..typedefs cimport SUFFIX as _SUFFIX +from ..typedefs cimport LEMMA as _LEMMA # Work around the lack of global cpdef variables diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 9a92f411a..99c83d795 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -3,4 +3,4 @@ from ..morphology cimport Morphologizer cdef class EnPosTagger(Tagger): - cdef Morphologizer morphologizer + cdef readonly Morphologizer morphologizer diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 5926baa0c..35826ef55 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -1,88 +1,9 @@ -from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t +from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t +from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE from .structs cimport Lexeme from .strings cimport StringStore -# Reserve 64 values for flag features -cpdef enum attr_id_t: - FLAG0 - FLAG1 - FLAG2 - FLAG3 - FLAG4 - FLAG5 - FLAG6 - FLAG7 - FLAG8 - FLAG9 - FLAG10 - FLAG11 - FLAG12 - FLAG13 - FLAG14 - FLAG15 - FLAG16 - FLAG17 - FLAG18 - FLAG19 - FLAG20 - FLAG21 - FLAG22 - FLAG23 - FLAG24 - FLAG25 - FLAG26 - FLAG27 - FLAG28 - FLAG29 - FLAG30 - FLAG31 - FLAG32 - FLAG33 - FLAG34 - FLAG35 - FLAG36 - FLAG37 - FLAG38 - FLAG39 - FLAG40 - FLAG41 - FLAG42 - FLAG43 - FLAG44 - FLAG45 - FLAG46 - FLAG47 - FLAG48 - FLAG49 - FLAG50 - FLAG51 - FLAG52 - FLAG53 - FLAG54 - FLAG55 - FLAG56 - FLAG57 - FLAG58 - FLAG59 - FLAG60 - FLAG61 - FLAG62 - FLAG63 - - ID - SIC - DENSE - SHAPE - PREFIX - SUFFIX - - LENGTH - CLUSTER - POS_TYPE - LEMMA - - cdef Lexeme EMPTY_LEXEME diff --git a/spacy/lexeme.pyx 
b/spacy/lexeme.pyx index f1974cbc9..07bb008f9 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -24,7 +24,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] lex.shape = string_store[orth.word_shape(string)] - lex.dense = string_store[props['dense']] lex.flags = props.get('flags', 0) return lex diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index da4485960..9efee6da3 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -59,9 +59,10 @@ cdef class Morphologizer: @classmethod def from_dir(cls, StringStore strings, object lemmatizer, data_dir): - tag_map = None - irregulars = None - tag_names = None + tagger_cfg = json.loads(open(path.join(data_dir, 'pos', 'config.json')).read()) + tag_map = tagger_cfg['tag_map'] + tag_names = tagger_cfg['tag_names'] + irregulars = json.loads(open(path.join(data_dir, 'morphs.json')).read()) return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars, tag_names=tag_names) diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 9c16cfe1c..d5b674527 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -11,6 +11,11 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end) s.key = hash64(s.chars, (s.n * sizeof(Py_UNICODE)), 0) +cdef class _SymbolMap: + cdef dict _string_to_id + cdef list _id_to_string + + cdef class StringStore: cdef Pool mem cdef Utf8Str* strings diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 24c233cfb..c7aa9c7ac 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -9,13 +9,42 @@ from .typedefs cimport hash_t SEPARATOR = '\n|-SEP-|\n' +cdef class _SymbolMap: + def __init__(self): + self._string_to_id = {'': 0} + self._id_to_string = [''] + + def __iter__(self): + for id_, string in enumerate(self._id_to_string[1:]): + yield string, id_ + + def __getitem__(self, object string_or_id): + cdef bytes byte_string + if isinstance(string_or_id, int) or 
isinstance(string_or_id, long): + if string_or_id < 1 or string_or_id >= len(self._id_to_string): + raise IndexError(string_or_id) + return self._id_to_string[string_or_id] + else: + string = string_or_id + if isinstance(string, unicode): + string = string.encode('utf8') + if string in self._string_to_id: + id_ = self._string_to_id[string] + else: + id_ = len(self._string_to_id) + self._string_to_id[string] = id_ + self._id_to_string.append(string) + return id_ + + cdef class StringStore: def __init__(self): self.mem = Pool() self._map = PreshMap() self._resize_at = 10000 self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) - self.size = 1 + self.pos_tags = _SymbolMap() + self.dep_tags = _SymbolMap() property size: def __get__(self): diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index be315059f..6fe9fc58c 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -1,11 +1,10 @@ -from libc.stdint cimport uint32_t, uint64_t from thinc.features cimport Extractor from thinc.learner cimport LinearModel from .arc_eager cimport TransitionSystem -from ..tokens cimport Tokens, TokenC -from ._state cimport State +from ..structs cimport TokenC +from ..tokens cimport Tokens cdef class GreedyParser: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 147bf0ce1..f24ed7425 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -3,6 +3,7 @@ from __future__ import unicode_literals from os import path +import re from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc @@ -27,7 +28,7 @@ cdef class Tokenizer: self._prefix_re = prefix_re self._suffix_re = suffix_re self._infix_re = infix_re - self.vocab = Vocab(self.get_props) + self.vocab = vocab self._load_special_tokenization(rules) @classmethod @@ -39,11 +40,12 @@ cdef class Tokenizer: assert path.exists(data_dir) and path.isdir(data_dir) rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir) - return cls(vocab, rules, prefix_re, 
suffix_re, infix_re) + return cls(vocab, rules, re.compile(prefix_re), re.compile(suffix_re), + re.compile(infix_re)) cpdef Tokens tokens_from_list(self, list strings): cdef int length = sum([len(s) for s in strings]) - cdef Tokens tokens = Tokens(self.vocab.strings, length) + cdef Tokens tokens = Tokens(self.vocab, length) if length == 0: return tokens cdef UniStr string_struct @@ -76,7 +78,7 @@ cdef class Tokenizer: tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes. """ cdef int length = len(string) - cdef Tokens tokens = Tokens(self.vocab.strings, length) + cdef Tokens tokens = Tokens(self.vocab, length) if length == 0: return tokens cdef int i = 0 diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 12eb70cc1..ec16c77d6 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -4,11 +4,11 @@ import numpy as np cimport numpy as np from cymem.cymem cimport Pool +from thinc.typedefs cimport atom_t -from .structs cimport Lexeme, TokenC, Morphology - -from .typedefs cimport flags_t, attr_t, flags_t - +from .typedefs cimport flags_t +from .structs cimport Morphology, TokenC, Lexeme +from .vocab cimport Vocab from .strings cimport StringStore @@ -22,7 +22,7 @@ ctypedef fused LexemeOrToken: cdef class Tokens: cdef Pool mem - cdef StringStore strings + cdef Vocab vocab cdef list tag_names cdef TokenC* data @@ -36,7 +36,7 @@ cdef class Tokens: cdef class Token: - cdef public StringStore strings + cdef readonly StringStore string_store cdef public int i cdef public int idx cdef int pos @@ -44,18 +44,18 @@ cdef class Token: cdef public int head cdef public int dep_tag - cdef public attr_t id - cdef public attr_t cluster - cdef public attr_t length - cdef public attr_t postype - cdef public attr_t sensetype + cdef public atom_t id + cdef public atom_t cluster + cdef public atom_t length + cdef public atom_t postype + cdef public atom_t sensetype - cdef public attr_t sic - cdef public attr_t norm - cdef public attr_t shape - cdef public attr_t asciied 
- cdef public attr_t prefix - cdef public attr_t suffix + cdef public atom_t sic + cdef public atom_t norm + cdef public atom_t shape + cdef public atom_t asciied + cdef public atom_t prefix + cdef public atom_t suffix cdef public float prob diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index f4b1c952d..5e81c4a4e 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -2,7 +2,9 @@ from preshed.maps cimport PreshMap from preshed.counter cimport PreshCounter -from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t +from .lexeme cimport get_attr, EMPTY_LEXEME +from .typedefs cimport attr_id_t, attr_t +from .typedefs cimport LEMMA cimport cython import numpy as np @@ -30,8 +32,8 @@ cdef class Tokens: >>> from spacy.en import EN >>> tokens = EN.tokenize('An example sentence.') """ - def __init__(self, StringStore string_store, string_length=0): - self.string_store = string_store + def __init__(self, Vocab vocab, string_length=0): + self.vocab = vocab if string_length >= 3: size = int(string_length / 3.0) else: @@ -50,7 +52,7 @@ cdef class Tokens: def __getitem__(self, i): bounds_check(i, self.length, PADDING) - return Token(self.string_store, i, self.data[i].idx, self.data[i].pos, + return Token(self.vocab.strings, i, self.data[i].idx, self.data[i].pos, self.data[i].lemma, self.data[i].head, self.data[i].dep_tag, self.data[i].lex[0]) @@ -119,10 +121,10 @@ cdef class Token: int pos, int lemma, int head, int dep_tag, dict lex): self.string_store = string_store self.idx = idx - self.pos = pos + self.pos_id = pos self.i = i self.head = head - self.dep_tag = dep_tag + self.dep_id = dep_tag self.id = lex['id'] self.lemma = lemma @@ -154,6 +156,9 @@ cdef class Token: cdef bytes utf8string = self.string_store[self.lemma] return utf8string.decode('utf8') + property dep: + def __get__(self): + return self.string_store.dep_tags[self.dep_id] property pos: def __get__(self): - return self.lang.pos_tagger.tag_names[self.pos] + return self.string_store.pos_tags[self.pos_id] diff 
--git a/spacy/typedefs.pxd b/spacy/typedefs.pxd index f91f55469..4b387be7d 100644 --- a/spacy/typedefs.pxd +++ b/spacy/typedefs.pxd @@ -21,6 +21,87 @@ cpdef enum univ_tag_t: N_UNIV_TAGS +# Reserve 64 values for flag features +cpdef enum attr_id_t: + FLAG0 + FLAG1 + FLAG2 + FLAG3 + FLAG4 + FLAG5 + FLAG6 + FLAG7 + FLAG8 + FLAG9 + FLAG10 + FLAG11 + FLAG12 + FLAG13 + FLAG14 + FLAG15 + FLAG16 + FLAG17 + FLAG18 + FLAG19 + FLAG20 + FLAG21 + FLAG22 + FLAG23 + FLAG24 + FLAG25 + FLAG26 + FLAG27 + FLAG28 + FLAG29 + FLAG30 + FLAG31 + FLAG32 + FLAG33 + FLAG34 + FLAG35 + FLAG36 + FLAG37 + FLAG38 + FLAG39 + FLAG40 + FLAG41 + FLAG42 + FLAG43 + FLAG44 + FLAG45 + FLAG46 + FLAG47 + FLAG48 + FLAG49 + FLAG50 + FLAG51 + FLAG52 + FLAG53 + FLAG54 + FLAG55 + FLAG56 + FLAG57 + FLAG58 + FLAG59 + FLAG60 + FLAG61 + FLAG62 + FLAG63 + + ID + SIC + DENSE + SHAPE + PREFIX + SUFFIX + + LENGTH + CLUSTER + POS_TYPE + LEMMA + + + ctypedef uint64_t hash_t ctypedef char* utf8_t ctypedef uint32_t attr_t