* Work on train

This commit is contained in:
Matthew Honnibal 2014-12-22 07:25:43 +11:00
parent 4d4d2c0db4
commit 4c4aa2c5c9
13 changed files with 214 additions and 128 deletions

spacy/en/__init__.py (new file, 44 additions)

@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+from os import path
+
+from .. import orth
+from ..vocab import Vocab
+from ..tokenizer import Tokenizer
+from ..syntax.parser import GreedyParser
+from ..tokens import Tokens
+from ..morphology import Morphologizer
+from .lemmatizer import Lemmatizer
+from .pos import EnPosTagger
+from .attrs import get_flags
+
+
+def get_lex_props(string):
+    return {'flags': get_flags(string), 'dense': 1}
+
+
+class English(object):
+    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+        if data_dir is None:
+            data_dir = path.join(path.dirname(__file__), 'data')
+        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
+        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
+        if pos_tag:
+            self.pos_tagger = EnPosTagger(data_dir,
+                                          Morphologizer.from_dir(
+                                              self.vocab.strings,
+                                              Lemmatizer(path.join(data_dir, 'wordnet')),
+                                              data_dir))
+        else:
+            self.pos_tagger = None
+        if parse:
+            self.parser = GreedyParser(data_dir)
+        else:
+            self.parser = None
+
+    def __call__(self, text, pos_tag=True, parse=True):
+        tokens = self.tokenizer.tokenize(text)
+        if self.pos_tagger and pos_tag:
+            self.pos_tagger(tokens)
+        if self.parser and parse:
+            self.parser.parse(tokens)
+        return tokens
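
For orientation, a minimal usage sketch of the new English entry point (the
input text is hypothetical, and it assumes the packaged data directory plus
the Token.pos property added later in this diff):

    from spacy.en import English

    nlp = English(pos_tag=True, parse=False)  # loads vocab, tokenizer, tagger
    tokens = nlp(u'This is a sentence.')      # tokenizes, then tags in place
    print(tokens[0].pos)                      # resolves via StringStore.pos_tags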

@@ -1,13 +1,13 @@
-from ..lexeme cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
-from ..lexeme cimport FLAG8, FLAG9
-from ..lexeme cimport ID as _ID
-from ..lexeme cimport SIC as _SIC
-from ..lexeme cimport SHAPE as _SHAPE
-from ..lexeme cimport DENSE as _DENSE
-from ..lexeme cimport SHAPE as _SHAPE
-from ..lexeme cimport PREFIX as _PREFIX
-from ..lexeme cimport SUFFIX as _SUFFIX
-from ..lexeme cimport LEMMA as _LEMMA
+from ..typedefs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
+from ..typedefs cimport FLAG8, FLAG9
+from ..typedefs cimport ID as _ID
+from ..typedefs cimport SIC as _SIC
+from ..typedefs cimport SHAPE as _SHAPE
+from ..typedefs cimport DENSE as _DENSE
+from ..typedefs cimport SHAPE as _SHAPE
+from ..typedefs cimport PREFIX as _PREFIX
+from ..typedefs cimport SUFFIX as _SUFFIX
+from ..typedefs cimport LEMMA as _LEMMA

 # Work around the lack of global cpdef variables

@@ -3,4 +3,4 @@ from ..morphology cimport Morphologizer

 cdef class EnPosTagger(Tagger):
-    cdef Morphologizer morphologizer
+    cdef readonly Morphologizer morphologizer

@@ -1,88 +1,9 @@
-from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
+from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
 from .structs cimport Lexeme
 from .strings cimport StringStore
-
-# Reserve 64 values for flag features
-cpdef enum attr_id_t:
-    FLAG0
-    FLAG1
-    FLAG2
-    FLAG3
-    FLAG4
-    FLAG5
-    FLAG6
-    FLAG7
-    FLAG8
-    FLAG9
-    FLAG10
-    FLAG11
-    FLAG12
-    FLAG13
-    FLAG14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
-    FLAG19
-    FLAG20
-    FLAG21
-    FLAG22
-    FLAG23
-    FLAG24
-    FLAG25
-    FLAG26
-    FLAG27
-    FLAG28
-    FLAG29
-    FLAG30
-    FLAG31
-    FLAG32
-    FLAG33
-    FLAG34
-    FLAG35
-    FLAG36
-    FLAG37
-    FLAG38
-    FLAG39
-    FLAG40
-    FLAG41
-    FLAG42
-    FLAG43
-    FLAG44
-    FLAG45
-    FLAG46
-    FLAG47
-    FLAG48
-    FLAG49
-    FLAG50
-    FLAG51
-    FLAG52
-    FLAG53
-    FLAG54
-    FLAG55
-    FLAG56
-    FLAG57
-    FLAG58
-    FLAG59
-    FLAG60
-    FLAG61
-    FLAG62
-    FLAG63
-    ID
-    SIC
-    DENSE
-    SHAPE
-    PREFIX
-    SUFFIX
-    LENGTH
-    CLUSTER
-    POS_TYPE
-    LEMMA

 cdef Lexeme EMPTY_LEXEME

@@ -24,7 +24,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
     lex.prefix = string_store[string[:1]]
     lex.suffix = string_store[string[-3:]]
     lex.shape = string_store[orth.word_shape(string)]
-    lex.dense = string_store[props['dense']]
     lex.flags = props.get('flags', 0)
     return lex

@@ -59,9 +59,10 @@ cdef class Morphologizer:
     @classmethod
     def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
-        tag_map = None
-        irregulars = None
-        tag_names = None
+        tagger_cfg = json.loads(open(path.join(data_dir, 'pos', 'config.json')).read())
+        tag_map = tagger_cfg['tag_map']
+        tag_names = tagger_cfg['tag_names']
+        irregulars = json.loads(open(path.join(data_dir, 'morphs.json')).read())
         return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
                    tag_names=tag_names)
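
For reference, a hypothetical shape for the two JSON files read above, inferred
only from the keys accessed in from_dir; the real contents are not part of this
commit:

    tagger_cfg = {
        'tag_map': {'NN': ['NOUN']},        # fine-grained tag -> info (assumed)
        'tag_names': ['NN', 'VBZ'],         # index order presumably fixes tag ids
    }
    irregulars = {'VBZ': {'is': 'be'}}      # irregular form -> lemma (assumed)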

@@ -11,6 +11,11 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
+
+cdef class _SymbolMap:
+    cdef dict _string_to_id
+    cdef list _id_to_string
+
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings

@@ -9,13 +9,42 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'

+cdef class _SymbolMap:
+    def __init__(self):
+        self._string_to_id = {'': 0}
+        self._id_to_string = ['']
+
+    def __iter__(self):
+        for id_, string in enumerate(self._id_to_string[1:]):
+            yield string, id_
+
+    def __getitem__(self, object string_or_id):
+        cdef bytes byte_string
+        if isinstance(string_or_id, int) or isinstance(string_or_id, long):
+            if string_or_id < 1 or string_or_id >= self.size:
+                raise IndexError(string_or_id)
+            return self._int_to_string[string_or_id]
+        else:
+            string = string_or_id
+            if isinstance(string, unicode):
+                string = string.encode('utf8')
+            if string in self._string_to_id:
+                id_ = self._string_to_id[string]
+            else:
+                id_ = len(self._string_to_id)
+                self._string_to_id[string] = id_
+                self._id_to_string.append(string)
+            return id_
+
 cdef class StringStore:
     def __init__(self):
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
         self.size = 1
+        self.pos_tags = _SymbolMap()
+        self.dep_tags = _SymbolMap()

     property size:
         def __get__(self):
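
The _SymbolMap above is a small bidirectional interning table. A pure-Python
sketch of the same pattern (illustrative only, not the Cython class):

    class SymbolMap(object):
        def __init__(self):
            self._string_to_id = {'': 0}   # id 0 is reserved for the empty string
            self._id_to_string = ['']

        def __getitem__(self, string):
            # Intern on first sight; the same string always gets the same id.
            if string not in self._string_to_id:
                self._string_to_id[string] = len(self._id_to_string)
                self._id_to_string.append(string)
            return self._string_to_id[string]

    tags = SymbolMap()
    assert tags['nsubj'] == tags['nsubj']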

@@ -1,11 +1,10 @@
 from libc.stdint cimport uint32_t, uint64_t
 from thinc.features cimport Extractor
 from thinc.learner cimport LinearModel
 from .arc_eager cimport TransitionSystem
-from ..tokens cimport Tokens, TokenC
 from ._state cimport State
+from ..structs cimport TokenC
+from ..tokens cimport Tokens

 cdef class GreedyParser:

@@ -3,6 +3,7 @@
 from __future__ import unicode_literals
 from os import path
+import re

 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
@@ -27,7 +28,7 @@ cdef class Tokenizer:
         self._prefix_re = prefix_re
         self._suffix_re = suffix_re
         self._infix_re = infix_re
-        self.vocab = Vocab(self.get_props)
+        self.vocab = vocab
         self._load_special_tokenization(rules)

     @classmethod
@@ -39,11 +40,12 @@ cdef class Tokenizer:
         assert path.exists(data_dir) and path.isdir(data_dir)
         rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
-        return cls(vocab, rules, prefix_re, suffix_re, infix_re)
+        return cls(vocab, rules, re.compile(prefix_re), re.compile(suffix_re),
+                   re.compile(infix_re))

     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.vocab.strings, length)
+        cdef Tokens tokens = Tokens(self.vocab, length)
         if length == 0:
             return tokens
         cdef UniStr string_struct
@@ -76,7 +78,7 @@ cdef class Tokenizer:
         tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.vocab.strings, length)
+        cdef Tokens tokens = Tokens(self.vocab, length)
         if length == 0:
             return tokens
         cdef int i = 0
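
A sketch of the changed construction path, using only names from this diff (the
'data' directory argument is hypothetical):

    vocab = Vocab.from_dir('data', get_lex_props=get_lex_props)
    tokenizer = Tokenizer.from_dir(vocab, 'data')  # from_dir compiles the regexes now
    tokens = tokenizer.tokenize(u'Hello world.')   # Tokens is built from the Vocab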

@@ -4,11 +4,11 @@ import numpy as np
 cimport numpy as np
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
-from .structs cimport Lexeme, TokenC, Morphology
-from .typedefs cimport flags_t, attr_t, flags_t
+from .typedefs cimport flags_t
+from .structs cimport Morphology, TokenC, Lexeme
+from .vocab cimport Vocab
 from .strings cimport StringStore
@@ -22,7 +22,7 @@ ctypedef fused LexemeOrToken:
 cdef class Tokens:
     cdef Pool mem
-    cdef StringStore strings
+    cdef Vocab vocab
     cdef list tag_names
     cdef TokenC* data
@@ -36,7 +36,7 @@ cdef class Tokens:
 cdef class Token:
-    cdef public StringStore strings
+    cdef readonly StringStore string_store
     cdef public int i
     cdef public int idx
     cdef int pos
@@ -44,18 +44,18 @@ cdef class Token:
     cdef public int head
     cdef public int dep_tag
-    cdef public attr_t id
-    cdef public attr_t cluster
-    cdef public attr_t length
-    cdef public attr_t postype
-    cdef public attr_t sensetype
+    cdef public atom_t id
+    cdef public atom_t cluster
+    cdef public atom_t length
+    cdef public atom_t postype
+    cdef public atom_t sensetype
-    cdef public attr_t sic
-    cdef public attr_t norm
-    cdef public attr_t shape
-    cdef public attr_t asciied
-    cdef public attr_t prefix
-    cdef public attr_t suffix
+    cdef public atom_t sic
+    cdef public atom_t norm
+    cdef public atom_t shape
+    cdef public atom_t asciied
+    cdef public atom_t prefix
+    cdef public atom_t suffix
     cdef public float prob

@@ -2,7 +2,9 @@
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
-from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t
+from .lexeme cimport get_attr, EMPTY_LEXEME
+from .typedefs cimport attr_id_t, attr_t
+from .typedefs cimport LEMMA
 cimport cython
 import numpy as np
@@ -30,8 +32,8 @@ cdef class Tokens:
     >>> from spacy.en import EN
     >>> tokens = EN.tokenize('An example sentence.')
     """
-    def __init__(self, StringStore string_store, string_length=0):
-        self.string_store = string_store
+    def __init__(self, Vocab vocab, string_length=0):
+        self.vocab = vocab
         if string_length >= 3:
             size = int(string_length / 3.0)
         else:
@@ -50,7 +52,7 @@ cdef class Tokens:
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self.string_store, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.vocab.strings, i, self.data[i].idx, self.data[i].pos,
                      self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
                      self.data[i].lex[0])
@@ -119,10 +121,10 @@ cdef class Token:
                  int pos, int lemma, int head, int dep_tag, dict lex):
         self.string_store = string_store
         self.idx = idx
-        self.pos = pos
+        self.pos_id = pos
         self.i = i
         self.head = head
-        self.dep_tag = dep_tag
+        self.dep_id = dep_tag
         self.id = lex['id']
         self.lemma = lemma
@@ -154,6 +156,9 @@ cdef class Token:
         cdef bytes utf8string = self.string_store[self.lemma]
         return utf8string.decode('utf8')

+    property dep:
+        def __get__(self):
+            return self.string_store.dep_tags[self.dep]
     property pos:
         def __get__(self):
-            return self.lang.pos_tagger.tag_names[self.pos]
+            return self.string_store.pos_tags[self.pos]
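
With this change, tag strings resolve through the StringStore's new symbol maps
rather than through a global language object. A hypothetical access pattern:

    token = tokens[0]
    print(token.pos)   # looked up in string_store.pos_tags
    print(token.dep)   # looked up in string_store.dep_tags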

@@ -21,6 +21,87 @@ cpdef enum univ_tag_t:
     N_UNIV_TAGS

+# Reserve 64 values for flag features
+cpdef enum attr_id_t:
+    FLAG0
+    FLAG1
+    FLAG2
+    FLAG3
+    FLAG4
+    FLAG5
+    FLAG6
+    FLAG7
+    FLAG8
+    FLAG9
+    FLAG10
+    FLAG11
+    FLAG12
+    FLAG13
+    FLAG14
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
+    ID
+    SIC
+    DENSE
+    SHAPE
+    PREFIX
+    SUFFIX
+    LENGTH
+    CLUSTER
+    POS_TYPE
+    LEMMA
+
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint32_t attr_t
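
The "reserve 64 values for flag features" comment means ids FLAG0..FLAG63 map
onto bit positions in a 64-bit flags field, so each boolean feature costs one
bit. A sketch of the implied check, not code from this commit:

    def check_flag(flags, flag_id):
        # flags is a 64-bit integer (flags_t); flag ids 0..63 are bit positions
        return bool(flags & (1 << flag_id))

    flags = (1 << 3) | (1 << 9)
    assert check_flag(flags, 9) and not check_flag(flags, 0)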