mirror of https://github.com/explosion/spaCy.git
* Work on train
This commit is contained in:
parent 4d4d2c0db4
commit 4c4aa2c5c9
@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+from os import path
+
+from .. import orth
+from ..vocab import Vocab
+from ..tokenizer import Tokenizer
+from ..syntax.parser import GreedyParser
+from ..tokens import Tokens
+from ..morphology import Morphologizer
+from .lemmatizer import Lemmatizer
+from .pos import EnPosTagger
+from .attrs import get_flags
+
+
+def get_lex_props(string):
+    return {'flags': get_flags(string), 'dense': 1}
+
+
+class English(object):
+    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+        if data_dir is None:
+            data_dir = path.join(path.dirname(__file__), 'data')
+        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
+        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
+        if pos_tag:
+            self.pos_tagger = EnPosTagger(data_dir,
+                                          Morphologizer.from_dir(
+                                              self.vocab.strings,
+                                              Lemmatizer(path.join(data_dir, 'wordnet')),
+                                              data_dir))
+        else:
+            self.pos_tagger = None
+        if parse:
+            self.parser = GreedyParser(data_dir)
+        else:
+            self.parser = None
+
+    def __call__(self, text, pos_tag=True, parse=True):
+        tokens = self.tokenizer.tokenize(text)
+        if self.pos_tagger and pos_tag:
+            self.pos_tagger(tokens)
+        if self.parser and parse:
+            self.parser.parse(tokens)
+        return tokens

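The hunk above adds the `English` pipeline object. As a rough usage sketch (illustrative only, not part of the commit; it assumes the packaged `data` directory is present):

    # Hypothetical usage of the English class introduced above.
    from spacy.en import English

    nlp = English(pos_tag=True, parse=False)   # loads vocab, tokenizer, POS tagger
    tokens = nlp(u'An example sentence.')      # tokenizes, then tags; parser is None
    first = tokens[0]                          # Tokens.__getitem__ returns a Token

Passing `parse=False` at construction makes the `parse=True` default in `__call__` a no-op, since `self.parser` is None.
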
@@ -1,13 +1,13 @@
-from ..lexeme cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
-from ..lexeme cimport FLAG8, FLAG9
-from ..lexeme cimport ID as _ID
-from ..lexeme cimport SIC as _SIC
-from ..lexeme cimport SHAPE as _SHAPE
-from ..lexeme cimport DENSE as _DENSE
-from ..lexeme cimport SHAPE as _SHAPE
-from ..lexeme cimport PREFIX as _PREFIX
-from ..lexeme cimport SUFFIX as _SUFFIX
-from ..lexeme cimport LEMMA as _LEMMA
+from ..typedefs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
+from ..typedefs cimport FLAG8, FLAG9
+from ..typedefs cimport ID as _ID
+from ..typedefs cimport SIC as _SIC
+from ..typedefs cimport SHAPE as _SHAPE
+from ..typedefs cimport DENSE as _DENSE
+from ..typedefs cimport SHAPE as _SHAPE
+from ..typedefs cimport PREFIX as _PREFIX
+from ..typedefs cimport SUFFIX as _SUFFIX
+from ..typedefs cimport LEMMA as _LEMMA
 
 
 # Work around the lack of global cpdef variables

@@ -3,4 +3,4 @@ from ..morphology cimport Morphologizer
 
 
 cdef class EnPosTagger(Tagger):
-    cdef Morphologizer morphologizer
+    cdef readonly Morphologizer morphologizer

@@ -1,88 +1,9 @@
-from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t
+from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
+from .typedefs cimport ID, SIC, DENSE, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, POS_TYPE
 from .structs cimport Lexeme
 from .strings cimport StringStore
 
 
-# Reserve 64 values for flag features
-cpdef enum attr_id_t:
-    FLAG0
-    FLAG1
-    FLAG2
-    FLAG3
-    FLAG4
-    FLAG5
-    FLAG6
-    FLAG7
-    FLAG8
-    FLAG9
-    FLAG10
-    FLAG11
-    FLAG12
-    FLAG13
-    FLAG14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
-    FLAG19
-    FLAG20
-    FLAG21
-    FLAG22
-    FLAG23
-    FLAG24
-    FLAG25
-    FLAG26
-    FLAG27
-    FLAG28
-    FLAG29
-    FLAG30
-    FLAG31
-    FLAG32
-    FLAG33
-    FLAG34
-    FLAG35
-    FLAG36
-    FLAG37
-    FLAG38
-    FLAG39
-    FLAG40
-    FLAG41
-    FLAG42
-    FLAG43
-    FLAG44
-    FLAG45
-    FLAG46
-    FLAG47
-    FLAG48
-    FLAG49
-    FLAG50
-    FLAG51
-    FLAG52
-    FLAG53
-    FLAG54
-    FLAG55
-    FLAG56
-    FLAG57
-    FLAG58
-    FLAG59
-    FLAG60
-    FLAG61
-    FLAG62
-    FLAG63
-
-    ID
-    SIC
-    DENSE
-    SHAPE
-    PREFIX
-    SUFFIX
-
-    LENGTH
-    CLUSTER
-    POS_TYPE
-    LEMMA
-
-
 cdef Lexeme EMPTY_LEXEME
 
 

@@ -24,7 +24,6 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
     lex.prefix = string_store[string[:1]]
     lex.suffix = string_store[string[-3:]]
     lex.shape = string_store[orth.word_shape(string)]
     lex.dense = string_store[props['dense']]
-
     lex.flags = props.get('flags', 0)
     return lex

@@ -59,9 +59,10 @@ cdef class Morphologizer:
 
     @classmethod
     def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
-        tag_map = None
-        irregulars = None
-        tag_names = None
+        tagger_cfg = json.loads(open(path.join(data_dir, 'pos', 'config.json')).read())
+        tag_map = tagger_cfg['tag_map']
+        tag_names = tagger_cfg['tag_names']
+        irregulars = json.loads(open(path.join(data_dir, 'morphs.json')).read())
         return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
                    tag_names=tag_names)
 

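The loader above implies an on-disk layout for the data directory. A sketch combining the paths referenced in this commit (the key names and file paths come from the code; the annotations are editorial):

    data/
        pos/config.json   # JSON object with 'tag_map' and 'tag_names' keys
        morphs.json       # irregular forms, passed to Morphologizer as `irregulars`
        wordnet/          # directory handed to Lemmatizer in English.__init__
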
@@ -11,6 +11,11 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
 
 
+cdef class _SymbolMap:
+    cdef dict _string_to_id
+    cdef list _id_to_string
+
+
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings

@@ -9,13 +9,42 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'
 
 
+cdef class _SymbolMap:
+    def __init__(self):
+        self._string_to_id = {'': 0}
+        self._id_to_string = ['']
+
+    def __iter__(self):
+        for id_, string in enumerate(self._id_to_string[1:]):
+            yield string, id_
+
+    def __getitem__(self, object string_or_id):
+        cdef bytes byte_string
+        if isinstance(string_or_id, int) or isinstance(string_or_id, long):
+            if string_or_id < 1 or string_or_id >= self.size:
+                raise IndexError(string_or_id)
+            return self._int_to_string[string_or_id]
+        else:
+            string = string_or_id
+            if isinstance(string, unicode):
+                string = string.encode('utf8')
+            if string in self._string_to_id:
+                id_ = self._string_to_id[string]
+            else:
+                id_ = len(self._string_to_id)
+                self._string_to_id[string] = id_
+                self._id_to_string.append(string)
+            return id_
+
+
 cdef class StringStore:
     def __init__(self):
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
         self.size = 1
+        self.pos_tags = _SymbolMap()
+        self.dep_tags = _SymbolMap()
 
     property size:
         def __get__(self):

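The `_SymbolMap` added here is a small two-way intern table: a string gets the next consecutive id on first lookup, and ids map back to strings, with id 0 reserved for the empty string. A pure-Python sketch of the same pattern (illustrative, not part of the commit):

    class SymbolMap(object):
        def __init__(self):
            self._string_to_id = {'': 0}   # id 0 reserved for the empty string
            self._id_to_string = ['']

        def intern(self, string):
            # Return the existing id, or assign the next consecutive one.
            if string in self._string_to_id:
                return self._string_to_id[string]
            id_ = len(self._string_to_id)
            self._string_to_id[string] = id_
            self._id_to_string.append(string)
            return id_

    assert SymbolMap().intern('nsubj') == 1   # first real symbol gets id 1

StringStore now owns two of these, `pos_tags` and `dep_tags`, so tag names can be stored on tokens as integer ids and recovered later (see the `Token.dep` and `Token.pos` properties further down).
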
@@ -1,11 +1,10 @@
 from libc.stdint cimport uint32_t, uint64_t
 from thinc.features cimport Extractor
 from thinc.learner cimport LinearModel
 
 from .arc_eager cimport TransitionSystem
 
-from ..tokens cimport Tokens, TokenC
 from ._state cimport State
+from ..structs cimport TokenC
+from ..tokens cimport Tokens
 
 
 cdef class GreedyParser:

@@ -3,6 +3,7 @@
 from __future__ import unicode_literals
 
 from os import path
+import re
 
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc

@@ -27,7 +28,7 @@ cdef class Tokenizer:
         self._prefix_re = prefix_re
         self._suffix_re = suffix_re
         self._infix_re = infix_re
-        self.vocab = Vocab(self.get_props)
+        self.vocab = vocab
         self._load_special_tokenization(rules)
 
     @classmethod

@@ -39,11 +40,12 @@ cdef class Tokenizer:
 
         assert path.exists(data_dir) and path.isdir(data_dir)
         rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
-        return cls(vocab, rules, prefix_re, suffix_re, infix_re)
+        return cls(vocab, rules, re.compile(prefix_re), re.compile(suffix_re),
+                   re.compile(infix_re))
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.vocab.strings, length)
+        cdef Tokens tokens = Tokens(self.vocab, length)
         if length == 0:
             return tokens
         cdef UniStr string_struct

@@ -76,7 +78,7 @@ cdef class Tokenizer:
             tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.vocab.strings, length)
+        cdef Tokens tokens = Tokens(self.vocab, length)
         if length == 0:
             return tokens
         cdef int i = 0

@@ -4,11 +4,11 @@ import numpy as np
 cimport numpy as np
 
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
 
-from .structs cimport Lexeme, TokenC, Morphology
-
-from .typedefs cimport flags_t, attr_t, flags_t
-
+from .typedefs cimport flags_t
+from .structs cimport Morphology, TokenC, Lexeme
+from .vocab cimport Vocab
+from .strings cimport StringStore
 
 

@@ -22,7 +22,7 @@ ctypedef fused LexemeOrToken:
 
 cdef class Tokens:
     cdef Pool mem
-    cdef StringStore strings
+    cdef Vocab vocab
     cdef list tag_names
 
     cdef TokenC* data

@@ -36,7 +36,7 @@ cdef class Tokens:
 
 
 cdef class Token:
-    cdef public StringStore strings
+    cdef readonly StringStore string_store
     cdef public int i
     cdef public int idx
     cdef int pos

@@ -44,18 +44,18 @@ cdef class Token:
     cdef public int head
     cdef public int dep_tag
 
-    cdef public attr_t id
-    cdef public attr_t cluster
-    cdef public attr_t length
-    cdef public attr_t postype
-    cdef public attr_t sensetype
+    cdef public atom_t id
+    cdef public atom_t cluster
+    cdef public atom_t length
+    cdef public atom_t postype
+    cdef public atom_t sensetype
 
-    cdef public attr_t sic
-    cdef public attr_t norm
-    cdef public attr_t shape
-    cdef public attr_t asciied
-    cdef public attr_t prefix
-    cdef public attr_t suffix
+    cdef public atom_t sic
+    cdef public atom_t norm
+    cdef public atom_t shape
+    cdef public atom_t asciied
+    cdef public atom_t prefix
+    cdef public atom_t suffix
 
     cdef public float prob
 

@@ -2,7 +2,9 @@
 from preshed.maps cimport PreshMap
 from preshed.counter cimport PreshCounter
 
-from .lexeme cimport get_attr, EMPTY_LEXEME, LEMMA, attr_id_t
+from .lexeme cimport get_attr, EMPTY_LEXEME
+from .typedefs cimport attr_id_t, attr_t
+from .typedefs cimport LEMMA
 cimport cython
 
 import numpy as np

@@ -30,8 +32,8 @@ cdef class Tokens:
     >>> from spacy.en import EN
     >>> tokens = EN.tokenize('An example sentence.')
     """
-    def __init__(self, StringStore string_store, string_length=0):
-        self.string_store = string_store
+    def __init__(self, Vocab vocab, string_length=0):
+        self.vocab = vocab
         if string_length >= 3:
             size = int(string_length / 3.0)
         else:

@@ -50,7 +52,7 @@ cdef class Tokens:
 
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self.string_store, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.vocab.strings, i, self.data[i].idx, self.data[i].pos,
                      self.data[i].lemma, self.data[i].head, self.data[i].dep_tag,
                      self.data[i].lex[0])
 

@@ -119,10 +121,10 @@ cdef class Token:
                   int pos, int lemma, int head, int dep_tag, dict lex):
         self.string_store = string_store
         self.idx = idx
-        self.pos = pos
+        self.pos_id = pos
         self.i = i
         self.head = head
-        self.dep_tag = dep_tag
+        self.dep_id = dep_tag
         self.id = lex['id']
 
         self.lemma = lemma

@@ -154,6 +156,9 @@ cdef class Token:
         cdef bytes utf8string = self.string_store[self.lemma]
         return utf8string.decode('utf8')
 
+    property dep:
+        def __get__(self):
+            return self.string_store.dep_tags[self.dep]
     property pos:
         def __get__(self):
-            return self.lang.pos_tagger.tag_names[self.pos]
+            return self.string_store.pos_tags[self.pos]

@@ -21,6 +21,87 @@ cpdef enum univ_tag_t:
     N_UNIV_TAGS
 
 
+# Reserve 64 values for flag features
+cpdef enum attr_id_t:
+    FLAG0
+    FLAG1
+    FLAG2
+    FLAG3
+    FLAG4
+    FLAG5
+    FLAG6
+    FLAG7
+    FLAG8
+    FLAG9
+    FLAG10
+    FLAG11
+    FLAG12
+    FLAG13
+    FLAG14
+    FLAG15
+    FLAG16
+    FLAG17
+    FLAG18
+    FLAG19
+    FLAG20
+    FLAG21
+    FLAG22
+    FLAG23
+    FLAG24
+    FLAG25
+    FLAG26
+    FLAG27
+    FLAG28
+    FLAG29
+    FLAG30
+    FLAG31
+    FLAG32
+    FLAG33
+    FLAG34
+    FLAG35
+    FLAG36
+    FLAG37
+    FLAG38
+    FLAG39
+    FLAG40
+    FLAG41
+    FLAG42
+    FLAG43
+    FLAG44
+    FLAG45
+    FLAG46
+    FLAG47
+    FLAG48
+    FLAG49
+    FLAG50
+    FLAG51
+    FLAG52
+    FLAG53
+    FLAG54
+    FLAG55
+    FLAG56
+    FLAG57
+    FLAG58
+    FLAG59
+    FLAG60
+    FLAG61
+    FLAG62
+    FLAG63
+
+    ID
+    SIC
+    DENSE
+    SHAPE
+    PREFIX
+    SUFFIX
+
+    LENGTH
+    CLUSTER
+    POS_TYPE
+    LEMMA
+
+
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
 ctypedef uint32_t attr_t

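Reserving FLAG0–FLAG63 as the first 64 values of `attr_id_t` means a flag's attribute id doubles as its bit position in a lexeme's 64-bit `flags_t` field, and the dense attributes (ID, SIC, and so on) begin at 64. A sketch of the check this layout enables (inferred from the enum layout, not shown in this commit):

    # Because FLAG0..FLAG63 occupy enum values 0..63, testing a flag is a
    # single shift-and-mask on the 64-bit flags field.
    def check_flag(flags, flag_id):
        return bool(flags & (1 << flag_id))

    flags = (1 << 3) | (1 << 9)               # a lexeme with FLAG3 and FLAG9 set
    assert check_flag(flags, 3) and not check_flag(flags, 4)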