mirror of https://github.com/explosion/spaCy.git
Refactor so that the tokenizer data is read from Python data, rather than from disk
parent d7e9acdcdf
commit 95aaea0d3f
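
In outline, the change looks like this. The sketch below is illustrative only (the helper names are hypothetical, not from the commit): before, affix patterns were read out of a text file shipped with the data directory at load time; after, they are ordinary Python data compiled to a regex in memory.

import re

# Hypothetical before/after sketch -- not code from this commit.

# Before: affix entries were read from a data file on disk at load time.
def prefix_search_from_disk(loc):
    with open(loc) as file_:
        entries = file_.read().split('\n')
    expression = '|'.join('^' + re.escape(piece) for piece in entries if piece.strip())
    return re.compile(expression).search

# After: affix entries are plain Python data, compiled in memory.
PREFIX_ENTRIES = ['(', '[', '"', "'"]   # illustrative entries only

def prefix_search_from_data(entries):
    expression = '|'.join('^' + re.escape(piece) for piece in entries if piece.strip())
    return re.compile(expression).search

search = prefix_search_from_data(PREFIX_ENTRIES)
print(search('"Hello').end())   # 1 -- the leading quote matches as a prefix
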
@@ -18,7 +18,7 @@ class German(Language):
             vectors = self.Vectors()
             # set a dummy lemmatizer for now that simply returns the same string
             # until the morphology is done for German
-            return Vocab.load(self.path, get_lex_attr=lex_attr_getters, vectors=vectors,
+            return Vocab.load(self.path, lex_attr_getters=lex_attr_getters, vectors=vectors,
                 lemmatizer=False)

         stop_words = set()

@@ -3,37 +3,57 @@ from __future__ import unicode_literals, print_function
 from os import path

 from ..language import Language

+from . import language_data
+from .. import util
+from ..lemmatizer import Lemmatizer
+from ..vocab import Vocab
+from ..tokenizer import Tokenizer
+

 class English(Language):
     lang = 'en'

     class Defaults(Language.Defaults):
+        def Vocab(self, lex_attr_getters=True, tag_map=True,
+                  lemmatizer=True, serializer_freqs=True, vectors=True):
+            if lex_attr_getters is True:
+                lex_attr_getters = self.lex_attr_getters
+            if tag_map is True:
+                tag_map = self.tag_map
+            if lemmatizer is True:
+                lemmatizer = self.Lemmatizer()
+            return Vocab.load(self.path, lex_attr_getters=lex_attr_getters,
+                              tag_map=tag_map, lemmatizer=lemmatizer,
+                              serializer_freqs=serializer_freqs)
+
+        def Tokenizer(self, vocab, rules=None, prefix_search=None, suffix_search=None,
+                      infix_finditer=None):
+            if rules is None:
+                rules = self.tokenizer_exceptions
+            if prefix_search is None:
+                prefix_search = util.compile_prefix_regex(self.prefixes).search
+            if suffix_search is None:
+                suffix_search = util.compile_suffix_regex(self.suffixes).search
+            if infix_finditer is None:
+                infix_finditer = util.compile_infix_regex(self.infixes).finditer
+            return Tokenizer(vocab, rules=rules,
+                             prefix_search=prefix_search, suffix_search=suffix_search,
+                             infix_finditer=infix_finditer)
+
+        def Lemmatizer(self):
+            return Lemmatizer.load(self.path)
+
         lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

-        # improved list from Stone, Denis, Kwantes (2010)
-        stop_words = set("""
-a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be
-became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can
-cannot cant co computer con could couldnt cry de describe
-detail did didn do does doesn doing don done down due during
-each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen
-fify fill find fire first five for former formerly forty found four from front full further get give go
-had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie
-if in inc indeed interest into is it its itself keep last latter latterly least less ltd
-just
-kg km
-made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely
-neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off
-often on once one only onto or other others otherwise our ours ourselves out over own part per
-perhaps please put rather re
-quite
-rather really regarding
-same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten
-than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under
-until up unless upon us used using
-various very very via
-was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you
-your yours yourself yourselves
-""".split())
-
+        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)
+
+        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
+
+        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
+
+        infixes = tuple(language_data.TOKENIZER_INFIXES)
+
+        tag_map = dict(language_data.TAG_MAP)
+
+        stop_words = set(language_data.STOP_WORDS)

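With the Defaults class above in place, the English tokenizer's data can be inspected and recompiled as ordinary Python objects. A hedged usage sketch, assuming the module layout at this revision (spacy.en.English, spacy.util) and a checkout of this commit:

from spacy.en import English
from spacy import util

# Tokenizer data now lives on the Defaults class as plain Python objects.
exceptions = English.Defaults.tokenizer_exceptions   # dict of special-case rules
prefixes = English.Defaults.prefixes                 # tuple of prefix patterns

# The affix patterns are compiled to regexes in memory; no prefix.txt is needed.
prefix_search = util.compile_prefix_regex(prefixes).search
suffix_search = util.compile_suffix_regex(English.Defaults.suffixes).search
infix_finditer = util.compile_infix_regex(English.Defaults.infixes).finditer
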
@@ -50,7 +50,7 @@ class BaseDefaults(object):
         lex_attr_getters = dict(self.lex_attr_getters)
         if vectors is None:
             vectors = self.Vectors()
-        return Vocab.load(self.path, get_lex_attr=self.lex_attr_getters, vectors=vectors)
+        return Vocab.load(self.path, lex_attr_getters=self.lex_attr_getters, vectors=vectors)

     def Tokenizer(self, vocab):
         return Tokenizer.load(self.path, vocab)

@@ -84,7 +84,6 @@ class BaseDefaults(object):

     ner_labels = {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}

-
     stop_words = set()

     lex_attr_getters = {

@@ -114,7 +113,6 @@ class BaseDefaults(object):
     }


-
 class Language(object):
     '''A text-processing pipeline. Usually you'll load this once per process, and
     pass the instance around your program.

@@ -270,6 +268,5 @@ class Language(object):
             (HEAD, head_freqs)
         ]))

-
     def get_defaults(self, path):
         return self.Defaults(self.lang, path)

@@ -16,7 +16,7 @@ def matcher():
         'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
         'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
     }
-    return Matcher(Vocab(get_lex_attr=English.Defaults.lex_attr_getters), patterns)
+    return Matcher(Vocab(lex_attr_getters=English.Defaults.lex_attr_getters), patterns)


def test_compile(matcher):

@@ -45,12 +45,18 @@ cdef class Tokenizer:
         if rules is None:
             with (path / 'tokenizer' / 'specials.json').open() as file_:
                 rules = json.load(file_)
-        if prefix_search is None:
-            prefix_search = util.read_prefix_regex(path / 'tokenizer' / 'prefix.txt').search
-        if suffix_search is None:
-            suffix_search = util.read_suffix_regex(path / 'tokenizer' / 'suffix.txt').search
-        if infix_finditer is None:
-            infix_finditer = util.read_infix_regex(path / 'tokenizer' / 'infix.txt').finditer
+        if prefix_search in (None, True):
+            with (path / 'tokenizer' / 'prefix.txt').open() as file_:
+                entries = file_.read().split('\n')
+            prefix_search = util.compile_prefix_regex(entries).search
+        if suffix_search in (None, True):
+            with (path / 'tokenizer' / 'suffix.txt').open() as file_:
+                entries = file_.read().split('\n')
+            suffix_search = util.compile_suffix_regex(entries).search
+        if infix_finditer in (None, True):
+            with (path / 'tokenizer' / 'infix.txt').open() as file_:
+                entries = file_.read().split('\n')
+            infix_finditer = util.compile_infix_regex(entries).finditer
         return cls(vocab, rules, prefix_search, suffix_search, infix_finditer)

@@ -76,8 +82,8 @@ cdef class Tokenizer:
             self.add_special_case(chunk, substrings)

     def __reduce__(self):
-        args = (self.vocab,
-                self._rules,
+        args = (self.vocab,
+                self._rules,
                 self._prefix_re,
                 self._suffix_re,
                 self._infix_re)

@@ -297,7 +303,6 @@ cdef class Tokenizer:

     def find_suffix(self, unicode string):
         match = self.suffix_search(string)
-        print("Suffix", match, string)
         return (match.end() - match.start()) if match is not None else 0

     def _load_special_tokenization(self, special_cases):

@@ -40,6 +40,15 @@ def set_data_path(path):
     _data_path = path


+def or_(val1, val2):
+    if val1 is not None:
+        return val1
+    elif callable(val2):
+        return val2()
+    else:
+        return val2
+
+
 def match_best_version(target_name, target_version, path):
     path = path if not isinstance(path, basestring) else pathlib.Path(path)

@@ -80,26 +89,17 @@ def read_regex(path):
     return re.compile(expression)


-def read_prefix_regex(path):
-    path = path if not isinstance(path, basestring) else pathlib.Path(path)
-    with path.open() as file_:
-        entries = file_.read().split('\n')
+def compile_prefix_regex(entries):
     expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return re.compile(expression)


-def read_suffix_regex(path):
-    path = path if not isinstance(path, basestring) else pathlib.Path(path)
-    with path.open() as file_:
-        entries = file_.read().split('\n')
+def compile_suffix_regex(entries):
     expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
     return re.compile(expression)


-def read_infix_regex(path):
-    path = path if not isinstance(path, basestring) else pathlib.Path(path)
-    with path.open() as file_:
-        entries = file_.read().split('\n')
+def compile_infix_regex(entries):
     expression = '|'.join([piece for piece in entries if piece.strip()])
     return re.compile(expression)

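Note the asymmetry among the three helpers above: prefix entries are regex-escaped and anchored at the start of the string, suffix entries are anchored at the end but not escaped, and infix entries are joined as raw regex alternatives. A small self-contained check (the entries below are illustrative, not spaCy's actual affix data):

import re

def compile_prefix_regex(entries):
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return re.compile(expression)

def compile_suffix_regex(entries):
    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
    return re.compile(expression)

prefix_re = compile_prefix_regex(['(', '"'])        # '(' is escaped, so it matches literally
suffix_re = compile_suffix_regex(["'s", '!', '"'])  # entries here must already be valid regex

print(prefix_re.search('("quoted")').end())   # 1 -- the opening parenthesis is a prefix
print(suffix_re.search('word!').start())      # 4 -- the exclamation mark is a suffix
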
@@ -126,3 +126,9 @@ def normalize_slice(length, start, stop, step=None):

 def utf8open(loc, mode='r'):
     return io.open(loc, mode, encoding='utf8')
+
+
+def check_renamed_kwargs(renamed, kwargs):
+    for old, new in renamed.items():
+        if old in kwargs:
+            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))

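The new check_renamed_kwargs helper is what turns the old get_lex_attr keyword into a hard error instead of a silently ignored argument. A quick self-contained illustration of its behaviour (the call below is constructed for this example, not taken from the commit):

def check_renamed_kwargs(renamed, kwargs):
    for old, new in renamed.items():
        if old in kwargs:
            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))

# Simulates what a caller still passing the removed keyword will hit:
try:
    check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'},
                         {'get_lex_attr': {}})
except TypeError as e:
    print(e)   # Keyword argument get_lex_attr now renamed to lex_attr_getters
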
@@ -31,7 +31,7 @@ cdef class Vocab:
     cdef readonly int length
     cdef public object _serializer
     cdef public object data_dir
-    cdef public object get_lex_attr
+    cdef public object lex_attr_getters
     cdef public object serializer_freqs

     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL

@@ -26,6 +26,8 @@ from . import symbols
 from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
+from . import deprecated
+from . import util


 try:

@@ -47,25 +49,20 @@ cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
     @classmethod
-    def load(cls, path, get_lex_attr=None, vectors=True, lemmatizer=True):
-        if (path / 'vocab' / 'tag_map.json').exists():
+    def load(cls, path, lex_attr_getters=None, vectors=True, lemmatizer=True,
+             tag_map=True, serializer_freqs=None, **deprecated_kwargs):
+        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
+        if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
             with (path / 'vocab' / 'tag_map.json').open() as file_:
                 tag_map = json.load(file_)
-        else:
-            tag_map = {}
-
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        elif not lemmatizer:
-            lemmatizer = lambda string, pos: set((string,))

-        if (path / 'vocab' / 'serializer.json').exists():
+        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
             with (path / 'vocab' / 'serializer.json').open() as file_:
                 serializer_freqs = json.load(file_)
-        else:
-            serializer_freqs = {}

-        cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
+        cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)

         with (path / 'vocab' / 'strings.json').open() as file_:

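A note on the flag convention visible in load above, which the German Defaults also leans on when it passes lemmatizer=False: True means "load the default from the data directory", an arbitrary object means "use it as given", and False/None fall back to a cheap stand-in. The sketch below restates that convention in plain Python; resolve_lemmatizer and load_default are illustrative names, not spaCy API.

def resolve_lemmatizer(lemmatizer, load_default):
    # True -> load the real component; False/None -> no-op stand-in; else use as given.
    if lemmatizer is True:
        return load_default()
    elif not lemmatizer:
        return lambda string, pos: set((string,))
    return lemmatizer

# A dummy loader standing in for the real Lemmatizer.load(path):
noop = resolve_lemmatizer(False, load_default=lambda: None)
print(noop('swimming', 'VERB'))   # {'swimming'} -- surface form passed through unchanged
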
@@ -82,11 +79,16 @@ cdef class Vocab:
             self.vectors_length = vectors(self)
         return self

-    def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
-        if tag_map is None:
-            tag_map = {}
-        if lemmatizer is None:
+    def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
+                 serializer_freqs=None, **deprecated_kwargs):
+        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
+
+        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
+        tag_map = tag_map if tag_map is not None else {}
+        if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer({}, {}, {})
+        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}
+
         self.mem = Pool()
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()

@@ -102,13 +104,12 @@ cdef class Vocab:
         for name in symbols.NAMES + list(sorted(tag_map.keys())):
             if name:
                 _ = self.strings[name]
-        self.get_lex_attr = get_lex_attr
+        self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
         self.serializer_freqs = serializer_freqs

         self.length = 1
         self._serializer = None
-        print("Vocab lang", self.lang)

     property serializer:
         def __get__(self):

@@ -120,8 +121,8 @@ cdef class Vocab:
     property lang:
         def __get__(self):
             langfunc = None
-            if self.get_lex_attr:
-                langfunc = self.get_lex_attr.get(LANG, None)
+            if self.lex_attr_getters:
+                langfunc = self.lex_attr_getters.get(LANG, None)
             return langfunc('_') if langfunc else ''

     def __len__(self):

@@ -169,8 +170,8 @@ cdef class Vocab:
         lex.length = len(string)
         lex.id = self.length
         lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
-        if self.get_lex_attr is not None:
-            for attr, func in self.get_lex_attr.items():
+        if self.lex_attr_getters is not None:
+            for attr, func in self.lex_attr_getters.items():
                 value = func(string)
                 if isinstance(value, unicode):
                     value = self.strings[value]