spaCy/spacy/language.py

587 lines
23 KiB
Python
Raw Normal View History

# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import dill
2017-05-18 09:25:19 +00:00
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
2017-05-25 01:10:54 +00:00
from thinc.neural.optimizers import Adam, SGD
import random
2017-05-29 11:42:55 +00:00
import ujson
2017-05-29 13:40:45 +00:00
from collections import OrderedDict
2017-07-25 16:57:59 +00:00
import itertools
2017-05-18 09:25:19 +00:00
from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .syntax.parser import get_templates
2017-05-22 10:14:59 +00:00
from .syntax import nonproj
from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
2017-05-21 22:52:30 +00:00
from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
2017-07-21 23:13:36 +00:00
from .pipeline import TextCategorizer
2017-07-22 22:50:18 +00:00
from . import about
2017-07-25 16:57:59 +00:00
from .compat import json_dumps, izip
from .attrs import IS_STOP
2017-05-08 21:58:31 +00:00
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
2017-05-08 22:58:10 +00:00
from .lang.lex_attrs import LEX_ATTRS
from . import util
2017-05-21 14:07:06 +00:00
from .scorer import Scorer
2017-09-23 01:11:52 +00:00
from ._ml import link_vectors_to_models
2016-09-24 18:26:17 +00:00
class BaseDefaults(object):
2016-10-18 14:18:25 +00:00
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
2016-10-18 14:18:25 +00:00
@classmethod
def create_vocab(cls, nlp=None):
lemmatizer = cls.create_lemmatizer(nlp)
lex_attr_getters = dict(cls.lex_attr_getters)
# This is messy, but it's the minimal working fix to Issue #639.
lex_attr_getters[IS_STOP] = lambda string: string.lower() in cls.stop_words
vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=cls.tag_map,
lemmatizer=lemmatizer)
2017-03-15 14:24:40 +00:00
for tag_str, exc in cls.morph_rules.items():
for orth_str, attrs in exc.items():
vocab.morphology.add_special_case(tag_str, orth_str, attrs)
return vocab
2016-12-18 15:54:52 +00:00
2016-10-18 14:18:25 +00:00
@classmethod
def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions
token_match = cls.token_match
prefix_search = util.compile_prefix_regex(cls.prefixes).search \
if cls.prefixes else None
suffix_search = util.compile_suffix_regex(cls.suffixes).search \
if cls.suffixes else None
infix_finditer = util.compile_infix_regex(cls.infixes).finditer \
if cls.infixes else None
2016-10-18 14:18:25 +00:00
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
2016-11-26 11:36:04 +00:00
return Tokenizer(vocab, rules=rules,
2016-10-18 14:18:25 +00:00
prefix_search=prefix_search, suffix_search=suffix_search,
infix_finditer=infix_finditer, token_match=token_match)
@classmethod
def create_tagger(cls, nlp=None, **cfg):
if nlp is None:
return NeuralTagger(cls.create_vocab(nlp), **cfg)
else:
return NeuralTagger(nlp.vocab, **cfg)
@classmethod
def create_parser(cls, nlp=None, **cfg):
if nlp is None:
return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
else:
return NeuralDependencyParser(nlp.vocab, **cfg)
@classmethod
def create_entity(cls, nlp=None, **cfg):
if nlp is None:
return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
else:
return NeuralEntityRecognizer(nlp.vocab, **cfg)
2016-10-18 14:18:25 +00:00
@classmethod
def create_pipeline(cls, nlp=None, disable=tuple()):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
2017-08-19 19:58:57 +00:00
for entry in meta.get('pipeline', []):
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
pipeline.append(factory(nlp, **meta.get(entry, {})))
return pipeline
factories = {
'make_doc': create_tokenizer,
2017-05-31 11:42:39 +00:00
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'parser': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
2017-07-21 23:13:36 +00:00
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
2017-05-31 11:42:39 +00:00
# Temporary compatibility -- delete after pivot
2017-05-21 23:43:31 +00:00
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'dependencies': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize,
],
2017-05-21 23:43:31 +00:00
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
}
2017-05-08 21:58:31 +00:00
token_match = TOKEN_MATCH
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
tag_map = dict(TAG_MAP)
tokenizer_exceptions = {}
parser_features = get_templates('parser')
entity_features = get_templates('ner')
2016-10-18 14:18:25 +00:00
tagger_features = Tagger.feature_templates # TODO -- fix this
2016-09-24 18:26:17 +00:00
stop_words = set()
2016-12-18 14:50:09 +00:00
lemma_rules = {}
lemma_exc = {}
lemma_index = {}
2017-03-15 14:24:40 +00:00
morph_rules = {}
2017-05-08 22:58:10 +00:00
lex_attr_getters = LEX_ATTRS
syntax_iterators = {}
2015-09-14 07:48:51 +00:00
class Language(object):
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (unicode): Two-letter language ID, i.e. ISO code.
"""
2016-09-24 18:26:17 +00:00
Defaults = BaseDefaults
lang = None
2015-08-25 13:37:17 +00:00
2017-07-22 22:50:18 +00:00
def __init__(self, vocab=True, make_doc=True, pipeline=None,
meta={}, disable=tuple(), **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
`Language.Defaults.create_vocab`.
make_doc (callable): A function that takes text and returns a `Doc`
object. Usually a `Tokenizer`.
pipeline (list): A list of annotation processes or IDs of annotation,
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
up in `Language.Defaults.factories`.
disable (list): A list of component names to exclude from the pipeline.
The disable list has priority over the pipeline list -- if the same
string occurs in both, the component is not loaded.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
2017-07-22 22:50:18 +00:00
self._meta = dict(meta)
if vocab is True:
factory = self.Defaults.create_vocab
vocab = factory(self, **meta.get('vocab', {}))
self.vocab = vocab
if make_doc is True:
factory = self.Defaults.create_tokenizer
make_doc = factory(self, **meta.get('tokenizer', {}))
2017-05-29 13:40:45 +00:00
self.tokenizer = make_doc
if pipeline is True:
self.pipeline = self.Defaults.create_pipeline(self, disable)
elif pipeline:
# Careful not to do getattr(p, 'name', None) here
# If we had disable=[None], we'd disable everything!
self.pipeline = [p for p in pipeline
if p not in disable
and getattr(p, 'name', p) not in disable]
# Resolve strings, like "cnn", "lstm", etc
for i, entry in enumerate(self.pipeline):
if entry in self.Defaults.factories:
factory = self.Defaults.factories[entry]
self.pipeline[i] = factory(self, **meta.get(entry, {}))
else:
self.pipeline = []
2017-05-21 23:43:31 +00:00
flat_list = []
for pipe in self.pipeline:
if isinstance(pipe, list):
flat_list.extend(pipe)
else:
flat_list.append(pipe)
self.pipeline = flat_list
self._optimizer = None
2017-07-22 22:50:18 +00:00
@property
def meta(self):
self._meta.setdefault('lang', self.vocab.lang)
self._meta.setdefault('name', '')
self._meta.setdefault('version', '0.0.0')
self._meta.setdefault('spacy_version', about.__version__)
self._meta.setdefault('description', '')
self._meta.setdefault('author', '')
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
pipeline = []
for component in self.pipeline:
if hasattr(component, 'name'):
pipeline.append(component.name)
self._meta['pipeline'] = pipeline
return self._meta
@meta.setter
def meta(self, value):
self._meta = value
# Conveniences to access pipeline components
@property
def tensorizer(self):
return self.get_component('tensorizer')
@property
def tagger(self):
return self.get_component('tagger')
@property
def parser(self):
return self.get_component('parser')
@property
def entity(self):
return self.get_component('ner')
@property
def matcher(self):
return self.get_component('matcher')
2017-08-20 20:58:34 +00:00
def get_component(self, name):
if self.pipeline in (True, None):
return None
for proc in self.pipeline:
if hasattr(proc, 'name') and proc.name.endswith(name):
return proc
return None
def __call__(self, text, disable=[]):
2017-05-21 18:46:23 +00:00
"""'Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
2015-08-25 13:37:17 +00:00
is preserved.
2016-12-18 15:54:52 +00:00
text (unicode): The text to be processed.
disable (list): Names of the pipeline components to disable.
RETURNS (Doc): A container for accessing the annotations.
2016-11-01 11:25:36 +00:00
EXAMPLE:
2016-11-01 11:25:36 +00:00
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].text, tokens[0].head.tag_
2016-11-01 11:25:36 +00:00
('An', 'NN')
2015-08-25 13:37:17 +00:00
"""
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disable:
continue
2017-05-28 13:11:58 +00:00
doc = proc(doc)
return doc
2015-08-25 13:37:17 +00:00
2017-05-29 13:40:45 +00:00
def make_doc(self, text):
return self.tokenizer(text)
2017-09-26 10:41:35 +00:00
def update(self, docs, golds, drop=0., sgd=None, losses=None):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The droput rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
2017-08-01 20:10:17 +00:00
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds "
"Got: %d, %d" % (len(docs), len(golds)))
if len(docs) == 0:
return
if sgd is None:
if self._optimizer is None:
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
2017-05-25 01:10:54 +00:00
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
pipes = list(self.pipeline)
random.shuffle(pipes)
for proc in pipes:
2017-05-21 23:43:31 +00:00
if not hasattr(proc, 'update'):
continue
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
2017-08-20 20:58:34 +00:00
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
2017-05-21 14:07:06 +00:00
def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,
it handles nonprojectivity and adds missing tags to the tag map.
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
"""
2017-05-21 14:07:06 +00:00
for proc in self.pipeline:
if hasattr(proc, 'preprocess_gold'):
docs_golds = proc.preprocess_gold(docs_golds)
for doc, gold in docs_golds:
yield doc, gold
2017-09-21 00:15:20 +00:00
def resume_training(self, **cfg):
if cfg.get('device', -1) >= 0:
device = util.use_gpu(cfg['device'])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(
self.vocab.vectors.data)
else:
device = None
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
def begin_training(self, get_gold_tuples=None, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
get_gold_tuples (function): Function returning gold data
**cfg: Config parameters.
returns: An optimizer
"""
# Populate vocab
2017-09-21 00:15:20 +00:00
if get_gold_tuples is not None:
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
contexts = []
2017-06-03 21:10:23 +00:00
if cfg.get('device', -1) >= 0:
2017-09-21 00:15:20 +00:00
device = util.use_gpu(cfg['device'])
2017-09-18 23:04:16 +00:00
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(
self.vocab.vectors.data)
2017-06-03 21:10:23 +00:00
else:
device = None
2017-09-23 01:11:52 +00:00
link_vectors_to_models(self.vocab)
for proc in self.pipeline:
if hasattr(proc, 'begin_training'):
2017-05-21 14:07:06 +00:00
context = proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline)
contexts.append(context)
2017-05-25 16:19:26 +00:00
learn_rate = util.env_opt('learn_rate', 0.001)
beta1 = util.env_opt('optimizer_B1', 0.9)
beta2 = util.env_opt('optimizer_B2', 0.999)
eps = util.env_opt('optimizer_eps', 1e-08)
L2 = util.env_opt('L2_penalty', 1e-6)
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
beta2=beta2, eps=eps)
self._optimizer.max_grad_norm = max_grad_norm
self._optimizer.device = device
return self._optimizer
2017-05-21 14:07:06 +00:00
def evaluate(self, docs_golds):
scorer = Scorer()
2017-08-18 20:26:12 +00:00
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
else:
docs = list(pipe.pipe(docs))
assert len(docs) == len(golds)
for doc, gold in zip(docs, golds):
2017-05-21 14:07:06 +00:00
scorer.score(doc, gold)
return scorer
2017-05-18 09:25:19 +00:00
@contextmanager
def use_params(self, params, **cfg):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
**cfg: Config parameters.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
2017-05-18 13:30:59 +00:00
contexts = [pipe.use_params(params) for pipe
in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
for context in contexts:
try:
next(context)
except StopIteration:
pass
2017-05-18 09:25:19 +00:00
yield
for context in contexts:
try:
2017-05-18 13:30:59 +00:00
next(context)
2017-05-18 09:25:19 +00:00
except StopIteration:
pass
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
as_tuples (bool):
If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
EXAMPLE:
>>> texts = [u'One document.', u'...', u'Lots of documents']
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
if as_tuples:
2017-07-25 16:57:59 +00:00
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
contexts = (tc[1] for tc in text_context2)
docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
disable=disable)
for doc, context in izip(docs, contexts):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
if name in disable:
continue
if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
else:
2017-05-21 23:43:31 +00:00
# Apply the function, but yield the doc
docs = _pipe(proc, docs)
for doc in docs:
yield doc
2017-05-31 11:42:39 +00:00
def to_disk(self, path, disable=tuple()):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
2017-04-16 23:40:26 +00:00
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
2017-05-31 11:42:39 +00:00
disable (list): Names of pipeline components to disable and prevent
from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
"""
path = util.ensure_path(path)
2017-05-31 11:42:39 +00:00
serializers = OrderedDict((
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
2017-09-24 10:01:45 +00:00
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
2017-05-31 11:42:39 +00:00
util.to_disk(path, serializers, {p: False for p in disable})
def from_disk(self, path, disable=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
model will be loaded.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The modified `Language` object.
EXAMPLE:
>>> from spacy.language import Language
>>> nlp = Language().from_disk('/path/to/models')
"""
path = util.ensure_path(path)
2017-05-31 11:42:39 +00:00
deserializers = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)),
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
2017-06-01 12:38:35 +00:00
exclude = {p: False for p in disable}
if not (path / 'vocab').exists():
exclude['vocab'] = True
util.from_disk(path, deserializers, exclude)
2017-05-31 11:42:39 +00:00
return self
def to_bytes(self, disable=[]):
"""Serialize the current state to a binary string.
2016-12-18 15:54:52 +00:00
disable (list): Nameds of pipeline components to disable and prevent
from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
2017-05-29 13:40:45 +00:00
serializers = OrderedDict((
('vocab', lambda: self.vocab.to_bytes()),
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
2017-05-29 18:23:28 +00:00
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
2017-05-29 11:38:20 +00:00
return util.to_bytes(serializers, {})
def from_bytes(self, bytes_data, disable=[]):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
disable (list): Names of the pipeline components to disable.
RETURNS (Language): The `Language` object.
"""
2017-05-29 13:40:45 +00:00
deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
('meta', lambda b: self.meta.update(ujson.loads(b)))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
continue
2017-05-29 13:40:45 +00:00
if not hasattr(proc, 'from_bytes'):
continue
2017-05-29 18:23:28 +00:00
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
msg = util.from_bytes(bytes_data, deserializers, {})
return self
2017-05-21 23:43:31 +00:00
2017-05-21 23:43:31 +00:00
def _pipe(func, docs):
for doc in docs:
func(doc)
yield doc