spaCy/spacy/language.py

386 lines
15 KiB
Python
Raw Normal View History

# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import dill
2017-05-18 09:25:19 +00:00
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .train import Trainer
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
2017-04-16 23:46:14 +00:00
from .compat import json_dumps
from .attrs import IS_STOP
2017-05-08 21:58:31 +00:00
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
2017-05-08 22:58:10 +00:00
from .lang.lex_attrs import LEX_ATTRS
from . import util
2016-09-24 18:26:17 +00:00
class BaseDefaults(object):
    """Default settings, data and factory methods for a `Language` object.

    The class attributes at the bottom (tokenizer rules, tag map, stop words,
    lemmatizer tables, ...) are the data the `create_*` factories read. All
    factories are classmethods that accept an optional `nlp` object, so they
    can be called with or without an existing pipeline.
    """
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        """Build a lemmatizer from the class-level lemma tables.

        nlp (Language): Not used by this factory.
        RETURNS (Lemmatizer): Built from `lemma_index`, `lemma_exc` and
            `lemma_rules`.
        """
        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)

    @classmethod
    def create_vocab(cls, nlp=None):
        """Build a vocab from the class-level lexical data.

        nlp (Language): Only forwarded to `create_lemmatizer`.
        RETURNS (Vocab): A vocab using the class's lexical attribute getters,
            tag map and lemmatizer, with the special cases from `morph_rules`
            registered on its morphology.
        """
        lemmatizer = cls.create_lemmatizer(nlp)
        # Copy, so that adding the IS_STOP getter doesn't modify the shared
        # class-level table.
        lex_attr_getters = dict(cls.lex_attr_getters)
        # This is messy, but it's the minimal working fix to Issue #639.
        lex_attr_getters[IS_STOP] = lambda string: string.lower() in cls.stop_words
        vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=cls.tag_map,
                      lemmatizer=lemmatizer)
        for tag_str, exc in cls.morph_rules.items():
            for orth_str, attrs in exc.items():
                vocab.morphology.add_special_case(tag_str, orth_str, attrs)
        return vocab

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build a tokenizer from the class-level tokenization rules.

        nlp (Language): If given, its `vocab` is used. Otherwise a new vocab
            is created via `create_vocab`.
        RETURNS (Tokenizer): The tokenizer. A prefix, suffix or infix matcher
            is `None` when the corresponding rule set is empty.
        """
        rules = cls.tokenizer_exceptions
        token_match = cls.token_match
        prefix_search = util.compile_prefix_regex(cls.prefixes).search \
                        if cls.prefixes else None
        suffix_search = util.compile_suffix_regex(cls.suffixes).search \
                        if cls.suffixes else None
        infix_finditer = util.compile_infix_regex(cls.infixes).finditer \
                         if cls.infixes else None
        vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        return Tokenizer(vocab, rules=rules,
                         prefix_search=prefix_search, suffix_search=suffix_search,
                         infix_finditer=infix_finditer, token_match=token_match)

    @classmethod
    def create_tagger(cls, nlp=None, **cfg):
        """Build a `NeuralTagger`.

        nlp (Language): If given, its `vocab` is used. Otherwise a new vocab
            is created via `create_vocab`.
        **cfg: Passed through to `NeuralTagger`.
        RETURNS (NeuralTagger): The tagger.
        """
        vocab = cls.create_vocab(nlp) if nlp is None else nlp.vocab
        return NeuralTagger(vocab, **cfg)

    @classmethod
    def create_parser(cls, nlp=None, **cfg):
        """Build a `NeuralDependencyParser`.

        nlp (Language): If given, its `vocab` is used. Otherwise a new vocab
            is created via `create_vocab`.
        **cfg: Passed through to `NeuralDependencyParser`.
        RETURNS (NeuralDependencyParser): The parser.
        """
        vocab = cls.create_vocab(nlp) if nlp is None else nlp.vocab
        return NeuralDependencyParser(vocab, **cfg)

    @classmethod
    def create_entity(cls, nlp=None, **cfg):
        """Build a `NeuralEntityRecognizer`.

        nlp (Language): If given, its `vocab` is used. Otherwise a new vocab
            is created via `create_vocab`.
        **cfg: Passed through to `NeuralEntityRecognizer`.
        RETURNS (NeuralEntityRecognizer): The entity recognizer.
        """
        vocab = cls.create_vocab(nlp) if nlp is None else nlp.vocab
        return NeuralEntityRecognizer(vocab, **cfg)

    @classmethod
    def create_pipeline(cls, nlp=None):
        """Build the pipeline components listed in `cls.pipeline`.

        nlp (Language): Passed to every factory. If given, `nlp.meta[entry]`
            supplies the keyword arguments for the factory of `entry`.
        RETURNS (list): The created components, in the order of
            `cls.pipeline`.
        """
        meta = nlp.meta if nlp is not None else {}
        # Resolve strings, like "cnn", "lstm", etc
        # NOTE(review): `pipeline` is not defined on `BaseDefaults` in this
        # file; presumably subclasses are expected to supply it -- confirm.
        pipeline = []
        for entry in cls.pipeline:
            # `cls` is the defaults class itself, so the factories live on
            # `cls`. It has no `Defaults` attribute (that is on `Language`).
            factory = cls.factories[entry]
            pipeline.append(factory(nlp, **meta.get(entry, {})))
        return pipeline

    # Maps pipeline IDs to callables taking `(nlp, **cfg)`.
    # NOTE(review): inside the class body `create_tokenizer` is still the raw
    # `classmethod` object, which is not directly callable -- confirm that the
    # 'make_doc' entry is never invoked through this table.
    factories = {
        'make_doc': create_tokenizer,
        'token_vectors': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
        'tags': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
        'dependencies': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
        'entities': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
    }

    # Tokenizer data, read by `create_tokenizer`.
    token_match = TOKEN_MATCH
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    tokenizer_exceptions = {}
    parser_features = get_templates('parser')
    entity_features = get_templates('ner')
    tagger_features = Tagger.feature_templates  # TODO -- fix this
    stop_words = set()
    # Lemmatizer tables, read by `create_lemmatizer`.
    lemma_rules = {}
    lemma_exc = {}
    lemma_index = {}
    # Morphology special cases, read by `create_vocab`.
    morph_rules = {}
    lex_attr_getters = LEX_ATTRS
2015-09-14 07:48:51 +00:00
class Language(object):
    """A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your application.

    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
    lang (unicode): Two-letter language ID, i.e. ISO code.
    """
    Defaults = BaseDefaults
    lang = None

    def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
        """Initialise a Language object.

        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
            `Language.Defaults.create_vocab`.
        make_doc (callable): A function that takes text and returns a `Doc`
            object. Usually a `Tokenizer`. If `True`, one is created via
            `Language.Defaults.create_tokenizer`.
        pipeline (list): A list of annotation processes or IDs of annotation
            processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
            up in `Language.Defaults.factories`. If `True`, the pipeline is
            created via `Language.Defaults.create_pipeline`.
        meta (dict): Custom meta data for the Language class. Is written to by
            models to add model meta data.
        RETURNS (Language): The newly constructed object.
        """
        # The default `meta` dict is only read and copied here, never mutated.
        self.meta = dict(meta)
        if vocab is True:
            factory = self.Defaults.create_vocab
            vocab = factory(self, **meta.get('vocab', {}))
        self.vocab = vocab
        if make_doc is True:
            # The tokenizer factory reads `self.vocab`, so the vocab has to be
            # assigned before this point.
            factory = self.Defaults.create_tokenizer
            make_doc = factory(self, **meta.get('tokenizer', {}))
        self.make_doc = make_doc
        if pipeline is True:
            self.pipeline = self.Defaults.create_pipeline(self)
        elif pipeline:
            self.pipeline = list(pipeline)
            # Resolve strings, like "cnn", "lstm", etc
            for i, entry in enumerate(self.pipeline):
                if entry in self.Defaults.factories:
                    factory = self.Defaults.factories[entry]
                    self.pipeline[i] = factory(self, **meta.get(entry, {}))
        else:
            self.pipeline = []

    def __call__(self, text, state=None, **disabled):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        text (unicode): The text to be processed.
        state: Passed to the first pipeline component. Each component's return
            value is passed on as the `state` of the next one.
        **disabled: Elements of the pipeline that should not be run. A
            component is skipped if its `name` is given with a false value.
        RETURNS (Doc): A container for accessing the annotations.

        EXAMPLE:
            >>> tokens = nlp('An example sentence. Another example sentence.')
            >>> tokens[0].text, tokens[0].head.tag_
            ('An', 'NN')
        """
        doc = self.make_doc(text)
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue
            state = proc(doc, state=state)
        return doc

    def update(self, docs, golds, state=None, drop=0., sgd=None):
        """Update the models in the pipeline.

        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of `GoldParse` objects.
        state: State passed to the first component. Defaults to an empty dict.
        drop (float): The dropout rate.
        sgd (callable): An optimizer.
        RETURNS (dict): Results from the update, i.e. the state returned by
            the last component that has an `update` method.

        EXAMPLE:
            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
            >>>    for epoch in trainer.epochs(gold):
            >>>        for docs, golds in epoch:
            >>>            state = nlp.update(docs, golds, sgd=optimizer)
        """
        grads = {}

        def get_grads(W, dW, key=None):
            # Collect the gradients instead of applying them right away, so
            # the optimizer runs once, after all components were updated.
            grads[key] = (W, dW)

        state = {} if state is None else state
        for process in self.pipeline:
            if hasattr(process, 'update'):
                state = process.update(docs, golds, state=state, drop=drop,
                                       sgd=get_grads)
            else:
                process(docs, state=state)
        if sgd is not None:
            for key, (W, dW) in grads.items():
                # TODO: Unhack this when thinc improves
                if isinstance(W, numpy.ndarray):
                    sgd.ops = NumpyOps()
                else:
                    sgd.ops = CupyOps()
                sgd(W, dW, key=key)
        return state

    @contextmanager
    def begin_training(self, gold_tuples, **cfg):
        """Allocate models, pre-process training data and acquire a trainer and
        optimizer. Used as a contextmanager.

        gold_tuples (iterable): Gold-standard training data.
        **cfg: Config parameters.
        YIELDS (tuple): A trainer and an optimizer.

        EXAMPLE:
            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
            >>>    for epoch in trainer.epochs(gold):
            >>>        for docs, golds in epoch:
            >>>            state = nlp.update(docs, golds, sgd=optimizer)
        """
        # Populate vocab: looking a word up is done for its side effect.
        for _, annots_brackets in gold_tuples:
            for annots, _ in annots_brackets:
                for word in annots[1]:
                    self.vocab[word]
        # Handle crossing dependencies
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
        if cfg.get('use_gpu'):
            Model.ops = CupyOps()
            Model.Ops = CupyOps
            print("Use GPU")
        for proc in self.pipeline:
            if hasattr(proc, 'begin_training'):
                proc.begin_training(gold_tuples, pipeline=self.pipeline)
        trainer = Trainer(self, gold_tuples, **cfg)
        yield trainer, trainer.optimizer

    @contextmanager
    def use_params(self, params, **cfg):
        """Replace weights of models in the pipeline with those provided in the
        params dictionary. Can be used as a contextmanager, in which case,
        models go back to their original weights after the block, even if the
        block raises.

        params (dict): A dictionary of parameters keyed by model ID.
        **cfg: Config parameters.

        EXAMPLE:
            >>> with nlp.use_params(optimizer.averages):
            >>>     nlp.to_disk('/tmp/checkpoint')
        """
        contexts = [pipe.use_params(params) for pipe
                    in self.pipeline if hasattr(pipe, 'use_params')]

        def advance_all():
            # TODO: Having trouble with contextlib
            # Workaround: these aren't actually context managers atm, so each
            # one is stepped by hand.
            for context in contexts:
                try:
                    next(context)
                except StopIteration:
                    pass

        advance_all()
        try:
            yield
        finally:
            # Step the generators a second time, also when the `with` body
            # raised, so the models don't keep the temporary weights.
            advance_all()

    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
        """Process texts as a stream, and yield `Doc` objects in order. Supports
        GIL-free multi-threading.

        texts (iterator): A sequence of texts to process. Items that are not
            strings are passed to the pipeline unchanged.
        n_threads (int): The number of worker threads to use. If -1, OpenMP will
            decide how many to use at run time. Default is 2.
        batch_size (int): The number of texts to buffer.
        **disabled: Pipeline components to exclude. A component is skipped if
            its `name` is given with a false value.
        YIELDS (Doc): Documents in the order of the original text.

        EXAMPLE:
            >>> texts = [u'One document.', u'...', u'Lots of documents']
            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
            >>>    assert doc.is_parsed
        """
        # `type('')` is the text type on Python 2 (with unicode_literals) and
        # on Python 3. Strings are tokenized like in `__call__`; everything
        # else goes through as it is.
        string_types = (type(''), bytes)
        stream = ((self.make_doc(item) if isinstance(item, string_types)
                   else item, {}) for item in texts)
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue
            if hasattr(proc, 'pipe'):
                stream = proc.pipe(stream, n_threads=n_threads,
                                   batch_size=batch_size)
            else:
                stream = self._apply_component(proc, stream)
        for doc, state in stream:
            yield doc

    @staticmethod
    def _apply_component(proc, stream):
        """Lazily run one component over a stream of `(doc, state)` pairs.

        This is a separate generator function, so `proc` is bound per
        component. A generator expression inside the loop of `pipe` would look
        `proc` up when consumed, i.e. always see the last component.

        proc (callable): Called as `proc(doc, state=state)`, like in
            `__call__`. Its return value becomes the new state.
        stream (iterable): Pairs of `(doc, state)`.
        YIELDS (tuple): `(doc, new_state)` pairs.
        """
        for doc, state in stream:
            yield doc, proc(doc, state=state)

    def to_disk(self, path, **exclude):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or `Path`-like objects.
        **exclude: Named attributes to prevent from being saved.

        EXAMPLE:
            >>> nlp.to_disk('/path/to/models')
        """
        path = util.ensure_path(path)
        if not path.exists():
            path.mkdir()
        if not path.is_dir():
            raise IOError("Output path must be a directory")
        # Attributes with a `to_disk` method write themselves to `path / name`.
        # All other attributes are collected and pickled into one file.
        props = {}
        for name, value in self.__dict__.items():
            if name in exclude:
                continue
            if hasattr(value, 'to_disk'):
                value.to_disk(path / name)
            else:
                props[name] = value
        with (path / 'props.pickle').open('wb') as file_:
            dill.dump(props, file_)

    def from_disk(self, path, **exclude):
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        **exclude: Named attributes to prevent from being loaded.
        RETURNS (Language): The modified `Language` object.

        EXAMPLE:
            >>> from spacy.language import Language
            >>> nlp = Language().from_disk('/path/to/models')
        """
        path = util.ensure_path(path)
        for entry in path.iterdir():
            # `to_disk` writes each attribute to `path / name`, so the last
            # path component is the attribute name.
            name = entry.name
            if name not in exclude and hasattr(self, name):
                getattr(self, name).from_disk(entry)
        with (path / 'props.pickle').open('rb') as file_:
            bytes_data = file_.read()
        self.from_bytes(bytes_data, **exclude)
        return self

    def to_bytes(self, **exclude):
        """Serialize the current state to a binary string.

        **exclude: Named attributes to prevent from being serialized.
        RETURNS (bytes): The serialized form of the `Language` object.
        """
        props = dict(self.__dict__)
        for key in exclude:
            props.pop(key, None)
        return dill.dumps(props, -1)

    def from_bytes(self, bytes_data, **exclude):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from.
        **exclude: Named attributes to prevent from being loaded.
        RETURNS (Language): The `Language` object.
        """
        # NOTE: Unpickling can execute arbitrary code. Only load data from
        # sources you trust.
        props = dill.loads(bytes_data)
        for key, value in props.items():
            if key not in exclude:
                setattr(self, key, value)
        return self