spaCy/spacy/language.py

258 lines
7.6 KiB
Python

from os import path
try:
import ujson as json
except ImportError:
import json
from .tokenizer import Tokenizer
from .morphology import Morphology
from .vocab import Vocab
from .syntax.parser import Parser
from .tagger import Tagger
from .matcher import Matcher
from .serialize.packer import Packer
from ._ml import Model
from . import attrs
from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
class Language(object):
@staticmethod
def lower(string):
return string.lower()
@staticmethod
def norm(string):
return string
@staticmethod
def shape(string):
return orth.word_shape(string)
@staticmethod
def prefix(string):
return string[0]
@staticmethod
def suffix(string):
return string[-3:]
@staticmethod
def prob(string):
return -30
@staticmethod
def cluster(string):
return 0
@staticmethod
def is_alpha(string):
return orth.is_alpha(string)
@staticmethod
def is_ascii(string):
return orth.is_ascii(string)
@staticmethod
def is_digit(string):
return string.isdigit()
@staticmethod
def is_lower(string):
return orth.is_lower(string)
@staticmethod
def is_punct(string):
return orth.is_punct(string)
@staticmethod
def is_space(string):
return string.isspace()
@staticmethod
def is_title(string):
return orth.is_title(string)
@staticmethod
def is_upper(string):
return orth.is_upper(string)
@staticmethod
def like_url(string):
return orth.like_url(string)
@staticmethod
def like_number(string):
return orth.like_number(string)
@staticmethod
def like_email(string):
return orth.like_email(string)
@classmethod
def default_lex_attrs(cls, data_dir=None):
return {
attrs.LOWER: cls.lower,
attrs.NORM: cls.norm,
attrs.SHAPE: cls.shape,
attrs.PREFIX: cls.prefix,
attrs.SUFFIX: cls.suffix,
attrs.CLUSTER: cls.cluster,
attrs.PROB: lambda string: -10.0,
attrs.IS_ALPHA: cls.is_alpha,
attrs.IS_ASCII: cls.is_ascii,
attrs.IS_DIGIT: cls.is_digit,
attrs.IS_LOWER: cls.is_lower,
attrs.IS_PUNCT: cls.is_punct,
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: cls.is_title,
attrs.IS_UPPER: cls.is_upper,
attrs.LIKE_URL: cls.like_url,
attrs.LIKE_NUM: cls.like_number,
attrs.LIKE_EMAIL: cls.like_email,
attrs.IS_STOP: lambda string: False,
attrs.IS_OOV: lambda string: True
}
@classmethod
def default_dep_labels(cls):
return {0: {'ROOT': True}}
@classmethod
def default_ner_labels(cls):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
@classmethod
def default_morphology(cls, data_dir):
return Morphology.from_dir(data_dir)
@classmethod
def default_vectors(cls, data_dir):
return None
@classmethod
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None):
if data_dir is None:
data_dir = cls.default_data_dir()
if vectors is None:
vectors = cls.default_vectors(data_dir)
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs(data_dir)
return Vocab.from_dir(
path.join(data_dir, 'vocab'),
get_lex_attr=get_lex_attr,
vectors=vectors)
@classmethod
def default_tokenizer(cls, vocab, data_dir):
if path.exists(data_dir):
return Tokenizer.from_dir(vocab, data_dir)
else:
return Tokenizer(vocab, {}, None, None, None)
@classmethod
def default_tagger(cls, vocab, data_dir):
if path.exists(data_dir):
return Tagger.from_dir(data_dir, vocab)
else:
return None
@classmethod
def default_parser(cls, vocab, data_dir):
if path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
else:
return None
@classmethod
def default_entity(cls, vocab, data_dir):
if path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
else:
return None
@classmethod
def default_matcher(cls, vocab, data_dir):
if path.exists(data_dir):
return Matcher.from_dir(data_dir, vocab)
else:
return None
def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
parser=None, entity=None, matcher=None, serializer=None):
if data_dir is None:
data_dir = self.default_data_dir()
if vocab is None:
vocab = self.default_vocab(data_dir)
if tokenizer is None:
tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
if tagger is None:
tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
if entity is None:
entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
if parser is None:
parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
if matcher is None:
matcher = self.default_matcher(vocab, data_dir=data_dir)
self.vocab = vocab
self.tokenizer = tokenizer
self.tagger = tagger
self.parser = parser
self.entity = entity
self.matcher = matcher
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbtrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
Returns:
tokens (spacy.tokens.Doc):
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
('An', 'NN')
"""
tokens = self.tokenizer(text)
if self.tagger and tag:
self.tagger(tokens)
if self.matcher and entity:
self.matcher(tokens)
if self.parser and parse:
self.parser(tokens)
if self.entity and entity:
self.entity(tokens)
return tokens
def end_training(self, data_dir=None):
if data_dir is None:
data_dir = self.data_dir
self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, list(self.tagger.freqs[TAG].items())),
(DEP, list(self.parser.moves.freqs[DEP].items())),
(ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
(ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
(HEAD, list(self.parser.moves.freqs[HEAD].items()))]))