spaCy/spacy/language.py

from os import path

try:
    import ujson as json
except ImportError:
    import json

from .tokenizer import Tokenizer
from .morphology import Morphology
from .vocab import Vocab
from .syntax.parser import Parser
from .tagger import Tagger
from .matcher import Matcher
from .serialize.packer import Packer
from ._ml import Model
from . import attrs
from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager

from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD


class Language(object):
    """A text-processing pipeline: tokenizer, tagger, matcher, parser, NER.

    The static methods are per-string lexical attribute functions, the
    ``default_*`` classmethods build the individual pipeline components from
    a data directory, and calling an instance applies the components to text.

    NOTE(review): presumably language-specific subclasses (e.g. the
    ``spacy.en.English`` class used in the ``__call__`` example) override
    these hooks -- confirm against the subclasses.
    """

    @staticmethod
    def lower(string):
        """Return the lower-cased form of `string`."""
        return string.lower()

    @staticmethod
    def norm(string):
        """Return `string` unchanged (identity normalisation)."""
        return string

    @staticmethod
    def shape(string):
        """Return `orth.word_shape(string)`."""
        return orth.word_shape(string)

    @staticmethod
    def prefix(string):
        """Return the first character of `string`, or '' if it is empty."""
        # Slice instead of indexing so that the empty string does not raise
        # IndexError; this mirrors how `suffix` already behaves.
        return string[:1]

    @staticmethod
    def suffix(string):
        """Return the last three characters of `string` (all of it if shorter)."""
        return string[-3:]

    @staticmethod
    def prob(string):
        """Return the constant -30, regardless of `string`."""
        return -30

    @staticmethod
    def cluster(string):
        """Return the constant 0, regardless of `string`."""
        return 0

    @staticmethod
    def is_alpha(string):
        """Return `orth.is_alpha(string)`."""
        return orth.is_alpha(string)

    @staticmethod
    def is_ascii(string):
        """Return `orth.is_ascii(string)`."""
        return orth.is_ascii(string)

    @staticmethod
    def is_digit(string):
        """Return `string.isdigit()`."""
        return string.isdigit()

    @staticmethod
    def is_lower(string):
        """Return `orth.is_lower(string)`."""
        return orth.is_lower(string)

    @staticmethod
    def is_punct(string):
        """Return `orth.is_punct(string)`."""
        return orth.is_punct(string)

    @staticmethod
    def is_space(string):
        """Return `string.isspace()`."""
        return string.isspace()

    @staticmethod
    def is_title(string):
        """Return `orth.is_title(string)`."""
        return orth.is_title(string)

    @staticmethod
    def is_upper(string):
        """Return `orth.is_upper(string)`."""
        return orth.is_upper(string)

    @staticmethod
    def like_url(string):
        """Return `orth.like_url(string)`."""
        return orth.like_url(string)

    @staticmethod
    def like_number(string):
        """Return `orth.like_number(string)`."""
        return orth.like_number(string)

    @staticmethod
    def like_email(string):
        """Return `orth.like_email(string)`."""
        return orth.like_email(string)

    @classmethod
    def default_lex_attrs(cls, data_dir=None):
        """Return a dict mapping attribute IDs to one-argument string functions.

        Args:
            data_dir: Accepted for interface symmetry; not used here.

        Returns:
            dict: Keys are constants from the `attrs` module, values are
                callables taking a string.
        """
        return {
            attrs.LOWER: cls.lower,
            attrs.NORM: cls.norm,
            attrs.SHAPE: cls.shape,
            attrs.PREFIX: cls.prefix,
            attrs.SUFFIX: cls.suffix,
            attrs.CLUSTER: cls.cluster,
            # NOTE(review): this constant (-10.0) differs from cls.prob (-30),
            # and cls.prob is not used here. Kept as-is; confirm which value
            # is intended.
            attrs.PROB: lambda string: -10.0,

            attrs.IS_ALPHA: cls.is_alpha,
            attrs.IS_ASCII: cls.is_ascii,
            attrs.IS_DIGIT: cls.is_digit,
            attrs.IS_LOWER: cls.is_lower,
            attrs.IS_PUNCT: cls.is_punct,
            attrs.IS_SPACE: cls.is_space,
            attrs.IS_TITLE: cls.is_title,
            attrs.IS_UPPER: cls.is_upper,
            attrs.LIKE_URL: cls.like_url,
            attrs.LIKE_NUM: cls.like_number,
            attrs.LIKE_EMAIL: cls.like_email,
            attrs.IS_STOP: lambda string: False,
            attrs.IS_OOV: lambda string: True
        }

    @classmethod
    def default_dep_labels(cls):
        """Return the default dependency label table: ``{0: {'ROOT': True}}``."""
        return {0: {'ROOT': True}}

    @classmethod
    def default_ner_labels(cls):
        """Return the default entity label table (PER, LOC, ORG, MISC under key 0)."""
        return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}

    @classmethod
    def default_data_dir(cls):
        """Return the path of the ``data`` directory next to this module file."""
        return path.join(path.dirname(__file__), 'data')

    @classmethod
    def default_morphology(cls, data_dir):
        """Return `Morphology.from_dir(data_dir)`."""
        return Morphology.from_dir(data_dir)

    @classmethod
    def default_vectors(cls, data_dir):
        """Return the default word vectors; this implementation returns None."""
        return None

    @classmethod
    def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None):
        """Load a `Vocab` from the ``vocab`` sub-directory of `data_dir`.

        Args:
            data_dir: Root data directory; defaults to `default_data_dir()`.
            get_lex_attr: Attribute-function dict; defaults to
                `default_lex_attrs(data_dir)`.
            vectors: Passed through to `Vocab.from_dir`; defaults to
                `default_vectors(data_dir)`.
            morphology: Accepted but currently not used by this method.

        Returns:
            The result of `Vocab.from_dir`.
        """
        if data_dir is None:
            data_dir = cls.default_data_dir()
        if vectors is None:
            vectors = cls.default_vectors(data_dir)
        if get_lex_attr is None:
            get_lex_attr = cls.default_lex_attrs(data_dir)
        return Vocab.from_dir(
                path.join(data_dir, 'vocab'),
                get_lex_attr=get_lex_attr,
                vectors=vectors)

    @classmethod
    def default_tokenizer(cls, vocab, data_dir):
        """Load a `Tokenizer` from `data_dir`, or build a bare one if it is missing."""
        if path.exists(data_dir):
            return Tokenizer.from_dir(vocab, data_dir)
        else:
            return Tokenizer(vocab, {}, None, None, None)

    @classmethod
    def default_tagger(cls, vocab, data_dir):
        """Load a `Tagger` from `data_dir`; return None if the path is missing."""
        if path.exists(data_dir):
            return Tagger.from_dir(data_dir, vocab)
        else:
            return None

    @classmethod
    def default_parser(cls, vocab, data_dir):
        """Load an `ArcEager` `Parser` from `data_dir`; None if the path is missing."""
        if path.exists(data_dir):
            return Parser.from_dir(data_dir, vocab.strings, ArcEager)
        else:
            return None

    @classmethod
    def default_entity(cls, vocab, data_dir):
        """Load a `BiluoPushDown` `Parser` from `data_dir`; None if the path is missing."""
        if path.exists(data_dir):
            return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
        else:
            return None

    @classmethod
    def default_matcher(cls, vocab, data_dir):
        """Load a `Matcher` from `data_dir`; return None if the path is missing."""
        if path.exists(data_dir):
            return Matcher.from_dir(data_dir, vocab)
        else:
            return None

    def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                 parser=None, entity=None, matcher=None, serializer=None):
        """Assemble the pipeline, building any component that is not supplied.

        Args:
            data_dir: Root data directory; defaults to `default_data_dir()`.
            vocab: Vocabulary; built with `default_vocab` when None.
            tokenizer: Built from ``<data_dir>/tokenizer`` when None.
            tagger: Built from ``<data_dir>/pos`` when None.
            parser: Built from ``<data_dir>/deps`` when None.
            entity: Built from ``<data_dir>/ner`` when None.
            matcher: Built from `data_dir` itself when None.
            serializer: Accepted but currently not used.
        """
        if data_dir is None:
            data_dir = self.default_data_dir()
        if vocab is None:
            vocab = self.default_vocab(data_dir)
        if tokenizer is None:
            tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
        if tagger is None:
            tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
        if entity is None:
            entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
        if parser is None:
            parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
        if matcher is None:
            matcher = self.default_matcher(vocab, data_dir=data_dir)
        # Remember the resolved directory: end_training() falls back to it
        # when called without an explicit data_dir.
        self.data_dir = data_dir
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tagger = tagger
        self.parser = parser
        self.entity = entity
        self.matcher = matcher

    def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
        """Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbitrary whitespace.  Alignment into the original string
        is preserved.

        Components run in the order tagger, matcher, parser, entity
        recogniser; a component is skipped when it is falsy or its flag is off.

        Args:
            text (unicode): The text to be processed.
            tag (bool): Whether to run the tagger.
            parse (bool): Whether to run the parser.
            entity (bool): Whether to run the matcher and the entity recogniser.
            merge_mwes (bool): Accepted but currently not used.

        Returns:
            tokens (spacy.tokens.Doc):

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        tokens = self.tokenizer(text)
        if self.tagger and tag:
            self.tagger(tokens)
        # The matcher is gated by the same `entity` flag as the recogniser.
        if self.matcher and entity:
            self.matcher(tokens)
        if self.parser and parse:
            self.parser(tokens)
        if self.entity and entity:
            self.entity(tokens)
        return tokens

    def end_training(self, data_dir=None):
        """Finish training and write the models and vocab files under `data_dir`.

        Calls `model.end_training` on the parser, entity recogniser and tagger
        with paths under ``deps``, ``ner`` and ``pos``, dumps the string store
        to ``vocab/strings.txt`` and writes the attribute frequency tables to
        ``vocab/serializer.json``.

        The tagger, parser and entity components must all be present (not
        None), since each one is accessed unconditionally.

        Args:
            data_dir: Output root; defaults to the directory stored on the
                instance by `__init__`.
        """
        if data_dir is None:
            data_dir = self.data_dir
        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))

        with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
            file_.write(
                json.dumps([
                    (TAG, list(self.tagger.freqs[TAG].items())),
                    (DEP, list(self.parser.moves.freqs[DEP].items())),
                    (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())),
                    (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())),
                    (HEAD, list(self.parser.moves.freqs[HEAD].items()))]))