2014-12-21 20:25:43 +00:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
from os import path
|
2015-01-17 05:21:17 +00:00
|
|
|
import re
|
2014-12-21 20:25:43 +00:00
|
|
|
|
2015-01-11 23:26:22 +00:00
|
|
|
from .. import orth
|
2014-12-21 20:25:43 +00:00
|
|
|
from ..vocab import Vocab
|
|
|
|
from ..tokenizer import Tokenizer
|
2015-06-01 22:28:02 +00:00
|
|
|
from ..syntax.parser import Parser
|
2015-02-22 05:32:33 +00:00
|
|
|
from ..syntax.arc_eager import ArcEager
|
2015-03-08 23:04:00 +00:00
|
|
|
from ..syntax.ner import BiluoPushDown
|
2014-12-21 20:25:43 +00:00
|
|
|
from ..tokens import Tokens
|
2015-04-07 02:02:32 +00:00
|
|
|
from ..multi_words import RegexMerger
|
|
|
|
|
2014-12-21 20:25:43 +00:00
|
|
|
from .pos import EnPosTagger
|
2014-12-21 21:54:47 +00:00
|
|
|
from .pos import POS_TAGS
|
2014-12-21 20:25:43 +00:00
|
|
|
from .attrs import get_flags
|
2015-04-07 02:02:32 +00:00
|
|
|
from . import regexes
|
2014-12-21 20:25:43 +00:00
|
|
|
|
|
|
|
|
2015-01-17 05:21:17 +00:00
|
|
|
from ..util import read_lang_data
|
|
|
|
|
|
|
|
|
2014-12-21 20:25:43 +00:00
|
|
|
def get_lex_props(string):
    """Build the default lexical-attribute dictionary for a word form.

    Args:
        string (unicode): The orthographic form of the word.

    Returns:
        dict: The attributes 'flags', 'length', 'orth', 'lower', 'norm',
            'shape', 'prefix', 'suffix', 'cluster', 'prob' and 'sentiment'.
            'cluster', 'prob' and 'sentiment' are always 0 here, and 'norm'
            is the unmodified input string.
    """
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        # Slice instead of indexing so that an empty string yields '' rather
        # than raising IndexError (the suffix below is already slice-safe).
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': 0,
        'sentiment': 0,
    }
|
|
|
|
|
2014-12-21 20:25:43 +00:00
|
|
|
|
2015-01-11 23:26:22 +00:00
|
|
|
# Default data directory: the 'data' folder that sits next to this module.
# English.__init__ substitutes it when called with data_dir=''.
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
2014-12-21 20:25:43 +00:00
|
|
|
|
2015-01-26 15:45:21 +00:00
|
|
|
# Sentinel used as the default for the `parse` and `entity` arguments of
# English.__call__: it means "guess", i.e. run the component only when
# tagging is enabled and the corresponding model is available.
parse_if_model_present = -1
|
|
|
|
|
2015-01-14 13:33:16 +00:00
|
|
|
|
2014-12-21 20:25:43 +00:00
|
|
|
class English(object):
    """The English NLP pipeline.

    Example:

        Load data from default directory:

            >>> nlp = English()
            >>> nlp = English(data_dir=u'')

        Load data from specified directory:

            >>> nlp = English(data_dir=u'path/to/data_directory')

        Disable (and avoid loading) parts of the processing pipeline:

            >>> nlp = English(Vectors=False, Parser=False, Tagger=False, Entity=False)

        Start with nothing loaded:

            >>> nlp = English(data_dir=None)

    Keyword args:
        data_dir (unicode):
            A path to a directory from which to load the pipeline;
            or '', to load default; or None, to load nothing (the vocab is
            given no directory and the tokenizer gets empty rules; the other
            enabled components still receive None as their directory).

        Tokenizer (bool or callable):
            True to use the default tokenizer class; a false value to leave
            ``self.tokenizer`` as None; or a callable invoked as
            ``Tokenizer(vocab, rules, prefix_re, suffix_re, infix_re,
            POS_TAGS, tag_names)``.

        Vectors (bool or callable):
            Forwarded unchanged to ``Vocab`` as its ``vectors`` argument.

        Parser (bool or callable):
            True to use the default parser with ``ParserTransitionSystem``;
            a false value to leave ``self.parser`` as None; or a callable
            invoked as ``Parser(vocab.strings, data_dir)``.

        Tagger (bool or callable):
            True to use ``EnPosTagger``; a false value to leave
            ``self.tagger`` as None; or a callable invoked as
            ``Tagger(vocab.strings, data_dir)``.

        Entity (bool or callable):
            True to use the default parser with ``EntityTransitionSystem``;
            a false value to leave ``self.entity`` as None; or a callable
            invoked as ``Entity(vocab.strings, data_dir)``.

        Senser (bool or callable):
            True to use the default sense tagger; a false value to leave
            ``self.senser`` as None; or a callable invoked as
            ``Senser(vocab.strings, data_dir)``.

        load_vectors (bool):
            Deprecated. Passing False is equivalent to ``Vectors=False``.
    """
    ParserTransitionSystem = ArcEager
    EntityTransitionSystem = BiluoPushDown

    def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
                 Tagger=True, Entity=True, Senser=True, load_vectors=True):
        # The ``Tokenizer`` and ``Parser`` keyword arguments shadow the
        # module-level imports of the same names, so the default classes are
        # re-imported here under distinct aliases.
        from ..tokenizer import Tokenizer as DefaultTokenizer
        from ..syntax.parser import Parser as DefaultParser

        if data_dir == '':
            data_dir = LOCAL_DATA_DIR
        # TODO: Deprecation warning
        if load_vectors is False:
            # Must rebind the ``Vectors`` argument itself: it is what gets
            # forwarded to Vocab below.
            Vectors = False

        vocab_dir = path.join(data_dir, 'vocab') if data_dir else None
        self.vocab = Vocab(data_dir=vocab_dir, get_lex_props=get_lex_props,
                           vectors=Vectors)

        # Replace every ``True`` placeholder with the default factory.
        if Tokenizer is True:
            Tokenizer = DefaultTokenizer
        if Tagger is True:
            Tagger = EnPosTagger
        if Parser is True:
            # The transition system is bound as a default argument so each
            # factory keeps its own value; a shared closure variable would be
            # looked up at call time and both factories would see the last
            # one assigned.
            Parser = (lambda s, d, ts=self.ParserTransitionSystem:
                      DefaultParser(s, d, ts))
        if Entity is True:
            Entity = (lambda s, d, ts=self.EntityTransitionSystem:
                      DefaultParser(s, d, ts))
        if Senser is True:
            # NOTE(review): ``wsd`` is not imported anywhere in the visible
            # part of this file, so this default raises NameError until the
            # import is added; confirm the module's location.
            Senser = wsd.SuperSenseTagger

        self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
        self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
        self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
        self.senser = Senser(self.vocab.strings, data_dir) if Senser else None

        self._data_dir = data_dir

        # NOTE(review): the original built the tokenizer twice, first as
        # Tokenizer(self.vocab, data_dir) and then, unconditionally, with the
        # rule-based signature below, which overwrote the first result and
        # crashed when the tokenizer was disabled. Only the final, effective
        # construction is kept, and it now honours a disabled tokenizer.
        if Tokenizer:
            tag_names = sorted(POS_TAGS)
            if data_dir is None:
                tok_rules = {}
                prefix_re = None
                suffix_re = None
                infix_re = None
            else:
                tok_data_dir = path.join(data_dir, 'tokenizer')
                tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
                prefix_re = re.compile(prefix_re)
                suffix_re = re.compile(suffix_re)
                infix_re = re.compile(infix_re)
            self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                       suffix_re, infix_re,
                                       POS_TAGS, tag_names)
        else:
            self.tokenizer = None

        self.mwe_merger = RegexMerger([
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
            ('CD', 'TIME', regexes.TIME_RE),
            ('NNP', 'DATE', regexes.DAYS_RE),
            ('CD', 'MONEY', regexes.MONEY_RE)])

    # NOTE(review): __call__ reads the three flags below, but nothing in the
    # visible class defined them. They are provided here as "the component
    # was constructed"; confirm this matches the intended meaning.
    @property
    def has_tagger_model(self):
        """Whether a tagger component was constructed."""
        return self.tagger is not None

    @property
    def has_parser_model(self):
        """Whether a parser component was constructed."""
        return self.parser is not None

    @property
    def has_entity_model(self):
        """Whether an entity-recognition component was constructed."""
        return self.entity is not None

    def __call__(self, text, tag=True, parse=parse_if_model_present,
                 entity=parse_if_model_present, merge_mwes=False):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original
        string is expected to be preserved by the tokenizer (not verifiable
        from this module).

        The components applied are the ones constructed in ``__init__``.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text.  Also
                sets morphological analysis and lemmas.

            parse (True, False, -1): Whether to add labelled syntactic dependencies.

                -1 (default) is "guess": It will guess True if tag=True and the
                model has been installed.

            entity (True, False, -1): Whether to run the entity recogniser.
                -1 (default) is "guess", resolved the same way as ``parse``.

            merge_mwes (bool): Whether to run ``self.mwe_merger`` over the
                tokens after the other components.

        Returns:
            tokens (spacy.tokens.Tokens):

        Raises:
            ValueError: If ``parse`` or ``entity`` is True while ``tag`` is False.
            IOError: If ``parse`` or ``entity`` is True but the corresponding
                model is not available.

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        # ``== True`` / ``== False`` are deliberate: the -1 "guess" sentinel
        # is truthy, so a plain truthiness test would treat it as an explicit
        # request.
        if parse == True and tag == False:
            msg = ("Incompatible arguments: tag=False, parse=True. "
                   "Part-of-speech tags are required for parsing.")
            raise ValueError(msg)
        if entity == True and tag == False:
            msg = ("Incompatible arguments: tag=False, entity=True. "
                   "Part-of-speech tags are required for entity recognition.")
            raise ValueError(msg)

        tokens = self.tokenizer(text)

        # Resolve the "guess" sentinel: without tags or without a model the
        # component is switched off; otherwise the sentinel stays truthy.
        if parse == parse_if_model_present and (
                tag == False or not self.has_parser_model):
            parse = False
        if entity == parse_if_model_present and (
                tag == False or not self.has_entity_model):
            entity = False

        if tag and self.has_tagger_model:
            self.tagger(tokens)

        if parse == True and not self.has_parser_model:
            msg = ("Received parse=True, but parser model not found.\n\n"
                   "Run:\n"
                   "$ python -m spacy.en.download\n"
                   "To install the model.")
            raise IOError(msg)
        if entity == True and not self.has_entity_model:
            msg = ("Received entity=True, but entity model not found.\n\n"
                   "Run:\n"
                   "$ python -m spacy.en.download\n"
                   "To install the model.")
            raise IOError(msg)

        if parse and self.has_parser_model:
            self.parser(tokens)
        if entity and self.has_entity_model:
            self.entity(tokens)
        if merge_mwes and self.mwe_merger is not None:
            self.mwe_merger(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names."""
        return self.tagger.tag_names
|