mirror of https://github.com/explosion/spaCy.git
* Prepare English class for NER
This commit is contained in:
parent f5830dc1c1
commit 220ce8bfed
@@ -7,6 +7,7 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..syntax.arc_eager import ArcEager
+from ..syntax.ner import BiluoPushDown
 from ..tokens import Tokens
 from .pos import EnPosTagger
 from .pos import POS_TAGS
@@ -58,6 +59,7 @@ class English(object):
     for later loading.
     """
     ParserTransitionSystem = ArcEager
+    EntityTransitionSystem = BiluoPushDown

     def __init__(self, data_dir=''):
         if data_dir == '':
@@ -74,6 +76,7 @@ class English(object):
             infix_re = None
             self.has_parser_model = False
             self.has_tagger_model = False
+            self.has_entity_model = False
         else:
             tok_data_dir = path.join(data_dir, 'tokenizer')
             tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
@@ -82,6 +85,7 @@ class English(object):
             infix_re = re.compile(infix_re)
             self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
             self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
+            self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))

         self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                    suffix_re, infix_re,
@@ -89,6 +93,7 @@ class English(object):
         # These are lazy-loaded
         self._tagger = None
         self._parser = None
+        self._entity = None

     @property
     def tagger(self):
@@ -103,7 +108,15 @@ class English(object):
                                         self.ParserTransitionSystem)
         return self._parser

-    def __call__(self, text, tag=True, parse=parse_if_model_present):
+    @property
+    def entity(self):
+        if self._entity is None:
+            self._entity = GreedyParser(path.join(self._data_dir, 'ner'),
+                                        self.EntityTransitionSystem)
+        return self._entity
+
+    def __call__(self, text, tag=True, parse=parse_if_model_present,
+                 entity=parse_if_model_present):
         """Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbtrary whitespace. Alignment into the original string
@@ -135,21 +148,39 @@ class English(object):
             msg = ("Incompatible arguments: tag=False, parse=True"
                    "Part-of-speech tags are required for parsing.")
             raise ValueError(msg)
+        if entity == True and tag == False:
+            msg = ("Incompatible arguments: tag=False, entity=True"
+                   "Part-of-speech tags are required for entity recognition.")
+            raise ValueError(msg)
+
         tokens = self.tokenizer(text)
         if parse == -1 and tag == False:
             parse = False
         elif parse == -1 and not self.has_parser_model:
             parse = False
+        if entity == -1 and tag == False:
+            entity = False
+        elif entity == -1 and not self.has_entity_model:
+            entity = False
         if tag and self.has_tagger_model:
             self.tagger(tokens)
         if parse == True and not self.has_parser_model:
-            msg = ("Receive parse=True, but parser model not found.\n\n"
+            msg = ("Received parse=True, but parser model not found.\n\n"
                    "Run:\n"
                    "$ python -m spacy.en.download\n"
                    "To install the model.")
             raise IOError(msg)
+        if entity == True and not self.has_entity_model:
+            msg = ("Received entity=True, but entity model not found.\n\n"
+                   "Run:\n"
+                   "$ python -m spacy.en.download\n"
+                   "To install the model.")
+            raise IOError(msg)
+
         if parse and self.has_parser_model:
             self.parser(tokens)
+        if entity and self.has_entity_model:
+            self.entity(tokens)
         return tokens

     @property
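A minimal usage sketch of the English class after this change. The data path and sample sentence below are placeholders, and the sketch assumes the models (including the new 'ner' directory) have been installed, e.g. via python -m spacy.en.download; with the default data_dir='' this version leaves all has_*_model flags False.

    from spacy.en import English

    # Hypothetical path: point data_dir at wherever the downloaded models live.
    nlp = English(data_dir='/path/to/spacy/en/data')

    # True only when a 'ner' subdirectory exists under the data directory.
    print(nlp.has_entity_model)

    # parse and entity default to parse_if_model_present (-1), so the entity
    # recognizer runs only when its model is installed. Passing entity=True
    # explicitly raises IOError if the model is missing, and entity=True with
    # tag=False raises ValueError.
    tokens = nlp(u'Apple is looking at buying a U.K. startup.')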