mirror of https://github.com/explosion/spaCy.git
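* In prepare_treebank, move the NER annotations out of the separate per-paragraph 'entities' list and into the per-token descriptions (each token now carries a 'ner' field)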
This commit is contained in:
parent 61885aee76
commit 32ae2cdabe
|
@ -59,8 +59,9 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
|
||||||
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
|
_, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
|
||||||
if ner_sents is not None:
|
if ner_sents is not None:
|
||||||
_, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
|
_, ner = read_ner.parse(ner_sents[i], strip_bad_periods=True)
|
||||||
|
assert len(ner) == len(annot)
|
||||||
else:
|
else:
|
||||||
ner = None
|
ner = ['-' for _ in annot]
|
||||||
for token_id, token in enumerate(annot):
|
for token_id, token in enumerate(annot):
|
||||||
try:
|
try:
|
||||||
head = (token['head'] + offset) if token['head'] != -1 else -1
|
head = (token['head'] + offset) if token['head'] != -1 else -1
|
||||||
|
@ -69,16 +70,10 @@ def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
|
||||||
'orth': token['word'],
|
'orth': token['word'],
|
||||||
'tag': token['tag'],
|
'tag': token['tag'],
|
||||||
'head': head,
|
'head': head,
|
||||||
'dep': token['dep']})
|
'dep': token['dep'],
|
||||||
|
'ner': ner[token_id]})
|
||||||
except:
|
except:
|
||||||
raise
|
raise
|
||||||
if ner is not None:
|
|
||||||
for label, start, end in ner:
|
|
||||||
if start != end:
|
|
||||||
para['entities'].append({
|
|
||||||
'label': label,
|
|
||||||
'first': start + offset,
|
|
||||||
'last': (end-1) + offset})
|
|
||||||
for label, start, end in brackets:
|
for label, start, end in brackets:
|
||||||
if start != end:
|
if start != end:
|
||||||
para['brackets'].append({
|
para['brackets'].append({
|
||||||
|
|
Loading…
Reference in New Issue