mirror of https://github.com/explosion/spaCy.git
* Add file to read ENAMEX ner data
This commit is contained in:
parent
ef1333cf89
commit
6a1c91675e
|
@ -0,0 +1,113 @@
|
|||
import os
|
||||
from os import path
|
||||
import re
|
||||
|
||||
|
||||
def split(text):
    """Return the per-sentence annotation strings of an annotation file.

    The text is stripped of surrounding whitespace and cut on newlines. The
    first and the last line are discarded; every line in between is returned
    unchanged as one sentence's annotation string.
    """
    lines = text.strip().split('\n')
    # Drop the opening and closing lines that wrap the sentence lines.
    sentences = lines[1:-1]
    return sentences
|
||||
|
||||
|
||||
def parse(string, strip_bad_periods=False):
    """Convert one sentence's ENAMEX annotation string into tokens and tags.

    The string is first normalised with a fixed list of corrections, then
    split on whitespace. Each chunk contributes one token text (via
    ``_get_text``) and one tag string (via ``_get_tag``).

    Args:
        string: The annotated sentence, containing inline ``<ENAMEX ...>``
            markup.
        strip_bad_periods: Accepted for backward compatibility; this
            function does not read it.

    Returns:
        A ``(tokens, tags)`` pair of equal-length lists: the word strings
        and, for each word, the tag string returned by ``_get_tag``.

    Raises:
        Exception: Any error raised by ``_get_tag`` is re-raised after the
            normalised string has been printed for debugging.
    """
    # Arbitrary corrections to promote alignment, and ensure that entities
    # begin at a space. This allows us to treat entities as tokens, making it
    # easier to return the list of entities. The order matters: later
    # patterns are written against the output of earlier ones (for example
    # they expect '<ENAMEXTYPE' with the space already removed).
    corrections = (
        ('... .', '...'),
        ('U.S.</ENAMEX> .', 'U.S.</ENAMEX>'),
        ('Co.</ENAMEX> .', 'Co.</ENAMEX>'),
        ('U.S. .', 'U.S.'),
        ('<ENAMEX ', '<ENAMEX'),
        (' E_OFF="', 'E_OFF="'),
        (' S_OFF="', 'S_OFF="'),
        ('units</ENAMEX>-<ENAMEX', 'units</ENAMEX> - <ENAMEX'),
        ('<ENAMEXTYPE="PERSON"E_OFF="1">Paula</ENAMEX> Zahn', 'Paula Zahn'),
        ('<ENAMEXTYPE="CARDINAL"><ENAMEXTYPE="CARDINAL">little</ENAMEX> drain</ENAMEX>', 'little drain'),
    )
    for old, new in corrections:
        string = string.replace(old, new)

    tokens = []
    tags = []
    open_tag = None
    for substr in string.strip().split():
        substr = _fix_inner_entities(substr)
        tokens.append(_get_text(substr))
        try:
            tag, open_tag = _get_tag(substr, open_tag)
        except Exception:
            # Show the sentence that could not be tagged, then propagate the
            # original error unchanged.
            print(string)
            raise
        tags.append(tag)
    return tokens, tags
|
||||
|
||||
|
||||
tag_re = re.compile(r'<ENAMEXTYPE="[^"]+">')
|
||||
def _fix_inner_entities(substr):
|
||||
tags = tag_re.findall(substr)
|
||||
if '</ENAMEX' in substr and not substr.endswith('</ENAMEX'):
|
||||
substr = substr.replace('</ENAMEX>', '') + '</ENAMEX>'
|
||||
if tags:
|
||||
substr = tag_re.sub('', substr)
|
||||
return tags[0] + substr
|
||||
else:
|
||||
return substr
|
||||
|
||||
|
||||
def _get_tag(substr, tag):
|
||||
if substr.startswith('<'):
|
||||
tag = substr.split('"')[1]
|
||||
if substr.endswith('>'):
|
||||
return 'U-' + tag, None
|
||||
else:
|
||||
return 'B-%s' % tag, tag
|
||||
elif substr.endswith('>'):
|
||||
return 'L-' + tag, None
|
||||
elif tag is not None:
|
||||
return 'I-' + tag, tag
|
||||
else:
|
||||
return 'O', None
|
||||
|
||||
|
||||
def _get_text(substr):
    """Return the word text of one chunk with its entity markup removed.

    A leading tag (everything up to and including the first ``>``) and a
    trailing tag (everything from the first remaining ``<``) are cut away,
    and the remainder is passed through ``reform_string``.
    """
    text = substr
    if text.startswith('<'):
        # Discard the opening tag.
        text = text.split('>', 1)[1]
    if text.endswith('>'):
        # Discard the closing tag.
        text = text.partition('<')[0]
    return reform_string(text)
|
||||
|
||||
|
||||
def tags_to_entities(tags):
    """Collapse a per-token tag sequence into a list of entity spans.

    Tags are dispatched on their first character: ``B`` opens a span, ``I``
    continues it, ``L`` closes it, ``U`` is a single-token span, and ``O``
    (or the literal ``'-'``) lies outside any span. The label is the tag text
    after its two-character prefix.

    Args:
        tags: Sequence of tag strings, one per token.

    Returns:
        A list of ``(label, start, end)`` tuples, where ``start`` and ``end``
        are inclusive token indices.

    Raises:
        AssertionError: If an outside tag appears while a span is still open,
            or an ``I`` tag appears with no open span.
        ValueError: If a tag matches none of the known prefixes.
    """
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag.startswith('O') or tag == '-':
            # Compare against None explicitly: a span opened at index 0 is
            # falsy, so a truthiness test would not notice it.
            assert start is None, tags
            continue
        elif tag.startswith('I'):
            assert start is not None, tags
            continue
        if tag.startswith('U'):
            entities.append((tag[2:], i, i))
        elif tag.startswith('B'):
            start = i
        elif tag.startswith('L'):
            entities.append((tag[2:], start, i))
            start = None
        else:
            # Show the whole sequence to make the bad tag easy to locate.
            print(tags)
            raise ValueError(tag)
    return entities
|
||||
|
||||
|
||||
def reform_string(tok):
    """Rewrite the escaped forms in a token as the characters they stand for.

    The substitutions are applied one after another in the listed order, so
    a later rule also sees the output of an earlier one (for instance a
    backtick turned into ``'`` can then pair up into ``''``).
    """
    substitutions = (
        ("``", '"'),
        ("`", "'"),
        ("''", '"'),
        ('\\', ''),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
        ('-RRB-', ')'),
        ('-LRB-', '('),
        ("'T-", "'T"),
        ('-AMP-', '&'),
    )
    text = tok
    for escaped, plain in substitutions:
        text = text.replace(escaped, plain)
    return text
|
Loading…
Reference in New Issue