spaCy/spacy/munge/read_ner.py

117 lines
3.6 KiB
Python
Raw Normal View History

2015-06-07 22:35:04 +00:00
from __future__ import unicode_literals
2015-05-27 15:36:23 +00:00
import os
from os import path
import re
def split(text):
    """Break the contents of an annotation file into per-sentence strings.

    Leading and trailing whitespace is stripped and the text is cut at
    newlines, so every remaining line is one sentence's annotation. The
    first and the last line are discarded (presumably the document's opening
    and closing wrapper lines -- confirm against the corpus format).
    """
    lines = text.strip().split('\n')
    return lines[1:-1]
def parse(string, strip_bad_periods=False):
    """Convert one sentence's annotation string into tokens and entity tags.

    The string is patched with a few corpus-specific corrections, split on
    whitespace, and each resulting chunk is turned into one token and one tag.

    string: The annotated sentence, with entities marked up as
        <ENAMEX TYPE="...">...</ENAMEX>.
    strip_bad_periods: Currently unused; kept so existing callers that pass
        it keep working.

    Returns a (tokens, tags) pair of equal-length lists. tokens holds the
    word strings with the markup removed. tags holds one tag per token: 'O'
    for tokens outside any entity, otherwise the entity type prefixed with
    'B-' (first token), 'I-' (inner token), 'L-' (last token) or 'U-'
    (single-token entity).

    Any exception raised by the helpers on malformed markup propagates to
    the caller unchanged.
    """
    # Arbitrary corrections to promote alignment, and ensure that entities
    # begin at a space. This allows us to treat entities as tokens, making it
    # easier to return the list of entities. They are applied in order,
    # because later patterns rely on the earlier rewrites (e.g. the
    # '<ENAMEXTYPE=' patterns only exist once '<ENAMEX ' has been collapsed).
    corrections = (
        ('... .', '...'),
        ('U.S.</ENAMEX> .', 'U.S.</ENAMEX>'),
        ('Co.</ENAMEX> .', 'Co.</ENAMEX>'),
        ('U.S. .', 'U.S.'),
        ('<ENAMEX ', '<ENAMEX'),
        (' E_OFF="', 'E_OFF="'),
        (' S_OFF="', 'S_OFF="'),
        ('units</ENAMEX>-<ENAMEX', 'units</ENAMEX> - <ENAMEX'),
        ('<ENAMEXTYPE="PERSON"E_OFF="1">Paula</ENAMEX> Zahn', 'Paula Zahn'),
        ('<ENAMEXTYPE="CARDINAL"><ENAMEXTYPE="CARDINAL">little</ENAMEX> drain</ENAMEX>',
         'little drain'),
    )
    for old, new in corrections:
        string = string.replace(old, new)

    tokens = []
    tags = []
    # Entity type of the span currently being read, or None outside a span.
    open_tag = None
    for substr in string.strip().split():
        substr = _fix_inner_entities(substr)
        tokens.append(_get_text(substr))
        tag, open_tag = _get_tag(substr, open_tag)
        tags.append(tag)
    return tokens, tags
tag_re = re.compile(r'<ENAMEXTYPE="[^"]+">')
def _fix_inner_entities(substr):
tags = tag_re.findall(substr)
if '</ENAMEX' in substr and not substr.endswith('</ENAMEX'):
substr = substr.replace('</ENAMEX>', '') + '</ENAMEX>'
if tags:
substr = tag_re.sub('', substr)
return tags[0] + substr
else:
return substr
def _get_tag(substr, tag):
if substr.startswith('<'):
tag = substr.split('"')[1]
if substr.endswith('>'):
return 'U-' + tag, None
else:
return 'B-%s' % tag, tag
elif substr.endswith('>'):
return 'L-' + tag, None
elif tag is not None:
return 'I-' + tag, tag
else:
return 'O', None
def _get_text(substr):
    """Return the plain word held in one chunk of the annotation string.

    A leading opening tag (everything up to the first '>') and a trailing
    closing tag (everything from the first remaining '<') are removed, and
    the result is passed through reform_string().
    """
    text = substr
    if text.startswith('<'):
        # Keep what follows the first '>'.
        text = text.split('>', 1)[1]
    if text.endswith('>'):
        # Keep what precedes the first '<'.
        text = text.partition('<')[0]
    return reform_string(text)
def tags_to_entities(tags):
    """Collect entity spans from a sequence of per-token tags.

    tags: Sequence of tag strings: 'O', '-', or an entity type prefixed with
        'B-', 'I-', 'L-' or 'U-'.

    Returns a list of (label, start, end) triples, where start and end are
    token indices and end is inclusive.

    Fails an assertion when an 'I' tag appears with no open span, and raises
    Exception(tag) for a tag that matches none of the known forms.
    """
    spans = []
    span_start = None
    for idx, tag in enumerate(tags):
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            # An 'O' inside an unfinished span silently abandons that span.
            span_start = None
        elif tag == '-':
            # Tokens tagged '-' are skipped and do not affect an open span.
            pass
        elif tag.startswith('I'):
            assert span_start is not None, tags[:idx]
        elif tag.startswith('U'):
            spans.append((tag[2:], idx, idx))
        elif tag.startswith('B'):
            span_start = idx
        elif tag.startswith('L'):
            spans.append((tag[2:], span_start, idx))
            span_start = None
        else:
            raise Exception(tag)
    return spans
def reform_string(tok):
    """Undo the corpus' escape conventions in a single token.

    Backtick and doubled-apostrophe quotes become plain quote characters,
    backslashes are dropped, and bracket / ampersand placeholders such as
    '-LRB-' or '-AMP-' are replaced by the characters they stand for.
    """
    # Applied strictly in this order: the quote rules feed into each other
    # (a lone backtick becomes an apostrophe before doubled apostrophes are
    # turned into a double quote).
    substitutions = (
        ("``", '"'),
        ("`", "'"),
        ("''", '"'),
        ('\\', ''),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
        ('-RRB-', ')'),
        ('-LRB-', '('),
        ("'T-", "'T"),
        ('-AMP-', '&'),
    )
    for escaped, plain in substitutions:
        tok = tok.replace(escaped, plain)
    return tok