* Add file to read ENAMEX ner data

2015-05-27 17:36:23 +02:00 · 2015-05-27 17:36:23 +02:00 · 6a1c91675e
parent ef1333cf89
commit 6a1c91675e
1 changed files with 113 additions and 0 deletions
--- a/spacy/munge/read_ner.py
+++ b/spacy/munge/read_ner.py
@ -0,0 +1,113 @@
+import os
+from os import path
+import re
+
+
+def split(text):
+    """Cut an annotation file into its per-sentence annotation strings.
+
+    Leading and trailing whitespace is removed from the text, the remainder
+    is broken at newlines, and the first and last of the resulting lines are
+    dropped (presumably document wrapper lines -- confirm against the data).
+    Every line in between is returned as one string.
+    """
+    lines = text.strip().split('\n')
+    sentences = lines[1:-1]
+    return sentences
+    
+
+def parse(string, strip_bad_periods=False):
+    """Given a sentence's annotation string, return a list of word strings
+    and a parallel list of entity tags, one tag per word.
+
+    The tags are whatever _get_tag produces for each whitespace-delimited
+    chunk: 'B-', 'I-', 'L-' or 'U-' followed by the entity type, or 'O'.
+    They are not (start, end, label) triples; tags_to_entities() builds
+    triples from a tag list.
+
+    strip_bad_periods is accepted but is not read anywhere in this function.
+
+    If tagging a chunk raises, the corrected sentence string is printed and
+    the exception is re-raised unchanged.
+    """
+    # Arbitrary corrections to promote alignment, and ensure that entities
+    # begin at a space. This allows us to treat entities as tokens, making it
+    # easier to tag them. The order is significant: the last two patterns are
+    # written in the compacted '<ENAMEXTYPE=...' form that the earlier
+    # replacements produce.
+    corrections = (
+        ('... .', '...'),
+        ('U.S.</ENAMEX> .', 'U.S.</ENAMEX>'),
+        ('Co.</ENAMEX> .', 'Co.</ENAMEX>'),
+        ('U.S. .', 'U.S.'),
+        ('<ENAMEX ', '<ENAMEX'),
+        (' E_OFF="', 'E_OFF="'),
+        (' S_OFF="', 'S_OFF="'),
+        ('units</ENAMEX>-<ENAMEX', 'units</ENAMEX> - <ENAMEX'),
+        ('<ENAMEXTYPE="PERSON"E_OFF="1">Paula</ENAMEX> Zahn', 'Paula Zahn'),
+        ('<ENAMEXTYPE="CARDINAL"><ENAMEXTYPE="CARDINAL">little</ENAMEX> drain</ENAMEX>', 'little drain'),
+    )
+    for old, new in corrections:
+        string = string.replace(old, new)
+    tokens = []
+    tags = []
+    open_tag = None
+    for substr in string.strip().split():
+        substr = _fix_inner_entities(substr)
+        tokens.append(_get_text(substr))
+        try:
+            tag, open_tag = _get_tag(substr, open_tag)
+        except Exception:
+            # Show the offending sentence before propagating the error.
+            # print(...) with one argument behaves the same on Python 2 and 3.
+            print(string)
+            raise
+        tags.append(tag)
+    return tokens, tags
+
+
+tag_re = re.compile(r'<ENAMEXTYPE="[^"]+">')
+def _fix_inner_entities(substr):
+    """Move entity markup in one whitespace-delimited chunk to its edges.
+
+    If the chunk contains a closing tag, every '</ENAMEX>' is removed and a
+    single one is appended at the end. If the chunk contains opening tags
+    matching tag_re, they are all removed and only the first is put back, at
+    the front. A chunk with neither is returned unchanged.
+    """
+    # Opening tags are collected before the closing tags are rearranged.
+    opening_tags = tag_re.findall(substr)
+    has_closing = '</ENAMEX' in substr
+    # NOTE(review): the suffix tested here lacks the final '>', so this is
+    # true for any chunk that ends in a complete '</ENAMEX>' -- confirm that
+    # is intended.
+    if has_closing and not substr.endswith('</ENAMEX'):
+        substr = ''.join(substr.split('</ENAMEX>')) + '</ENAMEX>'
+    if not opening_tags:
+        return substr
+    return opening_tags[0] + tag_re.sub('', substr)
+
+
+def _get_tag(substr, tag):
+    """Return (tag string, open entity label) for one chunk.
+
+    substr is a chunk that may start with an opening tag and/or end with a
+    closing tag; tag is the label of the entity currently open, or None.
+    The first element of the result is 'U-', 'B-', 'L-' or 'I-' followed by
+    a label, or 'O'. The second is the label still open after this chunk,
+    or None when no entity is open.
+    """
+    opens_entity = substr.startswith('<')
+    closes_entity = substr.endswith('>')
+    if opens_entity:
+        # The label is the first double-quoted value in the opening tag.
+        label = substr.split('"')[1]
+        if closes_entity:
+            return 'U-' + label, None
+        return 'B-%s' % label, label
+    if closes_entity:
+        return 'L-' + tag, None
+    if tag is None:
+        return 'O', None
+    return 'I-' + tag, tag
+
+
+def _get_text(substr):
+    """Strip entity markup from one chunk and return the normalised word.
+
+    A chunk starting with '<' loses everything up to and including its first
+    '>'. A chunk ending with '>' is then cut at its first '<'. The remaining
+    text is passed through reform_string.
+    """
+    text = substr
+    if text.startswith('<'):
+        text = text.split('>', 1)[1]
+    if text.endswith('>'):
+        text = text.partition('<')[0]
+    return reform_string(text)
+
+
+def tags_to_entities(tags):
+    """Convert a sequence of per-token tags into entity triples.
+
+    Each returned triple is (label, start, end): label is the tag text after
+    its two-character prefix, and start and end are token indices, both
+    inclusive. A 'U' tag gives a single-token entity, a 'B' tag opens an
+    entity that the next 'L' tag closes, 'I' tags continue an open entity,
+    and 'O' or '-' tags lie outside any entity.
+
+    Raises AssertionError if an 'O' or '-' tag appears while an entity is
+    open, or an 'I' tag appears while none is. Prints the tags and raises
+    ValueError for a tag that starts with any other letter.
+    """
+    entities = []
+    start = None
+    for i, tag in enumerate(tags):
+        if tag.startswith('O') or tag == '-':
+            # Compare against None rather than truthiness: an entity opened
+            # at token 0 has start == 0, which is falsy.
+            assert start is None, tags
+        elif tag.startswith('I'):
+            assert start is not None, tags
+        elif tag.startswith('U'):
+            entities.append((tag[2:], i, i))
+        elif tag.startswith('B'):
+            start = i
+        elif tag.startswith('L'):
+            entities.append((tag[2:], start, i))
+            start = None
+        else:
+            print(tags)
+            # ValueError is a StandardError subclass on Python 2, so existing
+            # handlers still catch it, and it also exists on Python 3.
+            raise ValueError(tag)
+    return entities
+
+
+def reform_string(tok):
+    """Rewrite escaped punctuation in a token to its literal characters.
+
+    The substitutions are applied one after another in the order listed, so
+    a later rule sees the output of the earlier ones (for example, the
+    two-backtick rule runs before the single-backtick rule).
+    """
+    substitutions = (
+        ("``", '"'),
+        ("`", "'"),
+        ("''", '"'),
+        ('\\', ''),
+        ('-LCB-', '{'),
+        ('-RCB-', '}'),
+        ('-RRB-', ')'),
+        ('-LRB-', '('),
+        ("'T-", "'T"),
+        ('-AMP-', '&'),
+    )
+    for escaped, literal in substitutions:
+        tok = tok.replace(escaped, literal)
+    return tok