* Add simple ner_tag script

2015-04-03 17:26:58 +02:00 · 2015-04-03 17:26:58 +02:00 · f26f381b0e
parent bb27979352
commit f26f381b0e
1 changed files with 34 additions and 0 deletions
--- a/bin/ner_tag.py
+++ b/bin/ner_tag.py
@ -0,0 +1,34 @@
 import codecs
 import plac
 from spacy.en import English
 def main(text_loc):
    """Print the text in ``text_loc`` with inline named-entity tags.

    The file is read as UTF-8 and split into paragraphs on blank lines
    (``'\\n\\n'``).  Each paragraph is processed with ``English()`` and
    every span in ``tokens.ents`` is wrapped in ``<LABEL>`` / ``</LABEL>``
    markers.  Tokens and markers are joined with single spaces and each
    paragraph is printed followed by a blank-line separator.

    text_loc: path of the text file to tag.
    """
    with codecs.open(text_loc, 'r', 'utf8') as file_:
        text = file_.read()
    NLU = English()
    for paragraph in text.split('\n\n'):
        tokens = NLU(paragraph)
        # Index entity labels by the token position where a span opens and
        # by the position just past its last token, so a single pass over
        # the tokens can emit opening and closing markers.
        ent_starts = {}
        ent_ends = {}
        for span in tokens.ents:
            ent_starts[span.start] = span.label_
            ent_ends[span.end] = span.label_
        output = []
        for token in tokens:
            if token.i in ent_starts:
                output.append('<%s>' % ent_starts[token.i])
            output.append(token.orth_)
            # span.end is looked up as token.i + 1, i.e. it is treated as
            # one past the last token of the entity.
            if (token.i+1) in ent_ends:
                output.append('</%s>' % ent_ends[token.i+1])
        output.append('\n\n')
        # Print inside the paragraph loop: ``output`` is rebuilt for every
        # paragraph, so printing after the loop would emit only the last
        # paragraph and silently drop all the others.
        print(' '.join(output))
 # Script entry point: only runs when the file is executed directly, not
 # when it is imported.  plac.call is handed ``main`` and invokes it;
 # presumably ``text_loc`` is filled from the command line -- see the plac
 # documentation for how arguments are derived from the signature.
 if __name__ == '__main__':
    plac.call(main)