# spaCy/spacy/ner/greedy_parser.pyx

from __future__ import division
from __future__ import unicode_literals

cimport cython
import random
import os
from os import path
import shutil
import json

from thinc.features cimport ConjFeat

from .context cimport fill_context
from .context cimport N_FIELDS
from .structs cimport Move, State
from .io_moves cimport fill_moves, transition, best_accepted
from .io_moves cimport set_accept_if_valid, set_accept_if_oracle
from .io_moves import get_n_moves
from ._state cimport init_state
from ._state cimport entity_is_open
from ._state cimport end_entity
from .annot cimport NERAnnotation


def setup_model_dir(entity_types, templates, model_dir):
    """Create a fresh model directory containing only ``config.json``.

    If ``model_dir`` already exists it is removed recursively first, so any
    previously saved files in it are lost. The configuration written here is
    what ``NERParser.__init__`` reads back (keys ``'templates'`` and
    ``'entity_types'``).

    Args:
        entity_types: Entity type labels; stored under ``'entity_types'``.
            Must be JSON-serialisable.
        templates: Feature templates; stored under ``'templates'``.
            Must be JSON-serialisable.
        model_dir: Path of the directory to (re)create. Missing parent
            directories are created as well.
    """
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    # makedirs (not mkdir) so that a nested model_dir works even when its
    # parent directories do not exist yet.
    os.makedirs(model_dir)
    config = {
        'templates': templates,
        'entity_types': entity_types,
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)


def train(train_sents, model_dir, nr_iter=10):
    """Train an NERParser for ``nr_iter`` passes and save its model.

    After every pass the entity-level precision, recall and F-score of the
    parser's predictions on the training data are printed on one line, and
    ``train_sents`` is shuffled before the next pass.

    Args:
        train_sents: List of ``(Tokens, NERAnnotation)`` pairs. It is
            shuffled in place, so the caller's list order is modified.
        model_dir: Directory holding ``config.json`` (see
            ``setup_model_dir``); the trained model is written to
            ``<model_dir>/model``.
        nr_iter: Number of passes over ``train_sents``.
    """
    cdef Tokens tokens
    cdef NERAnnotation gold_ner
    parser = NERParser(model_dir)
    for _ in range(nr_iter):
        tp = 0
        fp = 0
        fn = 0
        for tokens, gold_ner in train_sents:
            test_ents = set(parser.train(tokens, gold_ner))
            gold_ents = set(gold_ner.entities)
            tp += len(gold_ents.intersection(test_ents))
            fp += len(test_ents - gold_ents)
            fn += len(gold_ents - test_ents)
        # Any of these denominators can be zero (no predictions, no gold
        # entities, or no true positives, e.g. in an early pass); report 0.0
        # instead of raising ZeroDivisionError and aborting training.
        p = tp / (tp + fp) if (tp + fp) else 0.0
        r = tp / (tp + fn) if (tp + fn) else 0.0
        f = 2 * ((p * r) / (p + r)) if (p + r) else 0.0
        print 'P: %.3f' % p,
        print 'R: %.3f' % r,
        print 'F: %.3f' % f
        random.shuffle(train_sents)
    parser.model.end_training()
    parser.model.dump(path.join(model_dir, 'model'))


cdef class NERParser:
    """Greedy, transition-based named-entity parser backed by a linear model.

    For each parse state the parser fills a context, extracts features,
    scores every move class with ``self.model`` and applies one move via
    ``transition`` until all tokens are consumed.
    """
    def __init__(self, model_dir):
        """Build the parser from ``<model_dir>/config.json``.

        The config supplies the feature templates and entity types. If
        ``<model_dir>/model`` exists, its weights are loaded; otherwise the
        model starts untrained.

        Args:
            model_dir: Directory written by ``setup_model_dir``.
        """
        self.mem = Pool()
        # Use a context manager so the config file handle is closed promptly
        # rather than left for the garbage collector.
        with open(path.join(model_dir, 'config.json')) as file_:
            cfg = json.load(file_)
        templates = cfg['templates']
        self.extractor = Extractor(templates, [ConjFeat] * len(templates))
        self.entity_types = cfg['entity_types']
        self.n_classes = get_n_moves(len(self.entity_types))
        self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))
        fill_moves(self._moves, self.n_classes, self.entity_types)
        self.model = LinearModel(self.n_classes)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

        # Scratch buffers reused on every step of train() and set_tags().
        self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))

    cpdef list train(self, Tokens tokens, NERAnnotation annot):
        """Parse ``tokens`` once, updating the model wherever it errs.

        At each step the best move accepted as valid (the guess) is compared
        with the best move accepted by the oracle for ``annot``. When their
        classes differ, the features are counted +1 for the oracle class and
        -1 for the guessed class and passed to ``self.model.update``. The
        state is then advanced with the guess, not the oracle move.

        Args:
            tokens: Document to parse; ``tokens.ner`` is overwritten with the
                tags produced during this parse.
            annot: Gold annotation providing ``starts``, ``ends`` and
                ``labels``.

        Returns:
            A list of ``(start, end, label)`` tuples, one per entity recorded
            in the final state.
        """
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* guess
        cdef Move* oracle_move
        while s.i < tokens.length:
            fill_context(self._context, s, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)

            # NOTE(review): both set_accept_* calls operate on the shared
            # self._moves array, so the guess has to be selected before the
            # oracle pass runs -- presumably they overwrite per-move accept
            # flags; confirm against io_moves.
            set_accept_if_valid(self._moves, self.n_classes, s)
            guess = best_accepted(self._moves, self._scores, self.n_classes)
            assert guess.clas != 0
            set_accept_if_oracle(self._moves, self.n_classes, s,
                                 annot.starts, annot.ends, annot.labels)
            oracle_move = best_accepted(self._moves, self._scores, self.n_classes)
            assert oracle_move.clas != 0
            if guess.clas == oracle_move.clas:
                counts = {}
            else:
                counts = {guess.clas: {}, oracle_move.clas: {}}
                self.extractor.count(counts[oracle_move.clas], self._feats, 1)
                self.extractor.count(counts[guess.clas], self._feats, -1)
            # update() is called even with empty counts on a correct guess.
            # NOTE(review): presumably the model needs to see every step
            # (e.g. for averaging); confirm against LinearModel.
            self.model.update(counts)
            # Follow the model's own prediction rather than the oracle move.
            transition(s, guess)
            # Record the tag of the token just processed (assumes
            # transition() advanced s.i past it).
            tokens.ner[s.i-1] = s.tags[s.i-1]
        if entity_is_open(s):
            # Input ended inside an entity: label it from the gold annotation
            # at the entity's start position, then close it.
            s.curr.label = annot.labels[s.curr.start]
            end_entity(s)
        entities = []
        for i in range(s.j):
            entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label))
        return entities

    cpdef list set_tags(self, Tokens tokens):
        """Parse ``tokens`` with the current model, without any learning.

        Args:
            tokens: Document to parse; ``tokens.ner`` is overwritten with the
                predicted tags.

        Returns:
            A list of ``(start, end, label)`` tuples, one per entity recorded
            in the final state.
        """
        cdef Pool mem = Pool()
        cdef State* s = init_state(mem, tokens.length)
        cdef Move* move
        while s.i < tokens.length:
            fill_context(self._context, s, tokens)
            self.extractor.extract(self._feats, self._values, self._context, NULL)
            self.model.score(self._scores, self._feats, self._values)
            set_accept_if_valid(self._moves, self.n_classes, s)
            move = best_accepted(self._moves, self._scores, self.n_classes)
            transition(s, move)
            # Record the tag of the token just processed (assumes
            # transition() advanced s.i past it).
            tokens.ner[s.i-1] = s.tags[s.i-1]
        if entity_is_open(s):
            # Input ended inside an entity: label it with the last move's
            # label, then close it.
            # NOTE(review): `move` is only assigned inside the loop, so this
            # relies on entity_is_open(s) being false for an empty document
            # -- confirm.
            s.curr.label = move.label
            end_entity(s)
        entities = []
        for i in range(s.j):
            entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label))
        return entities
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`from __future__ import division`
			`from __future__ import unicode_literals`

* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`cimport cython`
			`import random`
			`import os`
			`from os import path`
			`import shutil`
			`import json`

			`from thinc.features cimport ConjFeat`

* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`from .context cimport fill_context`
			`from .context cimport N_FIELDS`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`from .structs cimport Move, State`
			`from .io_moves cimport fill_moves, transition, best_accepted`
			`from .io_moves cimport set_accept_if_valid, set_accept_if_oracle`
			`from .io_moves import get_n_moves`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`from ._state cimport init_state`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`from ._state cimport entity_is_open`
			`from ._state cimport end_entity`
			`from .annot cimport NERAnnotation`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00

* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`def setup_model_dir(entity_types, templates, model_dir):`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`if path.exists(model_dir):`
			`shutil.rmtree(model_dir)`
			`os.mkdir(model_dir)`
			`config = {`
			`'templates': templates,`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`'entity_types': entity_types,`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`}`
			`with open(path.join(model_dir, 'config.json'), 'w') as file_:`
			`json.dump(config, file_)`


			`def train(train_sents, model_dir, nr_iter=10):`
			`cdef Tokens tokens`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`cdef NERAnnotation gold_ner`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`parser = NERParser(model_dir)`
			`for _ in range(nr_iter):`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`tp = 0`
			`fp = 0`
			`fn = 0`
			`for i, (tokens, gold_ner) in enumerate(train_sents):`
			`#print [tokens[i].string for i in range(tokens.length)]`
			`test_ents = set(parser.train(tokens, gold_ner))`
			`#print 'Test', test_ents`
			`gold_ents = set(gold_ner.entities)`
			`#print 'Gold', set(gold_ner.entities)`
			`tp += len(gold_ents.intersection(test_ents))`
			`fp += len(test_ents - gold_ents)`
			`fn += len(gold_ents - test_ents)`
			`p = tp / (tp + fp)`
			`r = tp / (tp + fn)`
			`f = 2 * ((p * r) / (p + r))`
			`print 'P: %.3f' % p,`
			`print 'R: %.3f' % r,`
			`print 'F: %.3f' % f`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`random.shuffle(train_sents)`
			`parser.model.end_training()`
			`parser.model.dump(path.join(model_dir, 'model'))`


* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`cdef class NERParser:`
			`def __init__(self, model_dir):`
			`self.mem = Pool()`
			`cfg = json.load(open(path.join(model_dir, 'config.json')))`
			`templates = cfg['templates']`
			`self.extractor = Extractor(templates, [ConjFeat] * len(templates))`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`self.entity_types = cfg['entity_types']`
			`self.n_classes = get_n_moves(len(self.entity_types))`
			`self._moves = <Move*>self.mem.alloc(self.n_classes, sizeof(Move))`
			`fill_moves(self._moves, self.n_classes, self.entity_types)`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`self.model = LinearModel(self.n_classes)`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`if path.exists(path.join(model_dir, 'model')):`
			`self.model.load(path.join(model_dir, 'model'))`

			`self._context = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))`
			`self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))`
			`self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))`
			`self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))`

* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`cpdef list train(self, Tokens tokens, NERAnnotation annot):`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`cdef Pool mem = Pool()`
			`cdef State* s = init_state(mem, tokens.length)`
			`cdef Move* guess`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`cdef Move* oracle_move`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`n_correct = 0`
			`cdef int f = 0`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`while s.i < tokens.length:`
* Move moves to bilou_moves. Refactor context, returning to the simpler giant-enum style 2014-11-11 13:54:25 +00:00			`fill_context(self._context, s, tokens)`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`self.extractor.extract(self._feats, self._values, self._context, NULL)`
			`self.model.score(self._scores, self._feats, self._values)`

			`set_accept_if_valid(self._moves, self.n_classes, s)`
			`guess = best_accepted(self._moves, self._scores, self.n_classes)`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`assert guess.clas != 0`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`set_accept_if_oracle(self._moves, self.n_classes, s,`
			`annot.starts, annot.ends, annot.labels)`
			`oracle_move = best_accepted(self._moves, self._scores, self.n_classes)`
			`assert oracle_move.clas != 0`
			`if guess.clas == oracle_move.clas:`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`counts = {}`
			`n_correct += 1`
			`else:`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`counts = {guess.clas: {}, oracle_move.clas: {}}`
			`self.extractor.count(counts[oracle_move.clas], self._feats, 1)`
* Fixed greedy NER parsing. With static oracle, replicates accuracy from tagger. 2014-11-11 06:17:54 +00:00			`self.extractor.count(counts[guess.clas], self._feats, -1)`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`self.model.update(counts)`
* Use the dynamic oracle 'follow' strategy 2014-11-11 10:11:17 +00:00			`transition(s, guess)`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`tokens.ner[s.i-1] = s.tags[s.i-1]`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`if entity_is_open(s):`
			`s.curr.label = annot.labels[s.curr.start]`
			`end_entity(s)`
			`entities = []`
			`for i in range(s.j):`
			`entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label))`
			`return entities`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`cpdef list set_tags(self, Tokens tokens):`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`cdef Pool mem = Pool()`
			`cdef State* s = init_state(mem, tokens.length)`
			`cdef Move* move`
			`while s.i < tokens.length:`
* Move moves to bilou_moves. Refactor context, returning to the simpler giant-enum style 2014-11-11 13:54:25 +00:00			`fill_context(self._context, s, tokens)`
* Work on shift-reduce NER 2014-11-10 05:28:56 +00:00			`self.extractor.extract(self._feats, self._values, self._context, NULL)`
			`self.model.score(self._scores, self._feats, self._values)`
			`set_accept_if_valid(self._moves, self.n_classes, s)`
			`move = best_accepted(self._moves, self._scores, self.n_classes)`
			`transition(s, move)`
			`tokens.ner[s.i-1] = s.tags[s.i-1]`
* IOB-style parsing working. Accuracy down from BILOU, from 87-88 to 85-86 2014-11-12 12:21:09 +00:00			`if entity_is_open(s):`
			`s.curr.label = move.label`
			`end_entity(s)`
			`entities = []`
			`for i in range(s.j):`
			`entities.append((s.ents[i].start, s.ents[i].end, s.ents[i].label))`
			`return entities`