spaCy/spacy/gold.pyx

from __future__ import unicode_literals, print_function

import numpy
import io
import json
import random
import re
import os
from os import path

from libc.string cimport memset

try:
    import ujson as json
except ImportError:
    import json

from .syntax import nonproj


def tags_to_entities(tags):
    """Collect entity spans from a sequence of BILUO-style tag strings.

    Tags are dispatched on their first character: 'B' opens an entity, 'L'
    closes the open one, 'U' is a single-token entity, 'I' continues an
    open entity and 'O' is outside any entity. The exact tag '-' is skipped.

    Returns a list of (label, start, end) tuples, where label is the tag
    text from index 2 onwards and start/end are token indices (end is the
    index of the closing token, i.e. inclusive).

    Raises AssertionError if an 'I' tag is seen while no entity is open,
    and Exception(tag) for a tag with any other prefix.
    """
    spans = []
    open_at = None
    for idx, tag in enumerate(tags):
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            # An 'O' tag discards an entity that was opened but never closed.
            open_at = None
        elif tag == '-':
            # Skipped entirely; an open entity stays open.
            pass
        elif tag.startswith('I'):
            assert open_at is not None, tags[:idx]
        elif tag.startswith('U'):
            spans.append((tag[2:], idx, idx))
        elif tag.startswith('B'):
            open_at = idx
        elif tag.startswith('L'):
            spans.append((tag[2:], open_at, idx))
            open_at = None
        else:
            raise Exception(tag)
    return spans


def align(cand_words, gold_words):
    """Map each candidate word to the index of its matching gold word.

    The edit path from _min_edit_path is replayed one move at a time:
      'M' (match)        -> the current gold index is recorded
      'S' (substitution) -> None is recorded, the gold index advances
      'D' (deletion)     -> None is recorded, the gold index stays put
      'I' (insertion)    -> nothing is recorded, the gold index advances

    Returns a list holding a gold index or None for every 'M', 'S' and 'D'
    move. Raises Exception(move) on any other move character.
    """
    _, moves = _min_edit_path(cand_words, gold_words)
    mapping = []
    gold_idx = 0
    for op in moves:
        if op == 'I':
            # Gold word with no candidate counterpart: skip over it.
            gold_idx += 1
            continue
        if op == 'M':
            mapping.append(gold_idx)
        elif op == 'S' or op == 'D':
            mapping.append(None)
        else:
            raise Exception(op)
        if op != 'D':
            gold_idx += 1
    return mapping


# Matches any single non-word character; used to strip punctuation from
# tokens before they are compared.
punct_re = re.compile(r'\W')


def _min_edit_path(cand_words, gold_words):
    """Find a minimum-cost edit path turning cand_words into gold_words.

    Both word lists first have every non-word character removed. Costs are
    computed on lower-cased words, so words differing only in case cost
    nothing to pair up; however the move is only recorded as 'M' when the
    stripped words are exactly equal, otherwise it is recorded as 'S'.

    Returns a (cost, path) tuple. path is a string with one character per
    move: 'M' (match), 'S' (substitute), 'I' (gold word with no candidate
    counterpart) or 'D' (candidate word with no gold counterpart). On equal
    cost, M/S is preferred over I, and I over D.
    """
    cdef:
        Pool mem
        int i, j, n_gold
        int* curr_costs
        int* prev_costs

    # TODO: Fix this --- just do it properly, make the full edit matrix and
    # then walk back over it...
    # Preprocess inputs
    cand_words = [punct_re.sub('', w) for w in cand_words]
    gold_words = [punct_re.sub('', w) for w in gold_words]

    # Fast path: identical after stripping, so every move is a match.
    if cand_words == gold_words:
        return 0, 'M' * len(gold_words)
    mem = Pool()
    n_gold = len(gold_words)
    # Levenshtein distance, except we need the history, and we may want
    # different costs. Only two rows are kept: costs live in the C arrays,
    # and the move history of each cell is kept as a string in the
    # parallel Python lists.
    previous_row = []
    prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
    # Row 0: reaching gold position i from an empty candidate prefix takes
    # i insertions.
    for i in range(n_gold + 1):
        previous_row.append('I' * i)
        prev_costs[i] = i
    for i, cand in enumerate(cand_words):
        # Column 0: an empty gold prefix means deleting every candidate word
        # seen so far.
        current_row = ['D' * (i + 1)]
        curr_costs[0] = i+1
        for j, gold in enumerate(gold_words):
            i_cost = curr_costs[j] + 1
            if gold.lower() == cand.lower():
                s_cost = prev_costs[j]
                d_cost = prev_costs[j + 1] + 1
            else:
                s_cost = prev_costs[j] + 1
                # A candidate word that became empty after stripping
                # (i.e. it was all non-word characters) is free to delete.
                d_cost = prev_costs[j + 1] + (1 if cand else 0)

            if s_cost <= i_cost and s_cost <= d_cost:
                best_cost = s_cost
                best_hist = previous_row[j] + ('M' if gold == cand else 'S')
            elif i_cost <= s_cost and i_cost <= d_cost:
                best_cost = i_cost
                best_hist = current_row[j] + 'I'
            else:
                best_cost = d_cost
                best_hist = previous_row[j + 1] + 'D'

            current_row.append(best_hist)
            curr_costs[j+1] = best_cost
        # Roll the finished row into "previous" and clear the scratch row.
        previous_row = current_row
        for j in range(n_gold + 1):
            prev_costs[j] = curr_costs[j]
            curr_costs[j] = 0

    return prev_costs[n_gold], previous_row[-1]


def read_json_file(loc, docs_filter=None):
    """Yield (raw_text, sentences) pairs from a JSON training file or directory.

    loc: Path to a JSON file, or to a directory whose entries are read
        recursively with the same docs_filter.
    docs_filter: Optional callable taking a document dict; documents for
        which it returns a false value are skipped.

    One pair is yielded per paragraph that has at least one sentence.
    raw_text is the paragraph's 'raw' value (None if absent). sentences is a
    list of ((ids, words, tags, heads, labels, ner), brackets) tuples, with
    one list entry per token in each of the six inner lists.

    Missing token fields default to: tag '-', head offset 0, dep '', ner '-'.
    Only 'orth' is required (KeyError otherwise).
    """
    if path.isdir(loc):
        for filename in os.listdir(loc):
            # Forward the filter so it also applies to files in directories.
            yield from read_json_file(path.join(loc, filename), docs_filter)
    else:
        # Decode explicitly as UTF-8 rather than relying on the platform
        # default encoding.
        with io.open(loc, 'r', encoding='utf8') as file_:
            docs = json.load(file_)
        for doc in docs:
            if docs_filter is not None and not docs_filter(doc):
                continue
            for paragraph in doc['paragraphs']:
                sents = []
                for sent in paragraph['sentences']:
                    words = []
                    ids = []
                    tags = []
                    heads = []
                    labels = []
                    ner = []
                    for i, token in enumerate(sent['tokens']):
                        words.append(token['orth'])
                        ids.append(i)
                        tags.append(token.get('tag', '-'))
                        # 'head' is stored as an offset from the token's own
                        # position; convert it to an index in the sentence.
                        heads.append(token.get('head', 0) + i)
                        dep = token.get('dep', '')
                        # Ensure ROOT label is case-insensitive
                        labels.append('ROOT' if dep.lower() == 'root' else dep)
                        ner.append(token.get('ner', '-'))
                    sents.append((
                        (ids, words, tags, heads, labels, ner),
                        sent.get('brackets', [])))
                if sents:
                    yield (paragraph.get('raw', None), sents)


def _iob_to_biluo(tags):
    """Convert a sequence of IOB tags into the equivalent BILUO tags.

    The input is copied, then consumed from the front: a run of 'O' tags is
    passed through unchanged, followed by one entity re-tagged by
    _consume_ent, until nothing is left. Returns the new list of tags.
    """
    remaining = list(tags)
    converted = []
    while remaining:
        converted += _consume_os(remaining)
        converted += _consume_ent(remaining)
    return converted


def _consume_os(tags):
    while tags and tags[0] == 'O':
        yield tags.pop(0)


def _consume_ent(tags):
    if not tags:
        return []
    target = tags.pop(0).replace('B', 'I')
    length = 1
    while tags and tags[0] == target:
        length += 1
        tags.pop(0)
    label = target[2:]
    if length == 1:
        return ['U-' + label]
    else:
        start = 'B-' + label
        end = 'L-' + label
        middle = ['I-%s' % label for _ in range(1, length - 1)]
        return [start] + middle + [end]


cdef class GoldParse:
    """Gold-standard annotations aligned to a candidate tokenization.

    The per-token lists tags, heads, labels and ner hold one entry per
    candidate token. Tokens that could not be aligned to a gold token keep
    the defaults (None, None, '' and '-' respectively).
    """
    def __init__(self, tokens, annot_tuples, make_projective=False):
        """Align the gold annotations to the tokens and copy them across.

        tokens: Sequence of token objects exposing an `orth_` string.
        annot_tuples: Indexable of parallel sequences; index 1 is read as the
            gold words, 2 as tags, 3 as head indices, 4 as dependency labels
            and 5 as NER tags.
        make_projective: If true, replace the heads with the result of
            nonproj.PseudoProjectivity.projectivize.

        Raises Exception if nonproj.contains_cycle reports a cycle in the
        aligned heads.
        """
        n_tokens = len(tokens)
        self.mem = Pool()
        self.loss = 0
        self.length = n_tokens

        # These are filled by the tagger/parser/entity recogniser
        self.c.tags = <int*>self.mem.alloc(n_tokens, sizeof(int))
        self.c.heads = <int*>self.mem.alloc(n_tokens, sizeof(int))
        self.c.labels = <int*>self.mem.alloc(n_tokens, sizeof(int))
        self.c.ner = <Transition*>self.mem.alloc(n_tokens, sizeof(Transition))

        self.tags = [None] * n_tokens
        self.heads = [None] * n_tokens
        self.labels = [''] * n_tokens
        self.ner = ['-'] * n_tokens

        # Build the token texts once; both alignments and the whitespace
        # check below read them.
        words = [t.orth_ for t in tokens]
        self.cand_to_gold = align(words, annot_tuples[1])
        self.gold_to_cand = align(annot_tuples[1], words)

        self.orig_annot = list(zip(*annot_tuples))

        for i, gold_i in enumerate(self.cand_to_gold):
            if words[i].isspace():
                self.tags[i] = 'SP'
                self.heads[i] = None
                self.labels[i] = None
                self.ner[i] = 'O'
            # Aligned tokens take the gold values (this also overrides the
            # whitespace defaults set just above).
            if gold_i is not None:
                self.tags[i] = annot_tuples[2][gold_i]
                # Gold head indices are translated into candidate indices;
                # the result is None when the head has no aligned token.
                self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]
                self.labels[i] = annot_tuples[4][gold_i]
                self.ner[i] = annot_tuples[5][gold_i]

        cycle = nonproj.contains_cycle(self.heads)
        if cycle is not None:
            raise Exception("Cycle found: %s" % cycle)

        if make_projective:
            proj_heads, _ = nonproj.PseudoProjectivity.projectivize(self.heads, self.labels)
            self.heads = proj_heads

    def __len__(self):
        """Return the number of candidate tokens."""
        return self.length

    @property
    def is_projective(self):
        """True unless nonproj.is_nonproj_tree flags the current heads."""
        return not nonproj.is_nonproj_tree(self.heads)


def is_punct_label(label):
    """Return True if label is exactly 'P' or is 'punct' in any casing."""
    if label == 'P':
        return True
    return label.lower() == 'punct'
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`from __future__ import unicode_literals, print_function`

* Tmp 2015-03-09 05:46:22 +00:00			`import numpy`
caught more codecs.open -> io.open 2015-09-30 18:20:09 +00:00			`import io`
* Add read_json_file to conll.pyx 2015-05-06 14:27:31 +00:00			`import json`
* Tmp commit. Working on whole document parsing 2015-05-24 00:49:56 +00:00			`import random`
* Add functions for Levenshtein distance alignment 2015-05-24 19:50:48 +00:00			`import re`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 01:52:55 +00:00			`import os`
			`from os import path`
* Hacks to conll.pyx. Should clean these up. 2015-03-08 05:14:48 +00:00
* Tmp 2015-03-09 05:46:22 +00:00			`from libc.string cimport memset`
* Hacks to conll.pyx. Should clean these up. 2015-03-08 05:14:48 +00:00
* Allow json to be used as a fallback if ujson is not available 2015-07-25 16:11:36 +00:00			`try:`
			`import ujson as json`
			`except ImportError:`
			`import json`

integrated pseudo-projective parsing into parser - nonproj.pyx holds a class PseudoProjectivity which currently holds all functionality to implement Nivre & Nilsson 2005's pseudo-projective parsing using the HEAD decoration scheme - changed lefts/rights in Token to account for possible non-projective structures 2016-03-01 09:09:08 +00:00			`from .syntax import nonproj`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 13:40:40 +00:00
* Add file to hold GoldParse class 2015-02-21 16:06:58 +00:00
* Avoid shipping the spacy.munge package 2015-06-07 22:54:13 +00:00			`def tags_to_entities(tags):`
			`entities = []`
			`start = None`
			`for i, tag in enumerate(tags):`
			`if tag.startswith('O'):`
			`# TODO: We shouldn't be getting these malformed inputs. Fix this.`
			`if start is not None:`
			`start = None`
			`continue`
			`elif tag == '-':`
			`continue`
			`elif tag.startswith('I'):`
			`assert start is not None, tags[:i]`
			`continue`
			`if tag.startswith('U'):`
			`entities.append((tag[2:], i, i))`
			`elif tag.startswith('B'):`
			`start = i`
			`elif tag.startswith('L'):`
			`entities.append((tag[2:], start, i))`
			`start = None`
			`else:`
			`raise Exception(tag)`
			`return entities`



* Add functions for Levenshtein distance alignment 2015-05-24 19:50:48 +00:00			`def align(cand_words, gold_words):`
			`cost, edit_path = _min_edit_path(cand_words, gold_words)`
			`alignment = []`
			`i_of_gold = 0`
			`for move in edit_path:`
			`if move == 'M':`
			`alignment.append(i_of_gold)`
			`i_of_gold += 1`
			`elif move == 'S':`
			`alignment.append(None)`
			`i_of_gold += 1`
			`elif move == 'D':`
			`alignment.append(None)`
			`elif move == 'I':`
			`i_of_gold += 1`
			`else:`
			`raise Exception(move)`
			`return alignment`


			`punct_re = re.compile(r'\W')`
			`def _min_edit_path(cand_words, gold_words):`
			`cdef:`
			`Pool mem`
			`int i, j, n_cand, n_gold`
			`int* curr_costs`
			`int* prev_costs`

			`# TODO: Fix this --- just do it properly, make the full edit matrix and`
			`# then walk back over it...`
			`# Preprocess inputs`
			`cand_words = [punct_re.sub('', w) for w in cand_words]`
			`gold_words = [punct_re.sub('', w) for w in gold_words]`
* Read input json in a streaming way 2015-05-27 17:13:11 +00:00
			`if cand_words == gold_words:`
* Fix output from _min_edit_path when inputs match. 2015-06-06 03:58:53 +00:00			`return 0, ''.join(['M' for _ in gold_words])`
* Read input json in a streaming way 2015-05-27 17:13:11 +00:00			`mem = Pool()`
* Add functions for Levenshtein distance alignment 2015-05-24 19:50:48 +00:00			`n_cand = len(cand_words)`
			`n_gold = len(gold_words)`
			`# Levenshtein distance, except we need the history, and we may want different`
			`# costs.`
			`# Mark operations with a string, and score the history using _edit_cost.`
			`previous_row = []`
			`prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))`
			`curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))`
			`for i in range(n_gold + 1):`
			`cell = ''`
			`for j in range(i):`
			`cell += 'I'`
			`previous_row.append('I' * i)`
			`prev_costs[i] = i`
			`for i, cand in enumerate(cand_words):`
			`current_row = ['D' * (i + 1)]`
			`curr_costs[0] = i+1`
			`for j, gold in enumerate(gold_words):`
			`if gold.lower() == cand.lower():`
			`s_cost = prev_costs[j]`
			`i_cost = curr_costs[j] + 1`
			`d_cost = prev_costs[j + 1] + 1`
			`else:`
			`s_cost = prev_costs[j] + 1`
			`i_cost = curr_costs[j] + 1`
			`d_cost = prev_costs[j + 1] + (1 if cand else 0)`

			`if s_cost <= i_cost and s_cost <= d_cost:`
			`best_cost = s_cost`
			`best_hist = previous_row[j] + ('M' if gold == cand else 'S')`
			`elif i_cost <= s_cost and i_cost <= d_cost:`
			`best_cost = i_cost`
			`best_hist = current_row[j] + 'I'`
			`else:`
			`best_cost = d_cost`
			`best_hist = previous_row[j + 1] + 'D'`

			`current_row.append(best_hist)`
			`curr_costs[j+1] = best_cost`
			`previous_row = current_row`
			`for j in range(len(gold_words) + 1):`
			`prev_costs[j] = curr_costs[j]`
			`curr_costs[j] = 0`

			`return prev_costs[n_gold], previous_row[-1]`

* Read input json in a streaming way 2015-05-27 17:13:11 +00:00
* Allow training documents to be filtered in gold.pyx 2015-06-12 00:42:08 +00:00			`def read_json_file(loc, docs_filter=None):`
* Read json files recursively from a directory, instead of requiring a single .json file 2015-05-29 01:52:55 +00:00			`if path.isdir(loc):`
			`for filename in os.listdir(loc):`
			`yield from read_json_file(path.join(loc, filename))`
			`else:`
			`with open(loc) as file_:`
* Allow json to be used as a fallback if ujson is not available 2015-07-25 16:11:36 +00:00			`docs = json.load(file_)`
* Fix efficiency of JSON reading, by using ujson instead of stream 2015-05-30 15:54:52 +00:00			`for doc in docs:`
* Allow training documents to be filtered in gold.pyx 2015-06-12 00:42:08 +00:00			`if docs_filter is not None and not docs_filter(doc):`
			`continue`
* Fix efficiency of JSON reading, by using ujson instead of stream 2015-05-30 15:54:52 +00:00			`paragraphs = []`
			`for paragraph in doc['paragraphs']:`
			`sents = []`
			`for sent in paragraph['sentences']:`
			`words = []`
			`ids = []`
			`tags = []`
			`heads = []`
			`labels = []`
			`ner = []`
			`for i, token in enumerate(sent['tokens']):`
			`words.append(token['orth'])`
			`ids.append(i)`
don't require json-files to have syntactic annotation 2016-04-22 14:32:27 +00:00			`tags.append(token.get('tag','-'))`
			`heads.append(token.get('head',0) + i)`
don't require read_json_file to expect particular annotations 2016-05-02 13:29:30 +00:00			`labels.append(token.get('dep',''))`
* Ensure root albel is spelled ROOT, for backwards compatibility 2015-06-23 02:14:03 +00:00			`# Ensure ROOT label is case-insensitive`
			`if labels[-1].lower() == 'root':`
			`labels[-1] = 'ROOT'`
* Fix efficiency of JSON reading, by using ujson instead of stream 2015-05-30 15:54:52 +00:00			`ner.append(token.get('ner', '-'))`
			`sents.append((`
			`(ids, words, tags, heads, labels, ner),`
			`sent.get('brackets', [])))`
			`if sents:`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`yield (paragraph.get('raw', None), sents)`
* Add read_json_file to conll.pyx 2015-05-06 14:27:31 +00:00

* Add read_conll03_file function to conll.pyx 2015-04-10 02:59:11 +00:00			`def _iob_to_biluo(tags):`
			`out = []`
			`curr_label = None`
			`tags = list(tags)`
			`while tags:`
			`out.extend(_consume_os(tags))`
			`out.extend(_consume_ent(tags))`
			`return out`


			`def _consume_os(tags):`
			`while tags and tags[0] == 'O':`
			`yield tags.pop(0)`


			`def _consume_ent(tags):`
			`if not tags:`
			`return []`
			`target = tags.pop(0).replace('B', 'I')`
			`length = 1`
			`while tags and tags[0] == target:`
			`length += 1`
			`tags.pop(0)`
			`label = target[2:]`
			`if length == 1:`
			`return ['U-' + label]`
			`else:`
			`start = 'B-' + label`
			`end = 'L-' + label`
			`middle = ['I-%s' % label for _ in range(1, length - 1)]`
			`return [start] + middle + [end]`


* Tmp 2015-03-09 05:46:22 +00:00			`cdef class GoldParse:`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 10:24:24 +00:00			`def __init__(self, tokens, annot_tuples, make_projective=False):`
* Tmp 2015-03-09 05:46:22 +00:00			`self.mem = Pool()`
			`self.loss = 0`
* Tmp commit 2015-02-23 19:04:53 +00:00			`self.length = len(tokens)`
* Tmp 2015-03-09 05:46:22 +00:00
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 11:06:01 +00:00			`# These are filled by the tagger/parser/entity recogniser`
* Have oracle functions take a struct instead of a Python object 2015-06-02 18:01:06 +00:00			`self.c.tags = <int*>self.mem.alloc(len(tokens), sizeof(int))`
			`self.c.heads = <int*>self.mem.alloc(len(tokens), sizeof(int))`
			`self.c.labels = <int*>self.mem.alloc(len(tokens), sizeof(int))`
			`self.c.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition))`
* Tmp 2015-03-09 05:46:22 +00:00
* Refactoring working for parser, but now need to rig up features for NER, and then debug oracle etc. 2015-03-09 11:06:01 +00:00			`self.tags = [None] * len(tokens)`
* Tmp commit. Working on whole document parsing 2015-05-24 00:49:56 +00:00			`self.heads = [None] * len(tokens)`
			`self.labels = [''] * len(tokens)`
			`self.ner = ['-'] * len(tokens)`

* Remove cruft from conll.pyx --- unused stuff about evlauation, which now lives in spacy.scorer 2015-05-24 15:35:49 +00:00			`self.cand_to_gold = align([t.orth_ for t in tokens], annot_tuples[1])`
			`self.gold_to_cand = align(annot_tuples[1], [t.orth_ for t in tokens])`

* Python3 correction for GoldParse 2015-07-28 12:44:53 +00:00			`self.orig_annot = list(zip(*annot_tuples))`
* Tmp 2015-03-09 05:46:22 +00:00
* Fix space check in gold.pyx 2015-07-13 22:10:27 +00:00			`words = [w.orth_ for w in tokens]`
* Remove cruft from conll.pyx --- unused stuff about evlauation, which now lives in spacy.scorer 2015-05-24 15:35:49 +00:00			`for i, gold_i in enumerate(self.cand_to_gold):`
* Fix space check in gold.pyx 2015-07-13 22:10:27 +00:00			`if words[i].isspace():`
* Add SPACE part-of-speech tag, and train tagger to assign it. Also train tagger not to make whitespace an entity 2015-07-09 11:30:41 +00:00			`self.tags[i] = 'SP'`
			`self.heads[i] = None`
			`self.labels[i] = None`
			`self.ner[i] = 'O'`
* Tmp commit. Working on whole document parsing 2015-05-24 00:49:56 +00:00			`if gold_i is None:`
* Tmp 2015-03-09 05:46:22 +00:00			`pass`
* Tmp commit. Working on whole document parsing 2015-05-24 00:49:56 +00:00			`else:`
			`self.tags[i] = annot_tuples[2][gold_i]`
* Remove cruft from conll.pyx --- unused stuff about evlauation, which now lives in spacy.scorer 2015-05-24 15:35:49 +00:00			`self.heads[i] = self.gold_to_cand[annot_tuples[3][gold_i]]`
* Tmp commit. Working on whole document parsing 2015-05-24 00:49:56 +00:00			`self.labels[i] = annot_tuples[4][gold_i]`
* Fix reading of NER in gold.pyx 2015-05-27 01:17:50 +00:00			`self.ner[i] = annot_tuples[5][gold_i]`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 13:40:40 +00:00
			`cycle = nonproj.contains_cycle(self.heads)`
			`if cycle != None:`
			`raise Exception("Cycle found: %s" % cycle)`

* Allow gold parse to cut non-projective arcs 2015-05-30 23:11:56 +00:00			`if make_projective:`
adjust train.py to train both english and german models 2016-03-03 14:21:00 +00:00			`proj_heads,_ = nonproj.PseudoProjectivity.projectivize(self.heads,self.labels)`
			`self.heads = proj_heads`
* Add cycle-checking code in gold.pyx 2015-06-22 22:02:22 +00:00
* NER seems to be working, scoring 69 F. Need to add decision-history features --- currently only use current word, 2 words context. Need refactoring. 2015-03-10 17:00:23 +00:00			`def __len__(self):`
			`return self.length`
* Tmp 2015-03-09 05:46:22 +00:00
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00			`@property`
			`def is_projective(self):`
add class PseudoProjective for pseudo-projective parsing PseudoProjective() implements the algorithm from Nivre & Nilsson 2005 using their HEAD decoration scheme. 2016-02-24 10:26:25 +00:00			`return not nonproj.is_nonproj_tree(self.heads)`
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag. 2015-05-29 23:25:46 +00:00
* Add file to hold GoldParse class 2015-02-21 16:06:58 +00:00
			`def is_punct_label(label):`
			`return label == 'P' or label.lower() == 'punct'`
replace tests for non-projectivity - add functions to find non-projective edges - add test file for non-projectivity functions 2016-02-22 13:40:40 +00:00