Tidy up gold

ines 2017-10-27 17:02:55 +02:00
parent 6a0483b7aa
commit a6135336f5
1 changed file with 43 additions and 36 deletions


@@ -54,7 +54,8 @@ def merge_sents(sents):
         m_deps[3].extend(head + i for head in heads)
         m_deps[4].extend(labels)
         m_deps[5].extend(ner)
-        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
+        m_brackets.extend((b['first'] + i, b['last'] + i, b['label'])
+                          for b in brackets)
         i += len(ids)
     return [(m_deps, m_brackets)]
@@ -80,6 +81,8 @@ def align(cand_words, gold_words):
 punct_re = re.compile(r'\W')
+
+
 def _min_edit_path(cand_words, gold_words):
     cdef:
         Pool mem
@@ -98,9 +101,9 @@ def _min_edit_path(cand_words, gold_words):
     mem = Pool()
     n_cand = len(cand_words)
     n_gold = len(gold_words)
-    # Levenshtein distance, except we need the history, and we may want different
-    # costs.
-    # Mark operations with a string, and score the history using _edit_cost.
+    # Levenshtein distance, except we need the history, and we may want
+    # different costs. Mark operations with a string, and score the history
+    # using _edit_cost.
     previous_row = []
     prev_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
     curr_costs = <int*>mem.alloc(n_gold + 1, sizeof(int))
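
Note: the comment above names the technique without showing it. For orientation, here is a pure-Python sketch of Levenshtein distance that also records the operation history; the 'M'/'S'/'I'/'D' marks and unit costs are illustrative stand-ins, not the actual strings or costs used by `_min_edit_path`.

    # Illustration only -- not the Cython implementation above.
    def min_edit_path_sketch(cand_words, gold_words):
        # table[i][j] = (cost, history) for cand_words[:i] vs gold_words[:j]
        table = [[(0, '')] * (len(gold_words) + 1)
                 for _ in range(len(cand_words) + 1)]
        for j in range(1, len(gold_words) + 1):
            table[0][j] = (j, 'I' * j)   # insert every gold word
        for i in range(1, len(cand_words) + 1):
            table[i][0] = (i, 'D' * i)   # delete every candidate word
        for i in range(1, len(cand_words) + 1):
            for j in range(1, len(gold_words) + 1):
                match = cand_words[i - 1] == gold_words[j - 1]
                sub = (table[i - 1][j - 1][0] + (0 if match else 1),
                       table[i - 1][j - 1][1] + ('M' if match else 'S'))
                ins = (table[i][j - 1][0] + 1, table[i][j - 1][1] + 'I')
                dele = (table[i - 1][j][0] + 1, table[i - 1][j][1] + 'D')
                table[i][j] = min(sub, ins, dele)
        return table[-1][-1]

    print(min_edit_path_sketch(['a', 'b'], ['a', 'c', 'b']))  # (1, 'MIM')
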
@@ -144,9 +147,9 @@ def _min_edit_path(cand_words, gold_words):
 def minibatch(items, size=8):
-    '''Iterate over batches of items. `size` may be an iterator,
-    so that batch-size can vary on each step.
-    '''
+    """Iterate over batches of items. `size` may be an iterator,
+    so that batch-size can vary on each step.
+    """
     if isinstance(size, int):
         size_ = itertools.repeat(8)
     else:
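
Note: a hedged usage sketch of the iterator case the docstring describes, assuming the rest of the function consumes one size per batch. As shown above, the integer branch repeats the default 8 rather than `size`, so passing an iterator is the reliable way to control batch size here. The sizes below are arbitrary example values, with one spare so the size iterator outlives the data:

    sizes = iter([2, 3, 5, 8])
    for batch in minibatch(range(10), size=sizes):
        print(batch)
    # [0, 1]
    # [2, 3, 4]
    # [5, 6, 7, 8, 9]
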
@@ -168,6 +171,7 @@ class GoldCorpus(object):
         train_path (unicode or Path): File or directory of training data.
         dev_path (unicode or Path): File or directory of development data.
+        RETURNS (GoldCorpus): The newly created object.
         """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
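
Note: for context, constructing the corpus this docstring describes is a one-liner; the paths below are made-up placeholders.

    from spacy.gold import GoldCorpus

    # Hypothetical paths -- each may be a single file or a directory.
    corpus = GoldCorpus('data/train.json', 'data/dev.json')
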
@@ -222,7 +226,6 @@ class GoldCorpus(object):
     def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
-        #gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs
 
     @classmethod
@@ -233,7 +236,6 @@ class GoldCorpus(object):
                 raw_text = None
             else:
                 paragraph_tuples = merge_sents(paragraph_tuples)
-
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
@@ -248,17 +250,20 @@ class GoldCorpus(object):
             raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
+            return [Doc(nlp.vocab,
+                        words=add_noise(sent_tuples[1], noise_level))
                     for (sent_tuples, brackets) in paragraph_tuples]
 
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
         assert len(docs) == len(paragraph_tuples)
         if len(docs) == 1:
-            return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0])]
+            return [GoldParse.from_annot_tuples(docs[0],
+                                                paragraph_tuples[0][0])]
         else:
             return [GoldParse.from_annot_tuples(doc, sent_tuples)
-                    for doc, (sent_tuples, brackets) in zip(docs, paragraph_tuples)]
+                    for doc, (sent_tuples, brackets)
+                    in zip(docs, paragraph_tuples)]
 
     @staticmethod
     def walk_corpus(path):
@@ -330,9 +335,9 @@ def read_json_file(loc, docs_filter=None, limit=None):
                     for i, token in enumerate(sent['tokens']):
                         words.append(token['orth'])
                         ids.append(i)
-                        tags.append(token.get('tag','-'))
-                        heads.append(token.get('head',0) + i)
-                        labels.append(token.get('dep',''))
+                        tags.append(token.get('tag', '-'))
+                        heads.append(token.get('head', 0) + i)
+                        labels.append(token.get('dep', ''))
                         # Ensure ROOT label is case-insensitive
                         if labels[-1].lower() == 'root':
                             labels[-1] = 'ROOT'
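
Note: for orientation, the loop above implies a per-token layout roughly like the following (a hedged reconstruction from the `token.get(...)` calls, not the full JSON training schema). `head` is a relative offset that gets added to `i`, so 0 means the token heads itself.

    # 'tag', 'head' and 'dep' are optional, defaulting to '-', 0 and ''.
    sent = {
        'tokens': [
            {'orth': 'London', 'tag': 'NNP', 'head': 1, 'dep': 'nsubj'},
            {'orth': 'calls', 'tag': 'VBZ', 'head': 0, 'dep': 'ROOT'},
        ]
    }
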
@@ -382,19 +387,21 @@ cdef class GoldParse:
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples
-        return cls(doc, words=words, tags=tags, heads=heads, deps=deps, entities=entities,
-                   make_projective=make_projective)
+        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
+                   entities=entities, make_projective=make_projective)
 
-    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
-                 deps=None, entities=None, make_projective=False,
+    def __init__(self, doc, annot_tuples=None, words=None, tags=None,
+                 heads=None, deps=None, entities=None, make_projective=False,
                  cats=None):
         """Create a GoldParse.
 
         doc (Doc): The document the annotations refer to.
         words (iterable): A sequence of unicode word strings.
         tags (iterable): A sequence of strings, representing tag annotations.
-        heads (iterable): A sequence of integers, representing syntactic head offsets.
-        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
@@ -404,9 +411,10 @@ cdef class GoldParse:
             document (usually a sentence). Unlike entity annotations, label
             annotations can overlap, i.e. a single word can be covered by
             multiple labelled spans. The TextCategorizer component expects
-            true examples of a label to have the value 1.0, and negative examples
-            of a label to have the value 0.0. Labels not in the dictionary are
-            treated as missing -- the gradient for those labels will be zero.
+            true examples of a label to have the value 1.0, and negative
+            examples of a label to have the value 0.0. Labels not in the
+            dictionary are treated as missing - the gradient for those labels
+            will be zero.
         RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
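
Note: a hedged sketch of constructing a `GoldParse` from the arguments listed above. The sentence and annotations are made-up example values; heads are absolute token indices (a root heads itself), and the entities here use BILUO tag strings.

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    doc = nlp.make_doc('London calling')
    gold = GoldParse(doc, words=['London', 'calling'],
                     tags=['NNP', 'VBG'],
                     heads=[1, 1], deps=['nsubj', 'ROOT'],
                     entities=['U-GPE', 'O'])
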
@@ -470,11 +478,11 @@ cdef class GoldParse:
                 self.ner[i] = entities[gold_i]
 
         cycle = nonproj.contains_cycle(self.heads)
-        if cycle != None:
+        if cycle is not None:
             raise Exception("Cycle found: %s" % cycle)
 
         if make_projective:
-            proj_heads,_ = nonproj.projectivize(self.heads, self.labels)
+            proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
             self.heads = proj_heads
 
     def __len__(self):
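
Note: as a quick illustration of the cycle check (hedged; heads are absolute token indices and a root token is its own head):

    from spacy.syntax import nonproj

    # Token 1 is its own head, i.e. the root: no cycle.
    assert nonproj.contains_cycle([1, 1]) is None
    # Tokens 0 and 1 head each other and no root exists: a cycle,
    # which is what the constructor above turns into an exception.
    assert nonproj.contains_cycle([1, 0]) is not None
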
@@ -497,20 +505,19 @@ cdef class GoldParse:
 def biluo_tags_from_offsets(doc, entities, missing='O'):
-    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (BILUO).
+    """Encode labelled spans into per-token tags, using the
+    Begin/In/Last/Unit/Out scheme (BILUO).
 
     doc (Doc): The document that the entity offsets refer to. The output tags
         will refer to the token boundaries within the document.
-    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
-        `end` should be character-offset integers denoting the slice into the
-        original string.
+    entities (iterable): A sequence of `(start, end, label)` triples. `start`
+        and `end` should be character-offset integers denoting the slice into
+        the original string.
     RETURNS (list): A list of unicode strings, describing the tags. Each tag
         string will be of the form either "", "O" or "{action}-{label}", where
         action is one of "B", "I", "L", "U". The string "-" is used where the
-        entity offsets don't align with the tokenization in the `Doc` object. The
-        training algorithm will view these as missing values. "O" denotes a
+        entity offsets don't align with the tokenization in the `Doc` object.
+        The training algorithm will view these as missing values. "O" denotes a
         non-entity token. "B" denotes the beginning of a multi-token entity,
         "I" the inside of an entity of three or more tokens, and "L" the end
         of an entity of two or more tokens. "U" denotes a single-token entity.
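
Note: a usage sketch of this encoder with example values; "London" occupies characters 7-13 of the text and aligns with a single token, so it gets a "U" tag.

    import spacy
    from spacy.gold import biluo_tags_from_offsets

    nlp = spacy.blank('en')
    doc = nlp.make_doc('I like London.')
    tags = biluo_tags_from_offsets(doc, [(7, 13, 'LOC')])
    print(tags)  # ['O', 'O', 'U-LOC', 'O']
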