def tags_to_entities(tags):
    """Collect entity spans from a sequence of per-token entity tags.

    Tags are dispatched on their first character:

    * ``O...`` -- outside any entity; also abandons a span that was opened
      by ``B`` but never closed by ``L``.
    * ``-``    -- skipped entirely (an open span is left untouched).
    * ``B...`` -- opens a span at this index (overwriting any open one).
    * ``I...`` -- continues the open span; a span must be open.
    * ``L...`` -- closes the open span and emits it; a span must be open.
    * ``U...`` -- emits a single-token span.

    The label of an emitted span is ``tag[2:]``, i.e. everything after the
    one-character prefix and the character that follows it.

    Args:
        tags: sequence of tag strings, one per token.

    Returns:
        A list of ``(label, start, end)`` tuples in order of completion,
        where ``start`` and ``end`` are token indices and ``end`` is
        inclusive (a ``U`` tag at index ``i`` yields ``(label, i, i)``).

    Raises:
        AssertionError: an ``I`` or ``L`` tag occurs while no span is open.
            The exception argument is the list of tags preceding it.
        ValueError: a tag matches none of the recognised prefixes.
    """
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag.startswith('O'):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            # Reaching 'O' with a span still open means a 'B' was never
            # closed by an 'L'; drop the dangling span rather than fail.
            start = None
        elif tag == '-':
            continue
        elif tag.startswith('I'):
            # Raised explicitly (not via ``assert``) so the check survives
            # builds/runs where assertions are disabled.
            if start is None:
                raise AssertionError(tags[:i])
        elif tag.startswith('U'):
            entities.append((tag[2:], i, i))
        elif tag.startswith('B'):
            start = i
        elif tag.startswith('L'):
            # Without this guard a stray 'L' would emit a span whose start
            # is None; reject it the same way a stray 'I' is rejected.
            if start is None:
                raise AssertionError(tags[:i])
            entities.append((tag[2:], start, i))
            start = None
        else:
            raise ValueError(
                "Unexpected entity tag %r at index %d" % (tag, i))
    return entities