From b67697a97bfd78347b30d3974d81e56bc7137ffa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 17 Oct 2016 14:02:13 +0200 Subject: [PATCH] Improve API for doc.merge() and span.merge(), to use keyword arguments. --- spacy/tokens/doc.pyx | 22 +++++++++++++++++++--- spacy/tokens/span.pyx | 4 ++-- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5642dc624..95dd392fa 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -593,9 +593,22 @@ cdef class Doc: keep_reading = False yield n_bytes_str + data - def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, - unicode ent_type): + def merge(self, int start_idx, int end_idx, *args, **attributes): """Merge a multi-word expression into a single token.""" + cdef unicode tag, lemma, ent_type + if len(args) == 3: + # TODO: Warn deprecation + tag, lemma, ent_type = args + attributes[TAG] = self.strings[tag] + attributes[LEMMA] = self.strings[lemma] + attributes[ENT_TYPE] = self.strings[ent_type] + elif args: + raise ValueError( + "Doc.merge received %d non-keyword arguments. " + "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). " + "Arguments supplied:\n%s\n" + "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) + cdef int start = token_by_start(self.c, self.length, start_idx) if start == -1: return None @@ -604,8 +617,11 @@ cdef class Doc: return None # Currently we have the token index, we want the range-end index end += 1 - cdef Span span = self[start:end] + tag = self.strings[attributes.get(TAG, span.root.tag)] + lemma = self.strings[attributes.get(LEMMA, span.root.lemma)] + ent_type = self.strings[attributes.get(ENT_TYPE, span.root.ent_type)] + # Get LexemeC for newly merged token new_orth = ''.join([t.text_with_ws for t in span]) if span[-1].whitespace_: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 6fff4d93a..dc23481f6 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -77,8 +77,8 @@ cdef class Span: for i in range(self.start, self.end): yield self.doc[i] - def merge(self, unicode tag, unicode lemma, unicode ent_type): - self.doc.merge(self.start_char, self.end_char, tag, lemma, ent_type) + def merge(self, *args, **attributes): + self.doc.merge(self.start_char, self.end_char, *args, **attributes) def similarity(self, other): if 'similarity' in self.doc.getters_for_spans: