From b67697a97bfd78347b30d3974d81e56bc7137ffa Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 17 Oct 2016 14:02:13 +0200
Subject: [PATCH] Improve API for doc.merge() and span.merge() to accept
 keyword arguments.

---
 spacy/tokens/doc.pyx  | 22 +++++++++++++++++++---
 spacy/tokens/span.pyx |  4 ++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5642dc624..95dd392fa 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -593,9 +593,22 @@ cdef class Doc:
                 keep_reading = False
             yield n_bytes_str + data
 
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
+    def merge(self, int start_idx, int end_idx, *args, **attributes):
         """Merge a multi-word expression into a single token."""
+        cdef unicode tag, lemma, ent_type
+        if len(args) == 3:
+            # TODO: Emit a deprecation warning for positional arguments
+            tag, lemma, ent_type = args
+            attributes[TAG] = self.strings[tag]
+            attributes[LEMMA] = self.strings[lemma]
+            attributes[ENT_TYPE] = self.strings[ent_type]
+        elif args:
+            raise ValueError(
+                "Doc.merge received %d non-keyword arguments. "
+                "Expected either 3 arguments (deprecated), or 0 (use keyword arguments). "
+                "Arguments supplied:\n%s\n"
+                "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
+ 
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -604,8 +617,11 @@ cdef class Doc:
             return None
         # Currently we have the token index, we want the range-end index
         end += 1
-        
         cdef Span span = self[start:end]
+        tag = self.strings[attributes.get(TAG, span.root.tag)]
+        lemma = self.strings[attributes.get(LEMMA, span.root.lemma)]
+        ent_type = self.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
+
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
         if span[-1].whitespace_:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 6fff4d93a..dc23481f6 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -77,8 +77,8 @@ cdef class Span:
         for i in range(self.start, self.end):
             yield self.doc[i]
 
-    def merge(self, unicode tag, unicode lemma, unicode ent_type):
-        self.doc.merge(self.start_char, self.end_char, tag, lemma, ent_type)
+    def merge(self, *args, **attributes):
+        self.doc.merge(self.start_char, self.end_char, *args, **attributes)
 
     def similarity(self, other):
         if 'similarity' in self.doc.getters_for_spans: