From 334c4b2b57c5b671bd8e5348680fbbfc6fda8a6b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 18 Jan 2016 18:14:09 +0100
Subject: [PATCH] * Disprefer punctuation and spaces as heads of spans

---
 spacy/tokens/span.pyx | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 455cd2045..e4e6ad582 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -12,6 +12,8 @@ from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
 from .doc cimport token_by_start, token_by_end
+from ..attrs cimport IS_PUNCT, IS_SPACE
+from ..lexeme cimport Lexeme
 
 
 cdef class Span:
@@ -183,12 +185,17 @@ cdef class Span:
             # For each word, we count the path length, and arg min this measure.
             # We could use better tree logic to save steps here...But I think this
             # should be okay.
-            cdef int current_best = _count_words_to_root(&self.doc.c[self.start],
-                                                         self.doc.length)
-            cdef int root = self.start
+            cdef int current_best = self.doc.length
+            cdef int root = -1
             for i in range(self.start, self.end):
                 if self.start <= (i+self.doc.c[i].head) < self.end:
                     continue
+                # Skip punctuation and spaces once a candidate root has already
+                # been found; they are only considered while root is still -1.
+                if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_PUNCT):
+                    continue
+                if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_SPACE):
+                    continue
                 words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
                 if words_to_root < current_best:
                     current_best = words_to_root