From b139aa92ba137a2a6c604eccc300403f9f7124d8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 2 Feb 2015 16:35:58 +1100
Subject: [PATCH] * Start setting out how NER will be implemented in the data
 model

---
 spacy/structs.pxd | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 53b384f1f..da5322ce2 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -1,4 +1,4 @@
-from libc.stdint cimport uint8_t, uint32_t
+from libc.stdint cimport int8_t, int32_t, uint8_t, uint16_t, uint32_t
 
 from .typedefs cimport flags_t, attr_t, id_t, hash_t
 from .parts_of_speech cimport univ_pos_t
@@ -42,9 +42,28 @@ cdef struct PosTag:
     univ_pos_t pos
 
 
+# Start and end are stored as offsets from the current token i: i + ent.start
+# always takes you to an entity start. If i is inside an entity, ent.start is
+# negative and points back to the start of the entity i is inside.  If i _is_
+# the start of an entity, ent.start points to the beginning of the next one.
+#
+# The inverse holds for end. If ent.end has a negative value, we are either
+# at the end of an entity, or outside one.  If we're inside an entity, ent.end
+# will have a positive value.
+#
+# This allows us to easily find the span of an entity we might be inside, while
+# naturally sharing an API with iterating through all entities in the sentence
+cdef struct Entity:  # Per-token entity annotation; embedded in TokenC as the `ent` field.
+    int32_t tag  # Tag identifier for the entity (value set is not defined in this file section).
+    uint16_t flags  # Bit flags for the entity (individual bit meanings are not defined here).
+    int8_t start  # Signed token offset to an entity start; see the comment above. NOTE(review): int8_t holds only -128..127 -- confirm this range suffices.
+    int8_t end  # Signed token offset for the entity end; sign convention is described in the comment above.
+
+
 cdef struct TokenC:
     const LexemeC* lex
     Morphology morph
+    Entity ent
     univ_pos_t pos
     int tag
     int idx