* Start setting out how NER will be implemented in the data model

2015-02-02 16:35:58 +11:00 · 2015-02-02 16:35:58 +11:00 · b139aa92ba
parent 0962ffc095
commit b139aa92ba
1 changed files with 20 additions and 1 deletions
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -1,4 +1,4 @@
-from libc.stdint cimport uint8_t, uint32_t
+from libc.stdint cimport int8_t, uint8_t, uint16_t, int32_t, uint32_t

 from .typedefs cimport flags_t, attr_t, id_t, hash_t
 from .parts_of_speech cimport univ_pos_t
@ -42,9 +42,28 @@ cdef struct PosTag:
    univ_pos_t pos


+# Start and end will be offsets: i + ent.start will always take you to the
+# "next" entity start. If token i is inside an entity, ent.start will be
+# negative --- the "next" start is the start of the entity the token is inside.
+# If i _is_ the start of an entity, ent.start points to the start of the next one.
+#
+# The same/inverse is true for end. If ent.end has a negative value, we are either
+# at the end of an entity, or outside one.  If we're inside an entity, ent.end
+# will have a positive value.
+#
+# This allows us to easily find the span of an entity we might be inside, while
+# naturally sharing an API with iterating through all entities in the sentence
+cdef struct Entity:  # Per-token entity annotation; embedded by value in TokenC as `ent`.
+    int32_t tag  # NOTE(review): presumably the entity label id --- confirm against the tagger
+    uint16_t flags  # 16 bits of flags; bit meanings are not defined in this file
+    int8_t start  # Signed offset from token index i to an entity start; int8_t limits it to -128..127
+    int8_t end  # Signed offset from token index i to an entity end; int8_t limits it to -128..127
+
+
 cdef struct TokenC:
    const LexemeC* lex
    Morphology morph
+    Entity ent
    univ_pos_t pos
    int tag
    int idx