From b139aa92ba137a2a6c604eccc300403f9f7124d8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 2 Feb 2015 16:35:58 +1100
Subject: [PATCH] * Start setting out how NER will be implemented in the data model

---
 spacy/structs.pxd | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 53b384f1f..da5322ce2 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -1,4 +1,4 @@
-from libc.stdint cimport uint8_t, uint32_t
+from libc.stdint cimport int8_t, int32_t, uint8_t, uint16_t, uint32_t
 
 from .typedefs cimport flags_t, attr_t, id_t, hash_t
 from .parts_of_speech cimport univ_pos_t
@@ -42,9 +42,28 @@ cdef struct PosTag:
     univ_pos_t pos
 
 
+# Start and end are signed token offsets: i + ent.start always takes you to
+# the start of the "next" entity. If token i is inside an entity, ent.start is
+# negative, so the "next" start is that of the entity the token is inside. If
+# i _is_ the start of an entity, ent.start points to the start of the next one.
+#
+# The same/inverse is true for end. If ent.end has a negative value, we are either
+# at the end of an entity, or outside one. If we're inside an entity, ent.end
+# will have a positive value.
+#
+# This allows us to easily find the span of an entity we might be inside, while
+# naturally sharing an API with iterating through all entities in the sentence.
+cdef struct Entity:
+    int32_t tag
+    uint16_t flags
+    int8_t start  # offset to an entity start; int8_t caps it at -128..127
+    int8_t end    # offset to an entity end; same -128..127 range
+
+
 cdef struct TokenC:
     const LexemeC* lex
     Morphology morph
+    Entity ent
     univ_pos_t pos
     int tag
     int idx