* Start setting out how NER will be implemented in the data model

This commit is contained in:
Matthew Honnibal 2015-02-02 16:35:58 +11:00
parent 0962ffc095
commit b139aa92ba
1 changed files with 20 additions and 1 deletions

View File

@ -1,4 +1,4 @@
from libc.stdint cimport uint8_t, uint32_t from libc.stdint cimport int8_t, uint8_t, uint16_t, uint32_t
from .typedefs cimport flags_t, attr_t, id_t, hash_t from .typedefs cimport flags_t, attr_t, id_t, hash_t
from .parts_of_speech cimport univ_pos_t from .parts_of_speech cimport univ_pos_t
@ -42,9 +42,28 @@ cdef struct PosTag:
univ_pos_t pos univ_pos_t pos
# Start and end are stored as offsets from the token's index i: i + ent.start
# will always take you to the "next" entity start. If token i is inside an
# entity, ent.start will be negative --- the "next" entity start is then the
# start of the entity the token is inside. If i _is_ the start of an entity,
# then ent.start will point to the beginning of the following entity.
#
# The same/inverse is true for end. If ent.end has a negative value, we are either
# at the end of an entity, or outside one. If we're inside an entity, ent.end
# will have a positive value.
#
# This allows us to easily find the span of an entity we might be inside, while
# naturally sharing an API with iterating through all entities in the sentence.
# Per-token named-entity annotation; embedded in TokenC as the `ent` field.
# NOTE(review): int32_t is not among the names cimported from libc.stdint in
# the visible import line (int8_t, uint8_t, uint16_t, uint32_t) --- confirm it
# is cimported elsewhere in the file, or add it, otherwise this will not compile.
cdef struct Entity:
# Entity label; presumably an integer ID for the entity type --- TODO confirm.
int32_t tag
# 16 bits of flags; their individual meanings are not defined in this section.
uint16_t flags
# Signed offset from the token's index to the "next" entity start; negative
# when the token is inside an entity.
# NOTE(review): int8_t restricts this offset to the range -128..127 tokens.
int8_t start
# Signed offset counterpart for the entity end; positive when the token is
# inside an entity, negative when at the end of an entity or outside one.
int8_t end
cdef struct TokenC: cdef struct TokenC:
const LexemeC* lex const LexemeC* lex
Morphology morph Morphology morph
Entity ent
univ_pos_t pos univ_pos_t pos
int tag int tag
int idx int idx