From d528b6e36dd13d70238b085191f844728d8a7535 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 03:22:49 +0200 Subject: [PATCH] Add assign_untagged method in Morphology --- spacy/morphology.pxd | 2 ++ spacy/morphology.pyx | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 922843d6d..be6711bfd 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,6 +35,8 @@ cdef class Morphology: cdef RichTagC* rich_tags cdef PreshMapArray _cache + cdef int assign_untagged(self, TokenC* token) except -1 + cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 5ee11c151..5a4399698 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -42,7 +42,7 @@ cdef class Morphology: self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} - self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) + self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) @@ -52,6 +52,10 @@ cdef class Morphology: self.rich_tags[i].morph = 0 self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i + # Add a 'null' tag, which we can reference when assigning morphology to + # untagged tokens. 
+ self.rich_tags[self.n_tags].id = self.n_tags + self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: @@ -62,6 +66,10 @@ cdef class Morphology: return (Morphology, (self.strings, self.tag_map, self.lemmatizer, self.exc), None, None) + cdef int assign_untagged(self, TokenC* token) except -1: + '''Set morphological attributes on a token without a POS tag.''' + token.lemma = self.lemmatize(0, token.lex.orth, {}) + cdef int assign_tag(self, TokenC* token, tag) except -1: if isinstance(tag, basestring): tag = self.strings.add(tag) @@ -72,7 +80,7 @@ cdef class Morphology: token.tag = tag cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: - if tag_id >= self.n_tags: + if tag_id > self.n_tags: raise ValueError("Unknown tag ID: %s" % tag_id) # TODO: It's pretty arbitrary to put this logic here. I guess the justification # is that this is where the specific word and the tag interact. Still, @@ -151,8 +159,6 @@ cdef class Morphology: cdef unicode py_string = self.strings[orth] if self.lemmatizer is None: return self.strings.add(py_string.lower()) - if univ_pos not in (NOUN, VERB, ADJ, PUNCT): - return self.strings.add(py_string.lower()) cdef set lemma_strings cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)