From 064bd69ad03fc90cec13ad93fdbfbdb70980cf86 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 7 Oct 2015 00:39:50 +1100
Subject: [PATCH] * Refactor symbols, so that frequency rank can be derived from the orth id of a word.

---
Review notes (text here is ignored by git-am and is not part of the commit):
  - setup.py now lists 'spacy.symbols' and parts_of_speech.pxd cimports from
    .symbols, but no spacy/symbols.* file is touched by this patch.
    TODO confirm the module already exists in the target tree.
  - univ_pos_t no longer defines N_UNIV_TAGS. TODO confirm nothing still
    depends on it.
  - The string pre-loading loops added to Vocab.__init__ are still commented
    out; UNIV_POS_NAMES, UNIV_MORPH_NAMES, ENTITY_TYPES and TAG_MAP are not
    defined anywhere in this patch.
  - bin/init_model.py: the probability ranking is computed once and shared by
    both passes, so the StringStore pass and the lexeme pass are guaranteed
    to visit words in the same order.

 bin/init_model.py         |  9 +++-
 setup.py                  |  3 +-
 spacy/attrs.pxd           |  4 +-
 spacy/attrs.pyx           | 90 +++++++++++++++++++++++++++++++++++++++
 spacy/matcher.pyx         |  2 +-
 spacy/parts_of_speech.pxd | 45 ++++++++++----------
 spacy/vocab.pyx           | 15 +++++++
 7 files changed, 141 insertions(+), 27 deletions(-)

diff --git a/bin/init_model.py b/bin/init_model.py
index 72d7a3aae..6e44fd444 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -168,6 +168,13 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
             probs[word] = oov_prob
 
     lexicon = []
-    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
+    # Rank the words once, most probable first; both passes share this order.
+    ranked = list(reversed(sorted(probs.items(), key=lambda item: item[1])))
+    for word, prob in ranked:
+        # First encode the strings into the StringStore. This way, we can map
+        # the orth IDs to frequency ranks
+        _ = vocab.strings[word]
+    # Now actually load the vocab
+    for word, prob in ranked:
         lexeme = vocab[word]
         lexeme.prob = prob
diff --git a/setup.py b/setup.py
index 3036db94c..fb6a5b718 100644
--- a/setup.py
+++ b/setup.py
@@ -165,7 +165,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
              'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
              'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
              'spacy.cfile', 'spacy.matcher',
-             'spacy.syntax.ner']
+             'spacy.syntax.ner',
+             'spacy.symbols']
 
 
 if __name__ == '__main__':
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index c810762ef..d0f476dcd 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -1,5 +1,6 @@
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
+    NULL_ATTR
     IS_ALPHA
     IS_ASCII
     IS_DIGIT
@@ -14,8 +15,7 @@ cpdef enum attr_id_t:
     IS_STOP
     IS_OOV
 
-    FLAG13 = 13
-    FLAG14
+    FLAG14 = 14
     FLAG15
     FLAG16
     FLAG17
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index e69de29bb..8ce0f7a17 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -0,0 +1,90 @@
+ATTR_IDS = {
+    "NULL_ATTR": NULL_ATTR,
+    "IS_ALPHA": IS_ALPHA,
+    "IS_ASCII": IS_ASCII,
+    "IS_DIGIT": IS_DIGIT,
+    "IS_LOWER": IS_LOWER,
+    "IS_PUNCT": IS_PUNCT,
+    "IS_SPACE": IS_SPACE,
+    "IS_TITLE": IS_TITLE,
+    "IS_UPPER": IS_UPPER,
+    "LIKE_URL": LIKE_URL,
+    "LIKE_NUM": LIKE_NUM,
+    "LIKE_EMAIL": LIKE_EMAIL,
+    "IS_STOP": IS_STOP,
+    "IS_OOV": IS_OOV,
+
+    "FLAG14": FLAG14,
+    "FLAG15": FLAG15,
+    "FLAG16": FLAG16,
+    "FLAG17": FLAG17,
+    "FLAG18": FLAG18,
+    "FLAG19": FLAG19,
+    "FLAG20": FLAG20,
+    "FLAG21": FLAG21,
+    "FLAG22": FLAG22,
+    "FLAG23": FLAG23,
+    "FLAG24": FLAG24,
+    "FLAG25": FLAG25,
+    "FLAG26": FLAG26,
+    "FLAG27": FLAG27,
+    "FLAG28": FLAG28,
+    "FLAG29": FLAG29,
+    "FLAG30": FLAG30,
+    "FLAG31": FLAG31,
+    "FLAG32": FLAG32,
+    "FLAG33": FLAG33,
+    "FLAG34": FLAG34,
+    "FLAG35": FLAG35,
+    "FLAG36": FLAG36,
+    "FLAG37": FLAG37,
+    "FLAG38": FLAG38,
+    "FLAG39": FLAG39,
+    "FLAG40": FLAG40,
+    "FLAG41": FLAG41,
+    "FLAG42": FLAG42,
+    "FLAG43": FLAG43,
+    "FLAG44": FLAG44,
+    "FLAG45": FLAG45,
+    "FLAG46": FLAG46,
+    "FLAG47": FLAG47,
+    "FLAG48": FLAG48,
+    "FLAG49": FLAG49,
+    "FLAG50": FLAG50,
+    "FLAG51": FLAG51,
+    "FLAG52": FLAG52,
+    "FLAG53": FLAG53,
+    "FLAG54": FLAG54,
+    "FLAG55": FLAG55,
+    "FLAG56": FLAG56,
+    "FLAG57": FLAG57,
+    "FLAG58": FLAG58,
+    "FLAG59": FLAG59,
+    "FLAG60": FLAG60,
+    "FLAG61": FLAG61,
+    "FLAG62": FLAG62,
+    "FLAG63": FLAG63,
+
+    "ID": ID,
+    "ORTH": ORTH,
+    "LOWER": LOWER,
+    "NORM": NORM,
+    "SHAPE": SHAPE,
+    "PREFIX": PREFIX,
+    "SUFFIX": SUFFIX,
+
+    "LENGTH": LENGTH,
+    "CLUSTER": CLUSTER,
+    "LEMMA": LEMMA,
+    "POS": POS,
+    "TAG": TAG,
+    "DEP": DEP,
+    "ENT_IOB": ENT_IOB,
+    "ENT_TYPE": ENT_TYPE,
+    "HEAD": HEAD,
+    "SPACY": SPACY,
+    "PROB": PROB,
+}
+
+# Attribute names, ordered by the numeric ID each maps to in ATTR_IDS
+ATTR_NAMES = [key for key, value in sorted(ATTR_IDS.items(), key=lambda item: item[1])]
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index afafd3ddb..3ee825932 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -15,7 +15,7 @@ from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
 
 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
-from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
+from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index e410c6971..17e349435 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -1,23 +1,24 @@
-# Google universal tag set
+from .symbols cimport *
+
+
 cpdef enum univ_pos_t:
-    NO_TAG
-    ADJ
-    ADP
-    ADV
-    AUX
-    CONJ
-    DET
-    INTJ
-    NOUN
-    NUM
-    PART
-    PRON
-    PROPN
-    PUNCT
-    SCONJ
-    SYM
-    VERB
-    X
-    EOL
-    SPACE
-    N_UNIV_TAGS
+    NO_TAG = EMPTY_VALUE
+    ADJ = POS_adj
+    ADP = POS_adp
+    ADV = POS_adv
+    AUX = POS_aux
+    CONJ = POS_conj
+    DET = POS_det
+    INTJ = POS_intj
+    NOUN = POS_noun
+    NUM = POS_num
+    PART = POS_part
+    PRON = POS_pron
+    PROPN = POS_propn
+    PUNCT = POS_punct
+    SCONJ = POS_sconj
+    SYM = POS_sym
+    VERB = POS_verb
+    X = POS_x
+    EOL = POS_eol
+    SPACE = POS_space
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index d79da8a79..caf3045f5 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -67,6 +67,21 @@ cdef class Vocab:
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
+        # Load strings in a special order, so that we have an onset number for
+        # the vocabulary. This way, when words are added in order, the orth ID
+        # is the frequency rank of the word, plus a certain offset. The structural
+        # strings are loaded first, because the vocab is open-class, and these
+        # symbols are closed class.
+        #for attr_name in sorted(ATTR_NAMES):
+        #    _ = self.strings[attr_name]
+        #for univ_pos_name in sorted(UNIV_POS_NAMES.keys()):
+        #    _ = self.strings[univ_pos_name]
+        #for morph_name in sorted(UNIV_MORPH_NAMES.keys()):
+        #    _ = self.strings[morph_name]
+        #for entity_type_name in sorted(ENTITY_TYPES.keys()):
+        #    _ = self.strings[entity_type_name]
+        #for tag_name in sorted(TAG_MAP.keys()):
+        #    _ = self.strings[tag_name]
         self.get_lex_attr = get_lex_attr
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
         self.serializer_freqs = serializer_freqs