diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b1b707c6a..e255dbb48 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -74,9 +74,9 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] return string - + cdef class StringStore: - """Lookup strings by 64-bit hash""" + """Look up strings by 64-bit hashes.""" def __init__(self, strings=None, freeze=False): """Create the StringStore. @@ -92,9 +92,9 @@ cdef class StringStore: self.add(string) def __getitem__(self, object string_or_id): - """Retrieve a string from a given hash ID, or vice versa. + """Retrieve a string from a given hash, or vice versa. - string_or_id (bytes or unicode or uint64): The value to encode. + string_or_id (bytes, unicode or uint64): The value to encode. Returns (unicode or uint64): The value to be retrieved. """ if isinstance(string_or_id, basestring) and len(string_or_id) == 0: @@ -123,6 +123,11 @@ cdef class StringStore: return decode_Utf8Str(utf8str) def add(self, string): + """Add a string to the StringStore. + + string (unicode): The string to add. + RETURNS (uint64): The string's hash value. + """ if isinstance(string, unicode): if string in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string] diff --git a/website/assets/img/docs/vocab_stringstore.svg b/website/assets/img/docs/vocab_stringstore.svg index 644453737..119175247 100644 --- a/website/assets/img/docs/vocab_stringstore.svg +++ b/website/assets/img/docs/vocab_stringstore.svg @@ -7,30 +7,30 @@ - 3572 + 31979... Lexeme - 508 + 46904... Lexeme - 949 + 37020... Lexeme "coffee" - 3672 + 31979… "I" - 508 + 46904… "love" - 949 + 37020… diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade index f09352c79..0665f6060 100644 --- a/website/docs/api/stringstore.jade +++ b/website/docs/api/stringstore.jade @@ -2,14 +2,16 @@ include ../../_includes/_mixins -p Map strings to and from integer IDs. 
+p + | Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values + | instead of integer IDs. This ensures that strings always map to the + | same hash, even across different #[code StringStore] instances. +h(2, "init") StringStore.__init__ +tag method p - | Create the #[code StringStore]. Note that a newly initialised store will - | always include an empty string #[code ''] at position #[code 0]. + | Create the #[code StringStore]. +aside-code("Example"). from spacy.strings import StringStore @@ -44,17 +46,18 @@ p Get the number of strings in the store. +h(2, "getitem") StringStore.__getitem__ +tag method -p Retrieve a string from a given integer ID, or vice versa. +p Retrieve a string from a given hash, or vice versa. +aside-code("Example"). stringstore = StringStore([u'apple', u'orange']) - int_id = stringstore[u'apple'] # 1 - assert stringstore[int_id] == u'apple' + apple_hash = stringstore[u'apple'] + assert apple_hash == 8566208034543834098L + assert stringstore[apple_hash] == u'apple' +table(["Name", "Type", "Description"]) +row +cell #[code string_or_id] - +cell bytes, unicode or int + +cell bytes, unicode or uint64 +cell The value to encode. +footrow @@ -94,7 +97,7 @@ p +aside-code("Example"). stringstore = StringStore([u'apple', u'orange']) all_strings = [s for s in stringstore] - assert all_strings == [u'', u'apple', u'orange'] + assert all_strings == [u'apple', u'orange'] +table(["Name", "Type", "Description"]) +footrow @@ -102,6 +105,30 @@ p +cell unicode +cell A string in the store. ++h(2, "add") StringStore.add + +tag method + +tag-new(2) + +p Add a string to the #[code StringStore]. + ++aside-code("Example"). + stringstore = StringStore([u'apple', u'orange']) + stringstore.add(u'banana') + assert len(stringstore) == 3 + assert stringstore[u'banana'] == 2525716904149915114L + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to add. 
+ + +footrow + +cell returns + +cell uint64 + +cell The string's hash value. + + +h(2, "to_disk") StringStore.to_disk +tag method +tag-new(2) diff --git a/website/docs/usage/_spacy-101/_vocab.jade b/website/docs/usage/_spacy-101/_vocab.jade index dd300b5b9..45a16af80 100644 --- a/website/docs/usage/_spacy-101/_vocab.jade +++ b/website/docs/usage/_spacy-101/_vocab.jade @@ -4,10 +4,10 @@ p | Whenever possible, spaCy tries to store data in a vocabulary, the | #[+api("vocab") #[code Vocab]], that will be | #[strong shared by multiple documents]. To save memory, spaCy also - | encodes all strings to #[strong integer IDs] – in this case for example, - | "coffee" has the ID #[code 3672]. Entity labels like "ORG" and - | part-of-speech tags like "VERB" are also encoded. Internally, spaCy - | only "speaks" in integer IDs. + | encodes all strings to #[strong hash values] – in this case for example, + | "coffee" has the hash #[code 3197928453018144401L]. Entity labels like + | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally, + | spaCy only "speaks" in hash values. +aside | #[strong Token]: A word, punctuation mark etc. #[em in context], including @@ -16,8 +16,8 @@ p | and flags, e.g. if it's lowercase, a digit or punctuation.#[br] | #[strong Doc]: A processed container of tokens in context.#[br] | #[strong Vocab]: The collection of lexemes.#[br] - | #[strong StringStore]: The dictionary mapping integer IDs to strings, for - | example #[code 3672] → "coffee". + | #[strong StringStore]: The dictionary mapping hash values to strings, for + | example #[code 3197928453018144401L] → "coffee". +image include ../../../assets/img/docs/vocab_stringstore.svg @@ -27,26 +27,26 @@ p p | If you process lots of documents containing the word "coffee" in all | kinds of different contexts, storing the exact string "coffee" every time - | would take up way too much space. So instead, spaCy assigns it an ID + | would take up way too much space. 
So instead, spaCy hashes the string | and stores it in the #[+api("stringstore") #[code StringStore]]. You can | think of the #[code StringStore] as a | #[strong lookup table that works in both directions] – you can look up a - | string to get its ID, or an ID to get its string: + | string to get its hash, or a hash to get its string: +code. doc = nlp(u'I like coffee') - assert doc.vocab.strings[u'coffee'] == 3572 - assert doc.vocab.strings[3572] == u'coffee' + assert doc.vocab.strings[u'coffee'] == 3197928453018144401L + assert doc.vocab.strings[3197928453018144401L] == u'coffee' p | Now that all strings are encoded, the entries in the vocabulary | #[strong don't need to include the word text] themselves. Instead, - | they can look it up in the #[code StringStore] via its integer ID. Each + | they can look it up in the #[code StringStore] via its hash value. Each | entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]], | contains the #[strong context-independent] information about a word. | For example, no matter if "love" is used as a verb or a noun in some | context, its spelling and whether it consists of alphabetic characters - | won't ever change. + | won't ever change. Its hash value will also always be the same. +code. for word in doc: @@ -56,39 +56,54 @@ p +aside | #[strong Text]: The original text of the lexeme.#[br] - | #[strong Orth]: The integer ID of the lexeme.#[br] + | #[strong Orth]: The hash value of the lexeme.#[br] | #[strong Shape]: The abstract word shape of the lexeme.#[br] | #[strong Prefix]: By default, the first letter of the word string.#[br] | #[strong Suffix]: By default, the last three letters of the word string.#[br] | #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br] | #[strong is digit]: Does the lexeme consist of digits?#[br] - | #[strong is title]: Does the lexeme consist of alphabetic characters?#[br] - | #[strong Lang]: The language of the parent vocabulary. 
-+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"]) - - var style = [0, 1, 1, 0, 0, 1, 1, 1, 0] - +annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style) - +annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style) - +annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style) ++table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"]) + - var style = [0, 1, 1, 0, 0, 1, 1] + +annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style) + +annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style) + +annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style) p - | The specific entries in the voabulary and their IDs don't really matter – - | #[strong as long as they match]. That's why you always need to make sure - | all objects you create have access to the same vocabulary. If they don't, - | the IDs won't match and spaCy will either produce very confusing results, - | or fail alltogether. + | The mapping of words to hashes doesn't depend on any state. To make sure + | each value is unique, spaCy uses a + | #[+a("https://en.wikipedia.org/wiki/Hash_function") hash function] to + | calculate the hash #[strong based on the word string]. This also means + | that the hash for "coffee" will always be the same, no matter which model + | you're using or how you've configured spaCy. + +p + | However, hashes #[strong cannot be reversed] and there's no way to + | resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do + | is look it up in the vocabulary. That's why you always need to make + | sure all objects you create have access to the same vocabulary. If they + | don't, spaCy might not be able to find the strings it needs. +code. 
from spacy.tokens import Doc from spacy.vocab import Vocab doc = nlp(u'I like coffee') # original Doc - new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab - assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc - assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc + assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash + assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 + + empty_doc = Doc(Vocab()) # new Doc with empty Vocab + # empty_doc.vocab.strings[3197928453018144401L] will raise an error :( + + empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash + assert empty_doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 + + new_doc = Doc(doc.vocab) # create new doc with first doc's vocab + assert new_doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍 p - | Even though both #[code Doc] objects contain the same words, the internal - | integer IDs are very different. The same applies for all other strings, - | like the annotation scheme. To avoid mismatched IDs, spaCy will always - | export the vocab if you save a #[code Doc] or #[code nlp] object. + | If the doc's vocabulary doesn't contain a hash for "coffee", spaCy will + | throw an error. So you either need to add it manually, or initialise the + | new #[code Doc] with the shared vocab. To prevent this problem, spaCy + | will usually export the vocab when you save a #[code Doc] or #[code nlp] + | object. diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 107e7210f..a87e763a6 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -68,13 +68,19 @@ p | #[strong API:] #[+api("token") #[code Token]] | #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging] -+h(2, "examples-integer-ids") Use integer IDs for any string ++h(2, "examples-hashes") Use hash values for any string +code. 
- hello_id = nlp.vocab.strings['Hello'] - hello_str = nlp.vocab.strings[hello_id] - assert token.text == hello_id == 3125 - assert token.text == hello_str == 'Hello' + doc = nlp(u'I love coffee') + coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L + coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee' + + assert doc[2].orth == coffee_hash == 3197928453018144401L + assert doc[2].text == coffee_text == u'coffee' + + doc.vocab.strings.add(u'beer') + beer_hash = doc.vocab.strings[u'beer'] # 3073001599257881079L + beer_text = doc.vocab.strings[beer_hash] # 'beer' +h(2, "examples-entities") Recongnise and update named entities +tag-model("NER") diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index db827c414..afdf50efb 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -50,6 +50,28 @@ p | #[strong API:] #[+api("language") #[code Language]] | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] ++h(3, "features-hash-ids") Hash values instead of integer IDs + ++aside-code("Example"). + doc = nlp(u'I love coffee') + assert doc.vocab.strings[u'coffee'] == 3197928453018144401L + assert doc.vocab.strings[3197928453018144401L] == u'coffee' + + doc.vocab.strings.add(u'beer') + assert doc.vocab.strings[u'beer'] == 3073001599257881079L + +p + | The #[+api("stringstore") #[code StringStore]] now resolves all strings + | to hash values instead of integer IDs. This means that the string-to-int + | mapping #[strong no longer depends on the vocabulary state], making a lot + | of workflows much simpler, especially during training. Unlike integer IDs + | in spaCy v1.x, hash values will #[strong always match] – even across + | models. Strings can now be added explicitly using the new #[+api("stringstore#add") #[code StringStore.add]] method. 
+ ++infobox + | #[strong API:] #[+api("stringstore") #[code StringStore]] + | #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101] + +h(3, "features-serializer") Saving, loading and serialization +aside-code("Example"). @@ -307,6 +329,17 @@ p nlp.save_to_directory('/model') nlp.vocab.dump('/vocab') ++h(3, "migrating-strings") Strings and hash values + ++code-new. + nlp.vocab.strings.add(u'coffee') + nlp.vocab.strings[u'coffee'] # 3197928453018144401L + other_nlp.vocab.strings[u'coffee'] # 3197928453018144401L + ++code-old. + nlp.vocab.strings[u'coffee'] # 3672 + other_nlp.vocab.strings[u'coffee'] # 40259 + +h(3, "migrating-languages") Processing pipelines and language data p diff --git a/website/index.jade b/website/index.jade index 17b564b42..b4e987cfb 100644 --- a/website/index.jade +++ b/website/index.jade @@ -97,7 +97,7 @@ include _includes/_mixins +item Part-of-speech tagging +item #[strong Named entity] recognition +item Labelled dependency parsing - +item Convenient string-to-int mapping + +item Convenient string-to-hash mapping +item Export to numpy data arrays +item GIL-free #[strong multi-threading] +item Efficient binary serialization