diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index b1b707c6a..e255dbb48 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -74,9 +74,9 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string
-
+
cdef class StringStore:
- """Lookup strings by 64-bit hash"""
+ """Look up strings by 64-bit hashes."""
def __init__(self, strings=None, freeze=False):
"""Create the StringStore.
@@ -92,9 +92,9 @@ cdef class StringStore:
self.add(string)
def __getitem__(self, object string_or_id):
- """Retrieve a string from a given hash ID, or vice versa.
+ """Retrieve a string from a given hash, or vice versa.
- string_or_id (bytes or unicode or uint64): The value to encode.
+ string_or_id (bytes, unicode or uint64): The value to encode.
Returns (unicode or uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
@@ -123,6 +123,11 @@ cdef class StringStore:
return decode_Utf8Str(utf8str)
def add(self, string):
+ """Add a string to the StringStore.
+
+ string (unicode): The string to add.
+ RETURNS (uint64): The string's hash value.
+ """
if isinstance(string, unicode):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
diff --git a/website/assets/img/docs/vocab_stringstore.svg b/website/assets/img/docs/vocab_stringstore.svg
index 644453737..119175247 100644
--- a/website/assets/img/docs/vocab_stringstore.svg
+++ b/website/assets/img/docs/vocab_stringstore.svg
@@ -7,30 +7,30 @@
- 3572
+    31979…
    Lexeme
-    508
+    46904…
    Lexeme
-    949
+    37020…
Lexeme
"coffee"
- 3672
+ 31979…
"I"
- 508
+ 46904…
"love"
- 949
+ 37020…
diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade
index f09352c79..0665f6060 100644
--- a/website/docs/api/stringstore.jade
+++ b/website/docs/api/stringstore.jade
@@ -2,14 +2,16 @@
include ../../_includes/_mixins
-p Map strings to and from integer IDs.
+p
+ | Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values
+ | instead of integer IDs. This ensures that strings always map to the
+ | same ID, even from different #[code StringStores].
+h(2, "init") StringStore.__init__
+tag method
p
- | Create the #[code StringStore]. Note that a newly initialised store will
- | always include an empty string #[code ''] at position #[code 0].
+ | Create the #[code StringStore].
+aside-code("Example").
from spacy.strings import StringStore
@@ -44,17 +46,18 @@ p Get the number of strings in the store.
+h(2, "getitem") StringStore.__getitem__
+tag method
-p Retrieve a string from a given integer ID, or vice versa.
+p Retrieve a string from a given hash, or vice versa.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
- int_id = stringstore[u'apple'] # 1
- assert stringstore[int_id] == u'apple'
+ apple_hash = stringstore[u'apple']
+ assert apple_hash == 8566208034543834098L
+ assert stringstore[apple_hash] == u'apple'
+table(["Name", "Type", "Description"])
+row
+cell #[code string_or_id]
- +cell bytes, unicode or int
+ +cell bytes, unicode or uint64
+cell The value to encode.
+footrow
@@ -94,7 +97,7 @@ p
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
all_strings = [s for s in stringstore]
- assert all_strings == [u'', u'apple', u'orange']
+ assert all_strings == [u'apple', u'orange']
+table(["Name", "Type", "Description"])
+footrow
@@ -102,6 +105,30 @@ p
+cell unicode
+cell A string in the store.
++h(2, "add") StringStore.add
+ +tag method
+ +tag-new(2)
+
+p Add a string to the #[code StringStore].
+
++aside-code("Example").
+ stringstore = StringStore([u'apple', u'orange'])
+ stringstore.add(u'banana')
+ assert len(stringstore) == 3
+ assert stringstore[u'banana'] == 2525716904149915114L
+
++table(["Name", "Type", "Description"])
+ +row
+ +cell #[code string]
+ +cell unicode
+ +cell The string to add.
+
+ +footrow
+ +cell returns
+ +cell uint64
+ +cell The string's hash value.
+
+
+h(2, "to_disk") StringStore.to_disk
+tag method
+tag-new(2)
diff --git a/website/docs/usage/_spacy-101/_vocab.jade b/website/docs/usage/_spacy-101/_vocab.jade
index dd300b5b9..45a16af80 100644
--- a/website/docs/usage/_spacy-101/_vocab.jade
+++ b/website/docs/usage/_spacy-101/_vocab.jade
@@ -4,10 +4,10 @@ p
| Whenever possible, spaCy tries to store data in a vocabulary, the
| #[+api("vocab") #[code Vocab]], that will be
| #[strong shared by multiple documents]. To save memory, spaCy also
- | encodes all strings to #[strong integer IDs] – in this case for example,
- | "coffee" has the ID #[code 3672]. Entity labels like "ORG" and
- | part-of-speech tags like "VERB" are also encoded. Internally, spaCy
- | only "speaks" in integer IDs.
+ | encodes all strings to #[strong hash values] – in this case for example,
+ | "coffee" has the hash #[code 3197928453018144401L]. Entity labels like
+ | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally,
+ | spaCy only "speaks" in hash values.
+aside
| #[strong Token]: A word, punctuation mark etc. #[em in context], including
@@ -16,8 +16,8 @@ p
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
| #[strong Doc]: A processed container of tokens in context.#[br]
| #[strong Vocab]: The collection of lexemes.#[br]
- | #[strong StringStore]: The dictionary mapping integer IDs to strings, for
- | example #[code 3672] → "coffee".
+ | #[strong StringStore]: The dictionary mapping hash values to strings, for
+ | example #[code 3197928453018144401L] → "coffee".
+image
include ../../../assets/img/docs/vocab_stringstore.svg
@@ -27,26 +27,26 @@ p
p
| If you process lots of documents containing the word "coffee" in all
| kinds of different contexts, storing the exact string "coffee" every time
- | would take up way too much space. So instead, spaCy assigns it an ID
+ | would take up way too much space. So instead, spaCy hashes the string
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can
| think of the #[code StringStore] as a
| #[strong lookup table that works in both directions] – you can look up a
- | string to get its ID, or an ID to get its string:
+ | string to get its hash, or a hash to get its string:
+code.
doc = nlp(u'I like coffee')
- assert doc.vocab.strings[u'coffee'] == 3572
- assert doc.vocab.strings[3572] == u'coffee'
+ assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
+ assert doc.vocab.strings[3197928453018144401L] == u'coffee'
p
| Now that all strings are encoded, the entries in the vocabulary
| #[strong don't need to include the word text] themselves. Instead,
- | they can look it up in the #[code StringStore] via its integer ID. Each
+ | they can look it up in the #[code StringStore] via its hash value. Each
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
| contains the #[strong context-independent] information about a word.
| For example, no matter if "love" is used as a verb or a noun in some
| context, its spelling and whether it consists of alphabetic characters
- | won't ever change.
+ | won't ever change. Its hash value will also always be the same.
+code.
for word in doc:
@@ -56,39 +56,54 @@ p
+aside
| #[strong Text]: The original text of the lexeme.#[br]
- | #[strong Orth]: The integer ID of the lexeme.#[br]
+ | #[strong Orth]: The hash value of the lexeme.#[br]
| #[strong Shape]: The abstract word shape of the lexeme.#[br]
| #[strong Prefix]: By default, the first letter of the word string.#[br]
| #[strong Suffix]: By default, the last three letters of the word string.#[br]
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong is digit]: Does the lexeme consist of digits?#[br]
- | #[strong is title]: Does the lexeme consist of alphabetic characters?#[br]
- | #[strong Lang]: The language of the parent vocabulary.
-+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
- - var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
- +annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
- +annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
- +annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style)
++table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"])
+ - var style = [0, 1, 1, 0, 0, 1, 1]
+ +annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style)
+ +annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style)
+ +annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style)
p
- | The specific entries in the voabulary and their IDs don't really matter –
- | #[strong as long as they match]. That's why you always need to make sure
- | all objects you create have access to the same vocabulary. If they don't,
- | the IDs won't match and spaCy will either produce very confusing results,
- | or fail alltogether.
+ | The mapping of words to hashes doesn't depend on any state. To make sure
+ | each value is unique, spaCy uses a
+ | #[+a("https://en.wikipedia.org/wiki/Hash_function") hash function] to
+ | calculate the hash #[strong based on the word string]. This also means
+ | that the hash for "coffee" will always be the same, no matter which model
+ | you're using or how you've configured spaCy.
+
+p
+ | However, hashes #[strong cannot be reversed] and there's no way to
+ | resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do
+ | is look it up in the vocabulary. That's why you always need to make
+ | sure all objects you create have access to the same vocabulary. If they
+ | don't, spaCy might not be able to find the strings it needs.
+code.
from spacy.tokens import Doc
from spacy.vocab import Vocab
doc = nlp(u'I like coffee') # original Doc
- new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
- assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
- assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
+ assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash
+ assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+
+    empty_doc = Doc(Vocab()) # new Doc with empty Vocab
+    # empty_doc.vocab.strings[3197928453018144401L] will raise an error :(
+
+    empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash
+    assert empty_doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
+
+    new_doc = Doc(doc.vocab) # create new doc with first doc's vocab
+    assert new_doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
p
- | Even though both #[code Doc] objects contain the same words, the internal
- | integer IDs are very different. The same applies for all other strings,
- | like the annotation scheme. To avoid mismatched IDs, spaCy will always
- | export the vocab if you save a #[code Doc] or #[code nlp] object.
+ | If the doc's vocabulary doesn't contain a hash for "coffee", spaCy will
+ | throw an error. So you either need to add it manually, or initialise the
+ | new #[code Doc] with the shared vocab. To prevent this problem, spaCy
+    | will usually export the vocab when you save a #[code Doc] or #[code nlp]
+ | object.
diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade
index 107e7210f..a87e763a6 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@@ -68,13 +68,19 @@ p
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
-+h(2, "examples-integer-ids") Use integer IDs for any string
++h(2, "examples-hashes") Use hash values for any string
+code.
- hello_id = nlp.vocab.strings['Hello']
- hello_str = nlp.vocab.strings[hello_id]
- assert token.text == hello_id == 3125
- assert token.text == hello_str == 'Hello'
+ doc = nlp(u'I love coffee')
+ coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L
+ coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
+
+ assert doc[2].orth == coffee_hash == 3197928453018144401L
+ assert doc[2].text == coffee_text == u'coffee'
+
+ doc.vocab.strings.add(u'beer')
+ beer_hash = doc.vocab.strings[u'beer'] # 3073001599257881079L
+ beer_text = doc.vocab.strings[beer_hash] # 'beer'
+h(2, "examples-entities") Recongnise and update named entities
+tag-model("NER")
diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade
index db827c414..afdf50efb 100644
--- a/website/docs/usage/v2.jade
+++ b/website/docs/usage/v2.jade
@@ -50,6 +50,28 @@ p
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
++h(3, "features-hash-ids") Hash values instead of integer IDs
+
++aside-code("Example").
+ doc = nlp(u'I love coffee')
+ assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
+ assert doc.vocab.strings[3197928453018144401L] == u'coffee'
+
+ doc.vocab.strings.add(u'beer')
+ assert doc.vocab.strings[u'beer'] == 3073001599257881079L
+
+p
+ | The #[+api("stringstore") #[code StringStore]] now resolves all strings
+ | to hash values instead of integer IDs. This means that the string-to-int
+ | mapping #[strong no longer depends on the vocabulary state], making a lot
+ | of workflows much simpler, especially during training. Unlike integer IDs
+ | in spaCy v1.x, hash values will #[strong always match] – even across
+    | models. Strings can now be added explicitly using the new #[+api("stringstore#add") #[code StringStore.add]] method.
+
++infobox
+ | #[strong API:] #[+api("stringstore") #[code StringStore]]
+ | #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+
+h(3, "features-serializer") Saving, loading and serialization
+aside-code("Example").
@@ -307,6 +329,17 @@ p
nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab')
++h(3, "migrating-strings") Strings and hash values
+
++code-new.
+ nlp.vocab.strings.add(u'coffee')
+ nlp.vocab.strings[u'coffee'] # 3197928453018144401L
+ other_nlp.vocab.strings[u'coffee'] # 3197928453018144401L
+
++code-old.
+ nlp.vocab.strings[u'coffee'] # 3672
+ other_nlp.vocab.strings[u'coffee'] # 40259
+
+h(3, "migrating-languages") Processing pipelines and language data
p
diff --git a/website/index.jade b/website/index.jade
index 17b564b42..b4e987cfb 100644
--- a/website/index.jade
+++ b/website/index.jade
@@ -97,7 +97,7 @@ include _includes/_mixins
+item Part-of-speech tagging
+item #[strong Named entity] recognition
+item Labelled dependency parsing
- +item Convenient string-to-int mapping
+ +item Convenient string-to-hash mapping
+item Export to numpy data arrays
+item GIL-free #[strong multi-threading]
+item Efficient binary serialization