From c767ab9fdfdeaa513040fbf8c16d454ee2dacf54 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 20:21:26 +0200 Subject: [PATCH] * Work on documentation. Have overall structure now --- docs/redesign/spacy_docs.jade | 976 +++++++++++++++++++--------------- 1 file changed, 559 insertions(+), 417 deletions(-) diff --git a/docs/redesign/spacy_docs.jade b/docs/redesign/spacy_docs.jade index 5b64dd0da..29f0512e7 100644 --- a/docs/redesign/spacy_docs.jade +++ b/docs/redesign/spacy_docs.jade @@ -1,17 +1,19 @@ -- var unicode_type = 'unicode' -- var bool_type = 'bool' - -- var int_type = "" - -- var Token_type = "" -- var Span_type = "" -- var Vocab_type = "" -- var generator_type = "" +- var py_docs = 'unicode', + 'bool': py_docs + 'functions.html#bool">bool', + 'int': py_docs + 'functions.html#int">int', + 'generator': "", + 'Vocab': "", + 'Span': "", + 'Doc': "" + } mixin declare_class(name) - details(open="true") + details summary span.declaration span.label class @@ -62,14 +64,54 @@ mixin returns(name, type, value) mixin returns(type) | tmp +mixin init + details + summary: h4 Init + block + + +mixin callable + details + summary: h4 Callable + + block + + +mixin sequence + details + summary: h4 Sequence + + block + + +mixin maptype + details + summary: h4 Map + + block + + +mixin summary + block + +mixin en_example + pre.language-python + code + | from spacy.en import English + | from spacy._doc_examples import download_war_and_peace + | + | unprocessed_unicode = download_war_and_peace() + | + | nlp = English() + | doc = nlp(unprocessed_unicode) doctype html html(lang="en") head meta(charset="utf-8") - title!= tag_line + title spaCy – Industrial-strength NLP meta(name="description" content="") meta(name="author" content="Matthew Honnibal") link(rel="stylesheet" href="css/style.css") @@ -78,9 +120,9 @@ html(lang="en") body(id="docs") - header - h1.logo!= tag_line - div.slogan!= slogan + header(role="banner") + h1.logo spaCy – Industrial-strength NLP + 
div.slogan API nav(role="navigation") @@ -91,473 +133,573 @@ html(lang="en") li: a(href="#") Blog main.docs#content - section.intro - | Tmp article - h3: a(href="#") Header + +declare_class("English") + p Load models into a callable object to process English text. - +declare_class("spacy.en.English") - +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True") + +summary + +en_example - +params - +param("data_dir") - | The data directory. May be #{None}, to disable any data loading - | (including the vocabulary). + +init + p + | Load the resources. Loading takes 20 seconds, and the instance + | consumes 2 to 3 gigabytes of memory. + + p + | Intended use is for one instance to be created per process. + | You can create more if you're doing something unusual. + p + | You may wish to make the instance a global variable or "singleton". + | We usually instantiate the object in the main() + | function and pass it around as an explicit argument. + +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") - +param("Tokenizer") - | A class/function that creates the tokenizer. + +params + +param("data_dir") + | The data directory. May be #{None}, to disable any data loading + | (including the vocabulary). - +param("Tagger") - | A class/function that creates the part-of-speech tagger. + +param("Tokenizer") + | A class/function that creates the tokenizer. - +param("Parser") - | A class/function that creates the dependency parser. + +param("Tagger") + | A class/function that creates the part-of-speech tagger. - +param("Entity") - | A class/function that creates the named entity recogniser. + +param("Parser") + | A class/function that creates the dependency parser. - +param("load_vectors") - | A boolean value to control whether the word vectors are loaded. + +param("Entity") + | A class/function that creates the named entity recogniser. 
+ +param("load_vectors") + | A boolean value to control whether the word vectors are loaded. + + +callable + +method("__call__", "text, tag=True, parse=True, entity=True") - +method("__call__", "text, tag=True, parse=True, entity=True")(open) + +params + +param("text", types.unicode) + | The text to be processed. No pre-processing needs to be applied, + | and any length of text can be submitted. Usually you will submit + | a whole document. Text may be zero-length. An exception is raised + | if byte strings are supplied. - +params - +param("text", unicode_type) - | The text to be processed. No pre-processing needs to be applied, - | and any length of text can be submitted. Usually you will submit - | a whole document. Text may be zero-length. An exception is raised - | if byte strings are supplied. + +param("tag", bool_type) + | Whether to apply the part-of-speech tagger. Required for parsing + | and entity recognition. - +param("tag", bool_type) - | Whether to apply the part-of-speech tagger. Required for parsing - | and entity recognition. + +param("parse", bool_type) + | Whether to apply the syntactic dependency parser. - +param("parse", bool_type) - | Whether to apply the syntactic dependency parser. + +param("entity", bool_type) + | Whether to apply the named entity recognizer. - +param("entity", bool_type) - | Whether to apply the named entity recognizer. + pre.language-python + code + | from spacy.en import English + | nlp = English() + | doc = nlp(u'Some text.') # Applies tagger, parser, entity + | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser + | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity + | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser + | doc = nlp(u'') # Zero-length tokens, not an error + | # doc = nlp(b'Some text') <-- Error: need unicode + | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first. 
+ + +declare_class("Doc") + p I'm a doc + + +init + +method("__init__", "vocab") + +params + +param("vocab", vocab_type) + | A vocabulary object + + +sequence + +method("__getitem__", "i", types.int) + +returns(types.Token) + + +method("__getitem__", "start_end", types.slice) + +returns(types.Span) + + +method("__iter__") + | Iterate over tokens + + +method("__len__") + | Number of tokens in the document. + + details + summary: h4 Spans + + +attribute("sents", types.generator) + | Iterate over sentences in the document. + + +attribute("ents", types.generator) + | Iterate over named entities in the document. + + +attribute("noun_chunks", types.generator) + + details + summary: h4 Export/Import + + +method("to_array", "attr_ids") + + | Given a list of M attribute IDs, export the tokens to a numpy ndarray + | of shape N*M, where N is the length of the sentence. + + +params + +param("attr_ids", "list[int]") + | A list of attribute ID ints. + + +returns("feat_array") + | A feature matrix, with one row per word, and one column per attribute + | indicated in the input attr_ids. + + +method("count_by", "attr_id") + | Produce a dict of {attribute (int): count (ints)} frequencies, keyed + | by the values of the given attribute ID. + pre.language-python code - | from spacy.en import English - | nlp = English() - | doc = nlp(u'Some text.) # Applies tagger, parser, entity - | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser - | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity - | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser - | doc = nlp(u'') # Zero-length tokens, not an error - | # doc = nlp(b'Some text') <-- Error: need unicode - | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. 
- - +declare_class("spacy.tokens.doc.Doc") - +method("__init__", "vocab") - +params - +param("vocab", vocab_type) - | A vocabulary object - - +method("__getitem__", "i", int_type) - +returns(Token_type) - - +method("__getitem__", "start_end", slice_type) - +returns(Span_type) - - +method("__iter__") - | Iterate over tokens - - +method("__len__") - | Number of tokens in the document. - - +attribute("sents", generator_type) - | Iterate over sentences in the document. - - +attribute("ents", generator_type) - | Iterate over named entities in the document. + | >>> from spacy.en import English, attrs + | >>> nlp = English() + | >>> tokens = nlp(u'apple apple orange banana') + | >>> tokens.count_by(attrs.ORTH) + | {12800L: 1, 11880L: 2, 7561L: 1} + | >>> tokens.to_array([attrs.ORTH]) + | array([[11880], + | [11880], + | [7561], + | [12800]]) - +attribute("noun_chunks", generator_type) - - +method("to_array", "attr_ids") - - | Given a list of M attribute IDs, export the tokens to a numpy ndarray - | of shape N*M, where N is the length of the sentence. - - +params - - +param("attr_ids", "list[int]") - | A list of attribute ID ints. - - +returns("feat_array") - | A feature matrix, with one row per word, and one column per attribute - | indicated in the input attr_ids. - - +method("count_by", "attr_id") - - | Produce a dict of {attribute (int): count (ints)} frequencies, keyed - | by the values of the given attribute ID. 
+ +method("from_array", "attrs, array") + | Load from array - pre.language-python - code - | >>> from spacy.en import English, attrs - | >>> nlp = English() - | >>> tokens = nlp(u'apple apple orange banana') - | >>> tokens.count_by(attrs.ORTH) - | {12800L: 1, 11880L: 2, 7561L: 1} - | >>> tokens.to_array([attrs.ORTH]) - | array([[11880], - | [11880], - | [7561], - | [12800]]) - - +method("from_array", "attrs, array") - | Load from array - - +method("to_bytes") - | Serialize - - +method("from_bytes") - | Deserialize, loading from bytes - - +method("read_bytes") - | classmethod - - +method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") - - | Merge a multi-word expression into a single token. Currently - | experimental; API is likely to change. - - - +declare_class("spacy.tokens.Token") - +method("__init__", "vocab, doc, offset") - +params - +param("vocab", Vocab_type) - p A Vocab object - - +param("doc", Doc_type) - p The parent sequence - - +param("offset", Int_type) - p The index of the token within the document - - details - summary: h4 String Views - - +attribute("orth / orth_") - | The form of the word with no string normalization or processing, as - | it appears in the string, without trailing whitespace. - - +attribute("lemma / lemma_") - | The "base" of the word, with no inflectional suffixes, e.g. the lemma of - | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that - | derivational suffixes are not stripped, e.g. the lemma of - | "instutitions" is "institution", not "institute". Lemmatization is - | performed using the WordNet data, but extended to also cover closed-class - | words such as pronouns. By default, the WN lemmatizer returns "hi" - | as the lemma of "his". We assign pronouns the lemma -PRON-. - - +attribute("lower / lower_") - | The form of the word, but forced to lower-case, i.e. 
- pre.language-python: code lower = word.orth\_.lower() - - //+attribute("norm / norm_") - // | The form of the word, after language-specific normalizations has been - // | applied. - - +attribute("shape / shape_") - | A transform of the word's string, to show orthographic features. - | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped - | to d. After these mappings, sequences of 4 or more of the same character - | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, - | :) --> :) - - +attribute("prefix / prefix_") - | A length-N substring from the start of the word. Length may vary by - | language; currently for English n=1, i.e. - pre.language-python: code prefix = word.orth\_[:1] - - +attribute("suffix / suffix_") - | A length-N substring from the end of the word. Length may vary by - | language; currently for English n=3, i.e. - pre.language-python: code suffix = word.orth\_[-3:] - - //+attribute("lex_id") - // | lex_id - - details - summary: h4 Alignment and Output - - +attribute("idx") - p Start index of the token in the string - - +method("__len__", "") - p Length of the token's orth string, in unicode code-points. - - +method("__unicode__", "") - p Same as token.orth_ - - +method("__str__", "") - p Varies between Python 2 and Python 3 - - +attribute("string") - p - | The form of the word as it appears in the string, including - | trailing whitespace. This is useful when you need to use - | linguistic features to add inline mark-up to the string. - - +method("nbor, i=1") - +params - +param("i") - p Offset relative to token + +method("from_bytes") + | Deserialize, loading from bytes - details - summary: h4 Distributional Features + +method("read_bytes") + | classmethod - +attribute("repvec") - p - | A "word embedding" representation: a dense real-valued vector that supports - | similarity queries between words. 
By default, spaCy currently loads - | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec - | model. + //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") - +attribute("cluster") - p - | The Brown cluster ID of the word. These are often useful features for - | linear models. If you're using a non-linear model, particularly a - | neural net or random forest, consider using the real-valued word - | representation vector, in Token.repvec, instead. + // | Merge a multi-word expression into a single token. Currently + // | experimental; API is likely to change. - +attribute("prob") - p - | The unigram log-probability of the word, estimated from counts from a - | large corpus, smoothed using Simple Good Turing estimation. - details - summary: h4 Syntactic Tags + +declare_class("Token") + +init + +method("__init__", "vocab, doc, offset") + +params + +param("vocab", types.Vocab) + p A Vocab object - +attribute("pos / pos_") - | A part-of-speech tag, from the Google Universal Tag Set, e.g. - | code>NOUN, VERB, ADV. Constants for - | the 17 tag values are provided in spacy.parts_of_speech. + +param("doc", types.Doc) + p The parent sequence - +attribute("tag / tag_") - | A morphosyntactic tag, e.g. NN, VBZ, - | DT, etc. These tags are language/corpus specific, and - | typically describe part-of-speech and some amount of morphological - | information. For instance, in the Penn Treebank tag set, VBZ - | is assigned to a present-tense singular verb. + +param("offset", types.int) + p The index of the token within the document - +attribute("dep / dep_") - | The type of syntactic dependency relation between the word and its - | syntactic head. + details + summary: h4 String Views - details - summary: h4 Navigating the Parse Tree - - +attribute("head") - p - | The Token that is the immediate syntactic head of the word. If the - | word is the root of the dependency tree, the same word is returned. 
+ +attribute("orth / orth_") + | The form of the word with no string normalization or processing, as + | it appears in the string, without trailing whitespace. - +attribute("lefts") - p - | An iterator for the immediate leftward syntactic children of the - | word. + +attribute("lemma / lemma_") + | The "base" of the word, with no inflectional suffixes, e.g. the lemma of + | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that + | derivational suffixes are not stripped, e.g. the lemma of + | "institutions" is "institution", not "institute". Lemmatization is + | performed using the WordNet data, but extended to also cover closed-class + | words such as pronouns. By default, the WN lemmatizer returns "hi" + | as the lemma of "his". We assign pronouns the lemma -PRON-. - +attribute("rights") - p - | An iterator for the immediate rightward syntactic children of the - | word. + +attribute("lower / lower_") + | The form of the word, but forced to lower-case, i.e. + pre.language-python: code lower = word.orth\_.lower() - +attribute("n_lefts") - p - | The number of immediate syntactic children preceding the word in - | the string. + //+attribute("norm / norm_") + // | The form of the word, after language-specific normalizations have been + // | applied. - +attribute("n_rights") - p - | The number of immediate syntactic children following the word in - | the string. + +attribute("shape / shape_") + | A transform of the word's string, to show orthographic features. + | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped + | to d. After these mappings, sequences of 4 or more of the same character + | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, + | :) --> :) - +attribute("children") - p - | An iterator that yields from lefts, and then yields from rights. + +attribute("prefix / prefix_") + | A length-N substring from the start of the word. Length may vary by + | language; currently for English n=1, i.e. 
+ pre.language-python: code prefix = word.orth\_[:1] - +attribute("subtree") - p - | An iterator for the part of the sentence syntactically governed by - | the word, including the word itself. + +attribute("suffix / suffix_") + | A length-N substring from the end of the word. Length may vary by + | language; currently for English n=3, i.e. + pre.language-python: code suffix = word.orth\_[-3:] - +attribute("left_edge") - p The leftmost edge of the token's subtree + //+attribute("lex_id") + // | lex_id - +attribute("right_edge") - p The rightmost edge of the token's subtree + details + summary: h4 Alignment and Output - details - summary: h4 Named Entities + +attribute("idx") + p Start index of the token in the string - +attribute("ent_type") - p If the token is part of an entity, its entity type. + +method("__len__", "") + p Length of the token's orth string, in unicode code-points. - +attribute("ent_iob") - p The IOB (inside, outside, begin) entity recognition tag for the token. + +method("__unicode__", "") + p Same as token.orth_ - details - summary: h4 Lexeme Flags + +method("__str__", "") + p Varies between Python 2 and Python 3 - +method("check_flag", "flag_id") - +params - +param("flag_id") - | flag ID + +attribute("string") + p + | The form of the word as it appears in the string, including + | trailing whitespace. This is useful when you need to use + | linguistic features to add inline mark-up to the string. - +attribute("is_oov") - +attribute("is_alpha") - +attribute("is_ascii") - +attribute("is_digit") - +attribute("is_lower") - +attribute("is_title") - +attribute("is_punct") - +attribute("is_space") - +attribute("like_url") - +attribute("like_num") - +attribute("like_email") + +method("nbor, i=1") + +params + +param("i") + p Offset relative to token + + details + summary: h4 Distributional Features + + +attribute("repvec") + p + | A "word embedding" representation: a dense real-valued vector that supports + | similarity queries between words. 
By default, spaCy currently loads + | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec + | model. + + +attribute("cluster") + p + | The Brown cluster ID of the word. These are often useful features for + | linear models. If you're using a non-linear model, particularly a + | neural net or random forest, consider using the real-valued word + | representation vector, in Token.repvec, instead. + + +attribute("prob") + p + | The unigram log-probability of the word, estimated from counts from a + | large corpus, smoothed using Simple Good Turing estimation. + + details + summary: h4 Syntactic Tags + + +attribute("pos / pos_") + p + | A part-of-speech tag, from the Google Universal Tag Set, e.g. + | code>NOUN, VERB, ADV. Constants for + | the 17 tag values are provided in spacy.parts_of_speech. + + +attribute("tag / tag_") + p + | A morphosyntactic tag, e.g. NN, VBZ, + | DT, etc. These tags are language/corpus specific, and + | typically describe part-of-speech and some amount of morphological + | information. For instance, in the Penn Treebank tag set, VBZ + | is assigned to a present-tense singular verb. + + +attribute("dep / dep_") + p + | The type of syntactic dependency relation between the word and its + | syntactic head. + + details + summary: h4 Navigating the Parse Tree + + +attribute("head") + p + | The Token that is the immediate syntactic head of the word. If the + | word is the root of the dependency tree, the same word is returned. + + +attribute("lefts") + p + | An iterator for the immediate leftward syntactic children of the + | word. + + +attribute("rights") + p + | An iterator for the immediate rightward syntactic children of the + | word. + + +attribute("n_lefts") + p + | The number of immediate syntactic children preceding the word in + | the string. + + +attribute("n_rights") + p + | The number of immediate syntactic children following the word in + | the string. 
+ + +attribute("children") + p + | An iterator that yields from lefts, and then yields from rights. + + +attribute("subtree") + p + | An iterator for the part of the sentence syntactically governed by + | the word, including the word itself. + + +attribute("left_edge") + p The leftmost edge of the token's subtree + + +attribute("right_edge") + p The rightmost edge of the token's subtree + + details + summary: h4 Named Entities + + +attribute("ent_type") + p If the token is part of an entity, its entity type. + + +attribute("ent_iob") + p The IOB (inside, outside, begin) entity recognition tag for the token. + + details + summary: h4 Lexeme Flags + + +method("check_flag", "flag_id") + +params + +param("flag_id") + | flag ID + + +attribute("is_oov") + +attribute("is_alpha") + +attribute("is_ascii") + +attribute("is_digit") + +attribute("is_lower") + +attribute("is_title") + +attribute("is_punct") + +attribute("is_space") + +attribute("like_url") + +attribute("like_num") + +attribute("like_email") + + //+attribute("conjuncts") + // | Conjuncts + + +declare_class("Span") + +init + +method("__init__") + Temp - //+attribute("conjuncts") - // | Conjuncts + span = doc[0:4] - +declare_class("spacy.tokens.span.Span") - +params - +method("__getitem__") - p Get item - - +method("__iter__") - p Iter - - +method("__len__") - p Len - - +attribute("root") - p Syntactic head - - +attribute("lefts") - p Tokens that are: - ol - li To the left of the span; - li Syntactic children of words within the span - - p i.e. 
- - pre.language-python - code - | lefts = [span.doc[i] for i in range(0, span.start) - | if span.doc[i].head in span] - - +attribute("rights") - p Tokens that are: - ol - li To the right of the span; + +sequence + +method("__getitem__") + p Get item + + +method("__iter__") + p Iter + + +method("__len__") + p Len + + details + summary: h4 Parse + + +attribute("root") + p Syntactic head + + +attribute("lefts") + p Tokens that are: + ol + li To the left of the span; li Syntactic children of words within the span - p i.e. - pre.language-python - code - | rights = [span.doc[i] for i in range(span.end, len(span.doc)) - | if span.doc[i].head in span] - - +attribute("string") - p String - +attribute("lemma / lemma_") - p String - - +attribute("label / label_") - p String - - +attribute("subtree") - p String - - +declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None") - +method("__len__") - +returns - p Number of words in the vocabulary. + p i.e. - +method("__getitem__", "key_int") - +params - +param("key") - p Integer ID + pre.language-python + code + | lefts = [span.doc[i] for i in range(0, span.start) + | if span.doc[i].head in span] - +returns: p A Lexeme object + +attribute("rights") + p Tokens that are: + ol + li To the right of the span; + li Syntactic children of words within the span + p i.e. 
+ pre.language-python + code + | rights = [span.doc[i] for i in range(span.end, len(span.doc)) + | if span.doc[i].head in span] - +method("__getitem__", "key_str") - +params - +param("key_str", unicode_type) - p A string in the vocabulary - +returns("Lexeme") + +attribute("subtree") + p String - +method("__setitem__", "orth_str", "props") - +params - +param("orth_str", unicode_type) - p The orth key + details + summary: h4 String Views - +param("props", dict_type) - p A props dictionary + +attribute("string") + p String + + +attribute("lemma / lemma_") + p String - +returns("None") + +attribute("label / label_") + p String - +method("dump", "loc") - +params - +param("loc", unicode_type) - p Path where the vocabulary should be saved + +declare_class("Lexeme") + p + | The Lexeme object represents a lexical type, stored in the vocabulary + | – as opposed to a token, occurring in a document. + p + | Lexemes store various features, so that these features can be computed + | once per type, rather than once per token. As job sizes grow, this + | can amount to a substantial efficiency improvement. - +method("load_lexemes", "loc") - +params - +param("loc", unicode_type) - p Path to load the lexemes.bin file from + p + | All Lexeme attributes are therefore context independent, as a single + | lexeme is reused for all usages of that word. Lexemes are keyed by + | the “orth” attribute. - +method("load_vectors", "loc") - +params - +param("loc", unicode_type) - p Path to load the vectors.bin from + p + All Lexeme attributes are accessible directly on the Token object. 
- - +declare_class("spacy.strings.StringStore") - +method("__len__") - +returns("int") - p Number of strings in the string-store + +init + +method("__init__") + p Init - +method("__getitem__", "key_int") - +params - +param("key_int") - p An integer key + details + summary: h4 String Features - +returns(unicode_type) - p The string that the integer key maps to + +attribute("orth / orth_") + p + | The form of the word with no string normalization or processing, + | as it appears in the string, without trailing whitespace. + + +attribute("lower / lower_") + p Tmp + + +attribute("norm / norm_") + p Tmp + + +attribute("shape / shape_") + p Tmp + + +attribute("prefix / prefix_") + p Tmp + + +attribute("suffix / suffix_") + p TMP - +method("__getitem__", "key_unicode") - +params - +param("key_unicode") - p A key, as a unicode string + +declare_class("Vocab", "data_dir=None, lex_props_getter=None") + +sequence + +method("__len__") + +returns + p Number of words in the vocabulary. - +returns(int_type) - p The integer ID of the string. 
+ +method("__iter__") + +returns + p Lexeme + + +maptype + +method("__getitem__", "key_int") + +params + +param("key") + p Integer ID + + +returns: p A Lexeme object + + +method("__getitem__", "key_str") + +params + +param("key_str", types.unicode) + p A string in the vocabulary + + +returns("Lexeme") + + +method("__setitem__", "orth_str", "props") + +params + +param("orth_str", types.unicode) + p The orth key + + +param("props", types.dict) + p A props dictionary + + +returns("None") - +method("__getitem__", "key_utf8_bytes") - +params - +param("key_utf8_bytes", bytes_type) - p p A key, as a UTF-8 encoded byte-string + details + summary: h4 Import/Export + + +method("dump", "loc") + +params + +param("loc", types.unicode) + p Path where the vocabulary should be saved + + +method("load_lexemes", "loc") + +params + +param("loc", types.unicode) + p Path to load the lexemes.bin file from + + +method("load_vectors", "loc") + +params + +param("loc", types.unicode) + p Path to load the vectors.bin from - +returns(int_type) - p The integer ID of the string. + +declare_class("StringStore") + +init + Tmp - +method("dump", "loc") - +params - +param("loc") - p File path to save the strings.txt to. + +sequence + +method("__len__") + +returns("int") + p Number of strings in the string-store + + +method("__iter__") + +returns + p Lexeme + + +maptype + +method("__getitem__", "key_int") + +params + +param("key_int") + p An integer key + + +returns(types.unicode) + p The string that the integer key maps to + + +method("__getitem__", "key_unicode") + +params + +param("key_unicode") + p A key, as a unicode string + + +returns(types.int) + p The integer ID of the string. + + +method("__getitem__", "key_utf8_bytes") + +params + +param("key_utf8_bytes", types.bytes) + p A key, as a UTF-8 encoded byte-string + + +returns(types.int) + p The integer ID of the string. 
+ + details + summary: h4 Import/Export + + +method("dump", "loc") + +params + +param("loc") + p File path to save the strings.txt to. + + +method("load") + +params + +param("loc") + p File path to load the strings.txt from. - +method("load") - +params - +param("loc") - p File path to load the strings.txt from. - script(src="js/prism.js")