diff --git a/docs/redesign/spacy_docs.jade b/docs/redesign/spacy_docs.jade
index 5b64dd0da..29f0512e7 100644
--- a/docs/redesign/spacy_docs.jade
+++ b/docs/redesign/spacy_docs.jade
@@ -1,17 +1,19 @@
-- var unicode_type = 'unicode'
-- var bool_type = 'bool'
-
-- var int_type = ""
-
-- var Token_type = ""
-- var Span_type = ""
-- var Vocab_type = ""
-- var generator_type = ""
+- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
+
+-
+  var types = {
+    'unicode': py_docs + 'functions.html#unicode">unicode</a>',
+    'bool': py_docs + 'functions.html#bool">bool</a>',
+    'int': py_docs + 'functions.html#int">int</a>',
+    'generator': "",
+    'Vocab': "",
+    'Span': "",
+    'Doc': ""
+  }
mixin declare_class(name)
- details(open="true")
+ details
summary
span.declaration
span.label class
@@ -62,14 +64,54 @@ mixin returns(name, type, value)
mixin returns(type)
| tmp
+mixin init
+ details
+ summary: h4 Init
+ block
+
+
+mixin callable
+ details
+ summary: h4 Callable
+
+ block
+
+
+mixin sequence
+ details
+ summary: h4 Sequence
+
+ block
+
+
+mixin maptype
+ details
+ summary: h4 Map
+
+ block
+
+
+mixin summary
+ block
+
+mixin en_example
+ pre.language-python
+ code
+ | from spacy.en import English
+ | from spacy._doc_examples import download_war_and_peace
+ |
+ | unprocessed_unicode = download_war_and_peace()
+ |
+ | nlp = English()
+ | doc = nlp(unprocessed_unicode)
doctype html
html(lang="en")
head
meta(charset="utf-8")
- title!= tag_line
+ title spaCy – Industrial-strength NLP
meta(name="description" content="")
meta(name="author" content="Matthew Honnibal")
link(rel="stylesheet" href="css/style.css")
@@ -78,9 +120,9 @@ html(lang="en")
body(id="docs")
- header
- h1.logo!= tag_line
- div.slogan!= slogan
+ header(role="banner")
+ h1.logo spaCy – Industrial-strength NLP
+ div.slogan API
nav(role="navigation")
@@ -91,473 +133,573 @@ html(lang="en")
li: a(href="#") Blog
main.docs#content
- section.intro
- | Tmp
article
- h3: a(href="#") Header
+ +declare_class("English")
+ p Load models into a callable object to process English text.
- +declare_class("spacy.en.English")
- +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")
+ +summary
+ +en_example
- +params
- +param("data_dir")
- | The data directory. May be #{None}, to disable any data loading
- | (including the vocabulary).
+ +init
+ p
+ | Load the resources. Loading takes 20 seconds, and the instance
+ | consumes 2 to 3 gigabytes of memory.
+
+ p
+ | Intended use is for one instance to be created per process.
+ | You can create more if you're doing something unusual.
+ p
+ | You may wish to make the instance a global variable or "singleton".
+ | We usually instantiate the object in the main()
+ | function and pass it around as an explicit argument.
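+        p
+          | For example, a minimal sketch of this pattern. The main() and
+          | process_text() functions here are hypothetical:
+        pre.language-python
+          code
+            | from spacy.en import English
+            |
+            | def process_text(nlp, text):
+            |     return nlp(text)
+            |
+            | def main():
+            |     nlp = English()  # One instance per process
+            |     doc = process_text(nlp, u'Some text to analyse.')
+            |
+            | if __name__ == '__main__':
+            |     main()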
+ +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
- +param("Tokenizer")
- | A class/function that creates the tokenizer.
+ +params
+ +param("data_dir")
+ | The data directory. May be #{None}, to disable any data loading
+ | (including the vocabulary).
- +param("Tagger")
- | A class/function that creates the part-of-speech tagger.
+ +param("Tokenizer")
+ | A class/function that creates the tokenizer.
- +param("Parser")
- | A class/function that creates the dependency parser.
+ +param("Tagger")
+ | A class/function that creates the part-of-speech tagger.
- +param("Entity")
- | A class/function that creates the named entity recogniser.
+ +param("Parser")
+ | A class/function that creates the dependency parser.
- +param("load_vectors")
- | A boolean value to control whether the word vectors are loaded.
+ +param("Entity")
+ | A class/function that creates the named entity recogniser.
+ +param("load_vectors")
+ | A boolean value to control whether the word vectors are loaded.
+
+ +callable
+ +method("__call__", "text, tag=True, parse=True, entity=True")
- +method("__call__", "text, tag=True, parse=True, entity=True")(open)
+ +params
+ +param("text", types.unicode)
+ | The text to be processed. No pre-processing needs to be applied,
+ | and any length of text can be submitted. Usually you will submit
+ | a whole document. Text may be zero-length. An exception is raised
+ | if byte strings are supplied.
- +params
- +param("text", unicode_type)
- | The text to be processed. No pre-processing needs to be applied,
- | and any length of text can be submitted. Usually you will submit
- | a whole document. Text may be zero-length. An exception is raised
- | if byte strings are supplied.
+ +param("tag", bool_type)
+ | Whether to apply the part-of-speech tagger. Required for parsing
+ | and entity recognition.
- +param("tag", bool_type)
- | Whether to apply the part-of-speech tagger. Required for parsing
- | and entity recognition.
+ +param("parse", bool_type)
+ | Whether to apply the syntactic dependency parser.
- +param("parse", bool_type)
- | Whether to apply the syntactic dependency parser.
+ +param("entity", bool_type)
+ | Whether to apply the named entity recognizer.
- +param("entity", bool_type)
- | Whether to apply the named entity recognizer.
+ pre.language-python
+ code
+ | from spacy.en import English
+ | nlp = English()
+              | doc = nlp(u'Some text.') # Applies tagger, parser, entity
+ | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
+ | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
+ | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
+ | doc = nlp(u'') # Zero-length tokens, not an error
+ | # doc = nlp(b'Some text') <-- Error: need unicode
+              | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
+
+ +declare_class("Doc")
+      p A container for the annotations of a processed text: a sequence of Token objects, produced by calling the English instance.
+
+ +init
+ +method("__init__", "vocab")
+ +params
+ +param("vocab", vocab_type)
+ | A vocabulary object
+
+ +sequence
+ +method("__getitem__", "i", types.int)
+ +returns(types.Token)
+
+ +method("__getitem__", "start_end", types.slice)
+ +returns(types.Span)
+
+ +method("__iter__")
+ | Iterate over tokens
+
+ +method("__len__")
+ | Number of tokens in the document.
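+        p
+          | For example, assuming doc was produced by the pipeline above:
+        pre.language-python
+          code
+            | token = doc[0]                  # A Token
+            | span = doc[1:3]                 # A Span
+            | n_tokens = len(doc)
+            | words = [t.orth_ for t in doc]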
+
+ details
+ summary: h4 Spans
+
+ +attribute("sents", types.generator)
+ | Iterate over sentences in the document.
+
+ +attribute("ents", types.generator)
+ | Iterate over named entities in the document.
+
+ +attribute("noun_chunks", types.generator)
+
+ details
+ summary: h4 Export/Import
+
+ +method("to_array", "attr_ids")
+
+ | Given a list of M attribute IDs, export the tokens to a numpy ndarray
+              | of shape N*M, where N is the number of tokens in the document.
+
+ +params
+ +param("attr_ids", "list[int]")
+ | A list of attribute ID ints.
+
+ +returns("feat_array")
+ | A feature matrix, with one row per word, and one column per attribute
+ | indicated in the input attr_ids.
+
+ +method("count_by", "attr_id")
+              | Produce a dict of {attribute (int): count (int)} frequencies, keyed
+ | by the values of the given attribute ID.
+
pre.language-python
code
- | from spacy.en import English
- | nlp = English()
- | doc = nlp(u'Some text.) # Applies tagger, parser, entity
- | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
- | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
- | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
- | doc = nlp(u'') # Zero-length tokens, not an error
- | # doc = nlp(b'Some text') <-- Error: need unicode
- | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
-
- +declare_class("spacy.tokens.doc.Doc")
- +method("__init__", "vocab")
- +params
- +param("vocab", vocab_type)
- | A vocabulary object
-
- +method("__getitem__", "i", int_type)
- +returns(Token_type)
-
- +method("__getitem__", "start_end", slice_type)
- +returns(Span_type)
-
- +method("__iter__")
- | Iterate over tokens
-
- +method("__len__")
- | Number of tokens in the document.
-
- +attribute("sents", generator_type)
- | Iterate over sentences in the document.
-
- +attribute("ents", generator_type)
- | Iterate over named entities in the document.
+ | >>> from spacy.en import English, attrs
+ | >>> nlp = English()
+ | >>> tokens = nlp(u'apple apple orange banana')
+ | >>> tokens.count_by(attrs.ORTH)
+ | {12800L: 1, 11880L: 2, 7561L: 1}
+ | >>> tokens.to_array([attrs.ORTH])
+ | array([[11880],
+ | [11880],
+ | [7561],
+ | [12800]])
- +attribute("noun_chunks", generator_type)
-
- +method("to_array", "attr_ids")
-
- | Given a list of M attribute IDs, export the tokens to a numpy ndarray
- | of shape N*M, where N is the length of the sentence.
-
- +params
-
- +param("attr_ids", "list[int]")
- | A list of attribute ID ints.
-
- +returns("feat_array")
- | A feature matrix, with one row per word, and one column per attribute
- | indicated in the input attr_ids.
-
- +method("count_by", "attr_id")
-
- | Produce a dict of {attribute (int): count (ints)} frequencies, keyed
- | by the values of the given attribute ID.
+ +method("from_array", "attrs, array")
+              | Load attribute values from a numpy array, as produced by to_array.
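+              p
+                | A hypothetical round-trip sketch, assuming attrs has been
+                | imported from spacy.en as in the example above:
+              pre.language-python
+                code
+                  | arr = doc.to_array([attrs.LEMMA])
+                  | doc.from_array([attrs.LEMMA], arr)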
- pre.language-python
- code
- | >>> from spacy.en import English, attrs
- | >>> nlp = English()
- | >>> tokens = nlp(u'apple apple orange banana')
- | >>> tokens.count_by(attrs.ORTH)
- | {12800L: 1, 11880L: 2, 7561L: 1}
- | >>> tokens.to_array([attrs.ORTH])
- | array([[11880],
- | [11880],
- | [7561],
- | [12800]])
-
- +method("from_array", "attrs, array")
- | Load from array
-
- +method("to_bytes")
- | Serialize
-
- +method("from_bytes")
- | Deserialize, loading from bytes
-
- +method("read_bytes")
- | classmethod
-
- +method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
-
- | Merge a multi-word expression into a single token. Currently
- | experimental; API is likely to change.
-
-
- +declare_class("spacy.tokens.Token")
- +method("__init__", "vocab, doc, offset")
- +params
- +param("vocab", Vocab_type)
- p A Vocab object
-
- +param("doc", Doc_type)
- p The parent sequence
-
- +param("offset", Int_type)
- p The index of the token within the document
-
- details
- summary: h4 String Views
-
- +attribute("orth / orth_")
- | The form of the word with no string normalization or processing, as
- | it appears in the string, without trailing whitespace.
-
- +attribute("lemma / lemma_")
- | The "base" of the word, with no inflectional suffixes, e.g. the lemma of
- | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
- | derivational suffixes are not stripped, e.g. the lemma of
- | "instutitions" is "institution", not "institute". Lemmatization is
- | performed using the WordNet data, but extended to also cover closed-class
- | words such as pronouns. By default, the WN lemmatizer returns "hi"
- | as the lemma of "his". We assign pronouns the lemma -PRON-.
-
- +attribute("lower / lower_")
- | The form of the word, but forced to lower-case, i.e.
- pre.language-python: code lower = word.orth\_.lower()
-
- //+attribute("norm / norm_")
- // | The form of the word, after language-specific normalizations has been
- // | applied.
-
- +attribute("shape / shape_")
- | A transform of the word's string, to show orthographic features.
- | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
- | to d. After these mappings, sequences of 4 or more of the same character
- | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
- | :) --> :)
-
- +attribute("prefix / prefix_")
- | A length-N substring from the start of the word. Length may vary by
- | language; currently for English n=1, i.e.
- pre.language-python: code prefix = word.orth\_[:1]
-
- +attribute("suffix / suffix_")
- | A length-N substring from the end of the word. Length may vary by
- | language; currently for English n=3, i.e.
- pre.language-python: code suffix = word.orth\_[-3:]
-
- //+attribute("lex_id")
- // | lex_id
-
- details
- summary: h4 Alignment and Output
-
- +attribute("idx")
- p Start index of the token in the string
-
- +method("__len__", "")
- p Length of the token's orth string, in unicode code-points.
-
- +method("__unicode__", "")
- p Same as token.orth_
-
- +method("__str__", "")
- p Varies between Python 2 and Python 3
-
- +attribute("string")
- p
- | The form of the word as it appears in the string, including
- | trailing whitespace. This is useful when you need to use
- | linguistic features to add inline mark-up to the string.
-
- +method("nbor, i=1")
- +params
- +param("i")
- p Offset relative to token
+ +method("from_bytes")
+ | Deserialize, loading from bytes
- details
- summary: h4 Distributional Features
+ +method("read_bytes")
+ | classmethod
- +attribute("repvec")
- p
- | A "word embedding" representation: a dense real-valued vector that supports
- | similarity queries between words. By default, spaCy currently loads
- | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
- | model.
+ //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
- +attribute("cluster")
- p
- | The Brown cluster ID of the word. These are often useful features for
- | linear models. If you're using a non-linear model, particularly a
- | neural net or random forest, consider using the real-valued word
- | representation vector, in Token.repvec, instead.
+ // | Merge a multi-word expression into a single token. Currently
+ // | experimental; API is likely to change.
- +attribute("prob")
- p
- | The unigram log-probability of the word, estimated from counts from a
- | large corpus, smoothed using Simple Good Turing estimation.
- details
- summary: h4 Syntactic Tags
+ +declare_class("Token")
+ +init
+ +method("__init__", "vocab, doc, offset")
+ +params
+ +param("vocab", types.Vocab)
+ p A Vocab object
- +attribute("pos / pos_")
- | A part-of-speech tag, from the Google Universal Tag Set, e.g.
-        | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
-        | the 17 tag values are provided in spacy.parts_of_speech.
+ +param("doc", types.Doc)
+ p The parent sequence
- +attribute("tag / tag_")
-        | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
-        | <code>DT</code>, etc. These tags are language/corpus specific, and
- | typically describe part-of-speech and some amount of morphological
- | information. For instance, in the Penn Treebank tag set, VBZ
- | is assigned to a present-tense singular verb.
+ +param("offset", types.int)
+ p The index of the token within the document
- +attribute("dep / dep_")
- | The type of syntactic dependency relation between the word and its
- | syntactic head.
+ details
+ summary: h4 String Views
- details
- summary: h4 Navigating the Parse Tree
-
- +attribute("head")
- p
- | The Token that is the immediate syntactic head of the word. If the
- | word is the root of the dependency tree, the same word is returned.
+ +attribute("orth / orth_")
+ | The form of the word with no string normalization or processing, as
+ | it appears in the string, without trailing whitespace.
- +attribute("lefts")
- p
- | An iterator for the immediate leftward syntactic children of the
- | word.
+ +attribute("lemma / lemma_")
+ | The "base" of the word, with no inflectional suffixes, e.g. the lemma of
+ | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
+ | derivational suffixes are not stripped, e.g. the lemma of
+ | "instutitions" is "institution", not "institute". Lemmatization is
+ | performed using the WordNet data, but extended to also cover closed-class
+ | words such as pronouns. By default, the WN lemmatizer returns "hi"
+ | as the lemma of "his". We assign pronouns the lemma -PRON-.
- +attribute("rights")
- p
- | An iterator for the immediate rightward syntactic children of the
- | word.
+ +attribute("lower / lower_")
+ | The form of the word, but forced to lower-case, i.e.
+ pre.language-python: code lower = word.orth\_.lower()
- +attribute("n_lefts")
- p
- | The number of immediate syntactic children preceding the word in
- | the string.
+ //+attribute("norm / norm_")
+ // | The form of the word, after language-specific normalizations has been
+ // | applied.
- +attribute("n_rights")
- p
- | The number of immediate syntactic children following the word in
- | the string.
+ +attribute("shape / shape_")
+ | A transform of the word's string, to show orthographic features.
+ | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
+ | to d. After these mappings, sequences of 4 or more of the same character
+ | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
+ | :) --> :)
- +attribute("children")
- p
- | An iterator that yields from lefts, and then yields from rights.
+ +attribute("prefix / prefix_")
+ | A length-N substring from the start of the word. Length may vary by
+ | language; currently for English n=1, i.e.
+ pre.language-python: code prefix = word.orth\_[:1]
- +attribute("subtree")
- p
- | An iterator for the part of the sentence syntactically governed by
- | the word, including the word itself.
+ +attribute("suffix / suffix_")
+ | A length-N substring from the end of the word. Length may vary by
+ | language; currently for English n=3, i.e.
+ pre.language-python: code suffix = word.orth\_[-3:]
- +attribute("left_edge")
- p The leftmost edge of the token's subtree
+ //+attribute("lex_id")
+ // | lex_id
- +attribute("right_edge")
- p The rightmost edge of the token's subtree
+ details
+ summary: h4 Alignment and Output
- details
- summary: h4 Named Entities
+ +attribute("idx")
+ p Start index of the token in the string
- +attribute("ent_type")
- p If the token is part of an entity, its entity type.
+ +method("__len__", "")
+ p Length of the token's orth string, in unicode code-points.
- +attribute("ent_iob")
- p The IOB (inside, outside, begin) entity recognition tag for the token.
+ +method("__unicode__", "")
+ p Same as token.orth_
- details
- summary: h4 Lexeme Flags
+ +method("__str__", "")
+ p Varies between Python 2 and Python 3
- +method("check_flag", "flag_id")
- +params
- +param("flag_id")
- | flag ID
+ +attribute("string")
+ p
+ | The form of the word as it appears in the string, including
+ | trailing whitespace. This is useful when you need to use
+ | linguistic features to add inline mark-up to the string.
- +attribute("is_oov")
- +attribute("is_alpha")
- +attribute("is_ascii")
- +attribute("is_digit")
- +attribute("is_lower")
- +attribute("is_title")
- +attribute("is_punct")
- +attribute("is_space")
- +attribute("like_url")
- +attribute("like_num")
- +attribute("like_email")
+ +method("nbor, i=1")
+ +params
+ +param("i")
+ p Offset relative to token
+
+ details
+ summary: h4 Distributional Features
+
+ +attribute("repvec")
+ p
+ | A "word embedding" representation: a dense real-valued vector that supports
+ | similarity queries between words. By default, spaCy currently loads
+ | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
+ | model.
+
+ +attribute("cluster")
+ p
+ | The Brown cluster ID of the word. These are often useful features for
+ | linear models. If you're using a non-linear model, particularly a
+ | neural net or random forest, consider using the real-valued word
+ | representation vector, in Token.repvec, instead.
+
+ +attribute("prob")
+ p
+ | The unigram log-probability of the word, estimated from counts from a
+ | large corpus, smoothed using Simple Good Turing estimation.
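+          p
+            | For example, a sketch of reading these features. The printed
+            | values are illustrative, not exact:
+          pre.language-python
+            code
+              | apple = nlp(u'apple')[0]
+              | apple.repvec.shape   # e.g. (300,), a dense numpy vector
+              | apple.cluster        # e.g. 714, an integer Brown cluster ID
+              | apple.prob           # e.g. -8.9, a log probability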
+
+ details
+ summary: h4 Syntactic Tags
+
+ +attribute("pos / pos_")
+ p
+ | A part-of-speech tag, from the Google Universal Tag Set, e.g.
+              | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
+              | the 17 tag values are provided in spacy.parts_of_speech.
+
+ +attribute("tag / tag_")
+ p
+              | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
+              | <code>DT</code>, etc. These tags are language/corpus specific, and
+              | typically describe part-of-speech and some amount of morphological
+              | information. For instance, in the Penn Treebank tag set, VBZ
+              | is assigned to a present-tense singular verb.
+
+ +attribute("dep / dep_")
+ p
+ | The type of syntactic dependency relation between the word and its
+ | syntactic head.
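+          p
+            | For example, a sketch of inspecting the tags. The exact values
+            | depend on the statistical models:
+          pre.language-python
+            code
+              | for word in nlp(u'She sells seashells'):
+              |     print(word.orth_, word.pos_, word.tag_, word.dep_)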
+
+ details
+ summary: h4 Navigating the Parse Tree
+
+ +attribute("head")
+ p
+ | The Token that is the immediate syntactic head of the word. If the
+ | word is the root of the dependency tree, the same word is returned.
+
+ +attribute("lefts")
+ p
+ | An iterator for the immediate leftward syntactic children of the
+ | word.
+
+ +attribute("rights")
+ p
+ | An iterator for the immediate rightward syntactic children of the
+ | word.
+
+ +attribute("n_lefts")
+ p
+ | The number of immediate syntactic children preceding the word in
+ | the string.
+
+ +attribute("n_rights")
+ p
+ | The number of immediate syntactic children following the word in
+ | the string.
+
+ +attribute("children")
+ p
+ | An iterator that yields from lefts, and then yields from rights.
+
+ +attribute("subtree")
+ p
+ | An iterator for the part of the sentence syntactically governed by
+ | the word, including the word itself.
+
+ +attribute("left_edge")
+ p The leftmost edge of the token's subtree
+
+ +attribute("right_edge")
+ p The rightmost edge of the token's subtree
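+          p
+            | For example, a sketch of walking the tree, assuming doc comes
+            | from the pipeline above:
+          pre.language-python
+            code
+              | word = doc[2]
+              | word.head                # The syntactic parent
+              | list(word.lefts)         # Leftward children
+              | list(word.rights)        # Rightward children
+              | list(word.subtree)       # The word and everything it governs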
+
+ details
+ summary: h4 Named Entities
+
+ +attribute("ent_type")
+ p If the token is part of an entity, its entity type.
+
+ +attribute("ent_iob")
+ p The IOB (inside, outside, begin) entity recognition tag for the token.
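+          p
+            | For example, a sketch of reading the token-level entity annotations:
+          pre.language-python
+            code
+              | for word in doc:
+              |     print(word.orth_, word.ent_type, word.ent_iob)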
+
+ details
+ summary: h4 Lexeme Flags
+
+ +method("check_flag", "flag_id")
+ +params
+ +param("flag_id")
+ | flag ID
+
+ +attribute("is_oov")
+ +attribute("is_alpha")
+ +attribute("is_ascii")
+ +attribute("is_digit")
+ +attribute("is_lower")
+ +attribute("is_title")
+ +attribute("is_punct")
+ +attribute("is_space")
+ +attribute("like_url")
+ +attribute("like_num")
+ +attribute("like_email")
+
+ //+attribute("conjuncts")
+ // | Conjuncts
+
+ +declare_class("Span")
+ +init
+ +method("__init__")
+          p Spans are usually created by slicing a Doc, rather than by calling this directly, e.g.
- //+attribute("conjuncts")
- // | Conjuncts
+          pre.language-python: code span = doc[0:4]
- +declare_class("spacy.tokens.span.Span")
- +params
- +method("__getitem__")
- p Get item
-
- +method("__iter__")
- p Iter
-
- +method("__len__")
- p Len
-
- +attribute("root")
- p Syntactic head
-
- +attribute("lefts")
- p Tokens that are:
- ol
- li To the left of the span;
- li Syntactic children of words within the span
-
- p i.e.
-
- pre.language-python
- code
- | lefts = [span.doc[i] for i in range(0, span.start)
- | if span.doc[i].head in span]
-
- +attribute("rights")
- p Tokens that are:
- ol
- li To the right of the span;
+ +sequence
+ +method("__getitem__")
+ p Get item
+
+ +method("__iter__")
+ p Iter
+
+ +method("__len__")
+ p Len
+
+ details
+ summary: h4 Parse
+
+ +attribute("root")
+ p Syntactic head
+
+ +attribute("lefts")
+ p Tokens that are:
+ ol
+ li To the left of the span;
li Syntactic children of words within the span
- p i.e.
- pre.language-python
- code
- | rights = [span.doc[i] for i in range(span.end, len(span.doc))
- | if span.doc[i].head in span]
-
- +attribute("string")
- p String
- +attribute("lemma / lemma_")
- p String
-
- +attribute("label / label_")
- p String
-
- +attribute("subtree")
- p String
-
- +declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None")
- +method("__len__")
- +returns
- p Number of words in the vocabulary.
+ p i.e.
- +method("__getitem__", "key_int")
- +params
- +param("key")
- p Integer ID
+ pre.language-python
+ code
+ | lefts = [span.doc[i] for i in range(0, span.start)
+ | if span.doc[i].head in span]
- +returns: p A Lexeme object
+ +attribute("rights")
+ p Tokens that are:
+ ol
+ li To the right of the span;
+ li Syntactic children of words within the span
+ p i.e.
+ pre.language-python
+ code
+ | rights = [span.doc[i] for i in range(span.end, len(span.doc))
+ | if span.doc[i].head in span]
- +method("__getitem__", "key_str")
- +params
- +param("key_str", unicode_type)
- p A string in the vocabulary
- +returns("Lexeme")
+ +attribute("subtree")
+ p String
- +method("__setitem__", "orth_str", "props")
- +params
- +param("orth_str", unicode_type)
- p The orth key
+ details
+ summary: h4 String Views
- +param("props", dict_type)
- p A props dictionary
+ +attribute("string")
+ p String
+
+ +attribute("lemma / lemma_")
+ p String
- +returns("None")
+ +attribute("label / label_")
+ p String
- +method("dump", "loc")
- +params
- +param("loc", unicode_type)
- p Path where the vocabulary should be saved
+ +declare_class("Lexeme")
+ p
+ | The Lexeme object represents a lexical type, stored in the vocabulary
+ | – as opposed to a token, occurring in a document.
+ p
+ | Lexemes store various features, so that these features can be computed
+ | once per type, rather than once per token. As job sizes grow, this
+ | can amount to a substantial efficiency improvement.
- +method("load_lexemes", "loc")
- +params
- +param("loc", unicode_type)
- p Path to load the lexemes.bin file from
+ p
+ | All Lexeme attributes are therefore context independent, as a single
+ | lexeme is reused for all usages of that word. Lexemes are keyed by
+ | the “orth” attribute.
- +method("load_vectors", "loc")
- +params
- +param("loc", unicode_type)
- p Path to load the vectors.bin from
+ p
+        | All Lexeme attributes are accessible directly on the Token object.
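+      p
+        | For example, a sketch of looking up a lexeme in the vocabulary and
+        | reading type-level attributes, assuming nlp is the English instance
+        | from above:
+      pre.language-python
+        code
+          | apple = nlp.vocab[u'apple']   # A Lexeme
+          | apple.orth_                   # u'apple'
+          | apple.is_alpha                # True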
-
- +declare_class("spacy.strings.StringStore")
- +method("__len__")
- +returns("int")
- p Number of strings in the string-store
+ +init
+ +method("__init__")
+ p Init
- +method("__getitem__", "key_int")
- +params
- +param("key_int")
- p An integer key
+ details
+ summary: h4 String Features
- +returns(unicode_type)
- p The string that the integer key maps to
+ +attribute("orth / orth_")
+ p
+ | The form of the word with no string normalization or processing,
+ | as it appears in the string, without trailing whitespace.
+
+ +attribute("lower / lower_")
+ p Tmp
+
+ +attribute("norm / norm_")
+ p Tmp
+
+ +attribute("shape / shape_")
+ p Tmp
+
+ +attribute("prefix / prefix_")
+ p Tmp
+
+ +attribute("suffix / suffix_")
+ p TMP
- +method("__getitem__", "key_unicode")
- +params
- +param("key_unicode")
- p A key, as a unicode string
+ +declare_class("Vocab", "data_dir=None, lex_props_getter=None")
+ +sequence
+ +method("__len__")
+ +returns
+ p Number of words in the vocabulary.
- +returns(int_type)
- p The integer ID of the string.
+ +method("__iter__")
+ +returns
+ p Lexeme
+
+ +maptype
+ +method("__getitem__", "key_int")
+ +params
+ +param("key")
+ p Integer ID
+
+ +returns: p A Lexeme object
+
+ +method("__getitem__", "key_str")
+ +params
+ +param("key_str", types.unicode)
+ p A string in the vocabulary
+
+ +returns("Lexeme")
+
+ +method("__setitem__", "orth_str", "props")
+ +params
+ +param("orth_str", types.unicode)
+ p The orth key
+
+ +param("props", types.dict)
+ p A props dictionary
+
+ +returns("None")
- +method("__getitem__", "key_utf8_bytes")
- +params
- +param("key_utf8_bytes", bytes_type)
- p p A key, as a UTF-8 encoded byte-string
+ details
+ summary: h4 Import/Export
+
+ +method("dump", "loc")
+ +params
+ +param("loc", types.unicode)
+ p Path where the vocabulary should be saved
+
+ +method("load_lexemes", "loc")
+ +params
+ +param("loc", types.unicode)
+ p Path to load the lexemes.bin file from
+
+ +method("load_vectors", "loc")
+ +params
+ +param("loc", types.unicode)
+ p Path to load the vectors.bin from
- +returns(int_type)
- p The integer ID of the string.
+ +declare_class("StringStore")
+ +init
+ Tmp
- +method("dump", "loc")
- +params
- +param("loc")
- p File path to save the strings.txt to.
+ +sequence
+ +method("__len__")
+ +returns("int")
+ p Number of strings in the string-store
+
+ +method("__iter__")
+ +returns
+            p The strings in the store, as unicode objects.
+
+ +maptype
+ +method("__getitem__", "key_int")
+ +params
+ +param("key_int")
+ p An integer key
+
+ +returns(types.unicode)
+ p The string that the integer key maps to
+
+ +method("__getitem__", "key_unicode")
+ +params
+ +param("key_unicode")
+ p A key, as a unicode string
+
+ +returns(types.int)
+ p The integer ID of the string.
+
+ +method("__getitem__", "key_utf8_bytes")
+ +params
+ +param("key_utf8_bytes", types.bytes)
+              p A key, as a UTF-8 encoded byte-string
+
+ +returns(types.int)
+ p The integer ID of the string.
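+        p
+          | For example, a sketch of mapping between strings and integer IDs,
+          | assuming the store is accessed as nlp.vocab.strings:
+        pre.language-python
+          code
+            | string_store = nlp.vocab.strings
+            | apple_id = string_store[u'apple']   # unicode key --> integer ID
+            | string_store[apple_id]              # integer key --> u'apple'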
+
+ details
+ summary: h4 Import/Export
+
+ +method("dump", "loc")
+ +params
+ +param("loc")
+ p File path to save the strings.txt to.
+
+ +method("load")
+ +params
+ +param("loc")
+ p File path to load the strings.txt from.
- +method("load")
- +params
- +param("loc")
- p File path to load the strings.txt from.
-
script(src="js/prism.js")