mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
b000fca8f8
|
@ -42,6 +42,7 @@ p
|
||||||
+item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
|
+item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
|
||||||
+item #[+a("#norm-exceptions") Norm exceptions]
|
+item #[+a("#norm-exceptions") Norm exceptions]
|
||||||
+item #[+a("#lex-attrs") Lexical attributes]
|
+item #[+a("#lex-attrs") Lexical attributes]
|
||||||
|
+item #[+a("#syntax-iterators") Syntax iterators]
|
||||||
+item #[+a("#lemmatizer") Lemmatizer]
|
+item #[+a("#lemmatizer") Lemmatizer]
|
||||||
+item #[+a("#tag-map") Tag map]
|
+item #[+a("#tag-map") Tag map]
|
||||||
+item #[+a("#morph-rules") Morph rules]
|
+item #[+a("#morph-rules") Morph rules]
|
||||||
|
@ -104,6 +105,13 @@ p
|
||||||
+cell dict
|
+cell dict
|
||||||
+cell Attribute ID mapped to function.
|
+cell Attribute ID mapped to function.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code SYNTAX_ITERATORS]
|
||||||
|
+cell dict
|
||||||
|
+cell
|
||||||
|
| Iterator ID mapped to function. Currently only supports
|
||||||
|
| #[code 'noun_chunks'].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code LOOKUP]
|
+cell #[code LOOKUP]
|
||||||
+cell dict
|
+cell dict
|
||||||
|
@ -341,9 +349,12 @@ p
|
||||||
| a token's norm equals its lowercase text. If the lowercase spelling of a
|
| a token's norm equals its lowercase text. If the lowercase spelling of a
|
||||||
| word exists, norms should always be in lowercase.
|
| word exists, norms should always be in lowercase.
|
||||||
|
|
||||||
+aside-code("Accessing norms").
|
+aside-code("Norms vs. lemmas").
|
||||||
doc = nlp(u"I can't")
|
doc = nlp(u"I'm gonna realise")
|
||||||
assert [t.norm_ for t in doc] == ['i', 'can', 'not']
|
norms = [token.norm_ for token in doc]
|
||||||
|
lemmas = [token.lemma_ for token in doc]
|
||||||
|
assert norms == ['i', 'am', 'going', 'to', 'realize']
|
||||||
|
assert lemmas == ['i', 'be', 'go', 'to', 'realise']
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy usually tries to normalise words with different spellings to a single,
|
| spaCy usually tries to normalise words with different spellings to a single,
|
||||||
|
@ -449,6 +460,33 @@ p
|
||||||
| #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
|
| #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
|
||||||
| are overwritten.
|
| are overwritten.
|
||||||
|
|
||||||
|
+h(3, "syntax-iterators") Syntax iterators
|
||||||
|
|
||||||
|
p
|
||||||
|
| Syntax iterators are functions that compute views of a #[code Doc]
|
||||||
|
| object based on its syntax. At the moment, they are only used for
|
||||||
|
| extracting
|
||||||
|
| #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
|
||||||
|
| are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
|
||||||
|
| property. Because base noun phrases work differently across languages,
|
||||||
|
| the rules to compute them are part of the individual language's data. If
|
||||||
|
| a language does not include a noun chunks iterator, the property won't
|
||||||
|
| be available. For examples, see the existing syntax iterators:
|
||||||
|
|
||||||
|
+aside-code("Noun chunks example").
|
||||||
|
doc = nlp(u'A phrase with another phrase occurs.')
|
||||||
|
chunks = list(doc.noun_chunks)
|
||||||
|
assert chunks[0].text == "A phrase"
|
||||||
|
assert chunks[1].text == "another phrase"
|
||||||
|
|
||||||
|
+table(["Language", "Source"])
|
||||||
|
for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
|
||||||
|
+row
|
||||||
|
+cell=lang
|
||||||
|
+cell
|
||||||
|
+src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
|
||||||
|
| lang/#{lang_id}/syntax_iterators.py
|
||||||
|
|
||||||
+h(3, "lemmatizer") Lemmatizer
|
+h(3, "lemmatizer") Lemmatizer
|
||||||
|
|
||||||
p
|
p
|
||||||
|
@ -604,6 +642,8 @@ p
|
||||||
|
|
||||||
+h(2, "vocabulary") Building the vocabulary
|
+h(2, "vocabulary") Building the vocabulary
|
||||||
|
|
||||||
|
+under-construction
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy expects that common words will be cached in a
|
| spaCy expects that common words will be cached in a
|
||||||
| #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
|
| #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
|
||||||
|
@ -697,6 +737,8 @@ p
|
||||||
|
|
||||||
+h(3, "word-vectors") Training the word vectors
|
+h(3, "word-vectors") Training the word vectors
|
||||||
|
|
||||||
|
+under-construction
|
||||||
|
|
||||||
p
|
p
|
||||||
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
|
| #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
|
||||||
| algorithms let you train useful word similarity models from unlabelled
|
| algorithms let you train useful word similarity models from unlabelled
|
||||||
|
@ -731,6 +773,8 @@ p
|
||||||
|
|
||||||
+h(2, "train-tagger-parser") Training the tagger and parser
|
+h(2, "train-tagger-parser") Training the tagger and parser
|
||||||
|
|
||||||
|
+under-construction
|
||||||
|
|
||||||
p
|
p
|
||||||
| You can now train the model using a corpus for your language annotated
|
| You can now train the model using a corpus for your language annotated
|
||||||
| with #[+a("http://universaldependencies.org/") Universal Dependencies].
|
| with #[+a("http://universaldependencies.org/") Universal Dependencies].
|
||||||
|
|
Loading…
Reference in New Issue