From e9816daa6a00d3d252595007316f5b32798a33e5 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:16:33 +0200
Subject: [PATCH 1/5] Add details on syntax iterators

---
 website/docs/usage/adding-languages.jade | 35 ++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index cbde248cc..12ae0c50e 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -42,6 +42,7 @@ p
         +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
         +item #[+a("#norm-exceptions") Norm exceptions]
         +item #[+a("#lex-attrs") Lexical attributes]
+        +item #[+a("#syntax-iterators") Syntax iterators]
         +item #[+a("#lemmatizer") Lemmatizer]
         +item #[+a("#tag-map") Tag map]
         +item #[+a("#morph-rules") Morph rules]
@@ -104,6 +105,13 @@ p
         +cell dict
         +cell Attribute ID mapped to function.
 
+    +row
+        +cell #[code SYNTAX_ITERATORS]
+        +cell dict
+        +cell
+            | Iterator ID mapped to function. Currently only supports
+            | #[code 'noun_chunks'].
+
     +row
         +cell #[code LOOKUP]
         +cell dict
@@ -449,6 +457,33 @@ p
     | #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
     | are overwritten.
 
++h(3, "syntax-iterators") Syntax iterators
+
+p
+    | Syntax iterators are functions that compute views of a #[code Doc]
+    | object based on its syntax. At the moment, this data is only used for
+    | extracting
+    | #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
+    | are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
+    | property. Because base noun phrases work differently across languages,
+    | the rules to compute them are part of the individual language's data. If
+    | a language does not include a noun chunks iterator, the property won't
+    | be available. For examples, see the existing syntax iterators:
+
++aside-code("Noun chunks example").
+    doc = nlp(u'A phrase with another phrase occurs.')
+    chunks = list(doc.noun_chunks)
+    assert chunks[0].text == "A phrase"
+    assert chunks[1].text == "another phrase"
+
++table(["Language", "Source"])
+    for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
+        +row
+            +cell=lang
+            +cell
+                +src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
+                    | lang/#{lang_id}/syntax_iterators.py
+
 +h(3, "lemmatizer") Lemmatizer
 
 p

From 47d066b2933e43376087995a8ed20bc436ac820d Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:17:54 +0200
Subject: [PATCH 2/5] Add under construction

---
 website/docs/usage/adding-languages.jade | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index 12ae0c50e..fac75dca4 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -639,6 +639,8 @@ p
 
 +h(2, "vocabulary") Building the vocabulary
 
++under-construction
+
 p
     | spaCy expects that common words will be cached in a
     | #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
@@ -732,6 +734,8 @@ p
 
 +h(3, "word-vectors") Training the word vectors
 
++under-construction
+
 p
     | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
     | algorithms let you train useful word similarity models from unlabelled
@@ -766,6 +770,8 @@ p
 
 +h(2, "train-tagger-parser") Training the tagger and parser
 
++under-construction
+
 p
     | You can now train the model using a corpus for your language annotated
     | with #[+a("http://universaldependencies.org/") Universal Dependencies].
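Note on the syntax iterators documented in patch 1: the table links to the real per-language source files, but as a rough, hypothetical sketch of what a lang/xx/syntax_iterators.py module could contain, something like the following would work. The dependency labels and chunking logic here are illustrative only, not spaCy's actual English rules, and the relative import assumes the file lives inside the spacy package like the other language data files.

    # Hypothetical lang/xx/syntax_iterators.py -- a minimal noun chunks iterator
    from ..symbols import NOUN, PROPN, PRON

    def noun_chunks(obj):
        """Yield (start, end, label) tuples for base noun phrases."""
        # dependency labels a noun must carry to head a chunk (illustrative set)
        labels = ['nsubj', 'dobj', 'iobj', 'pobj', 'attr', 'ROOT']
        doc = obj.doc                     # works for both Doc and Span objects
        np_deps = [doc.vocab.strings[label] for label in labels]
        np_label = doc.vocab.strings['NP']
        seen = set()                      # token indices already inside a chunk
        for word in obj:
            if word.pos not in (NOUN, PROPN, PRON):
                continue
            if word.i in seen or word.dep not in np_deps:
                continue
            # the chunk runs from the leftmost token of the word's subtree
            # up to and including the word itself
            yield word.left_edge.i, word.i + 1, np_label
            seen.update(range(word.left_edge.i, word.i + 1))

    SYNTAX_ITERATORS = {
        'noun_chunks': noun_chunks
    }

The language's defaults would then reference this dict (analogous to how the other language data files are wired up), which is what makes Doc.noun_chunks available for that language; see the linked lang/en, lang/de and lang/es sources for the real implementations.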
From a857b2b511e54795a04a5a02834dcea0a3e70309 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:21:37 +0200
Subject: [PATCH 3/5] Update norms example

---
 website/docs/usage/adding-languages.jade | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index fac75dca4..5052eb2b7 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -350,8 +350,9 @@ p
     | word exists, norms should always be in lowercase.
 
 +aside-code("Accessing norms").
-    doc = nlp(u"I can't")
-    assert [t.norm_ for t in doc] == ['i', 'can', 'not']
+    doc = nlp(u"I'm gonna")
+    norms = [token.norm_ for token in doc]
+    assert norms == ['i', 'am', 'going', 'to']
 
 p
     | spaCy usually tries to normalise words with different spellings to a single,

From f8e93b6d0a346e9a53dac2e70e5f1712d40d6e1e Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:24:29 +0200
Subject: [PATCH 4/5] Update norms example

---
 website/docs/usage/adding-languages.jade | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index 5052eb2b7..cc90db505 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -349,10 +349,12 @@ p
     | a token's norm equals its lowercase text. If the lowercase spelling of a
     | word exists, norms should always be in lowercase.
 
-+aside-code("Accessing norms").
++aside-code("Norms vs. lemmas").
     doc = nlp(u"I'm gonna")
     norms = [token.norm_ for token in doc]
+    lemmas = [token.lemma_ for token in doc]
     assert norms == ['i', 'am', 'going', 'to']
+    assert lemmas == ['i', 'be', 'go', 'to']
 
 p
     | spaCy usually tries to normalise words with different spellings to a single,

From 505d43b832cb64028b043461c621b24fa6c188af Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:33:26 +0200
Subject: [PATCH 5/5] Update norms example

---
 website/docs/usage/adding-languages.jade | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index cc90db505..a0b77ad17 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -350,11 +350,11 @@ p
     | word exists, norms should always be in lowercase.
 
 +aside-code("Norms vs. lemmas").
-    doc = nlp(u"I'm gonna")
+    doc = nlp(u"I'm gonna realise")
     norms = [token.norm_ for token in doc]
     lemmas = [token.lemma_ for token in doc]
-    assert norms == ['i', 'am', 'going', 'to']
-    assert lemmas == ['i', 'be', 'go', 'to']
+    assert norms == ['i', 'am', 'going', 'to', 'realize']
+    assert lemmas == ['i', 'be', 'go', 'to', 'realise']
 
 p
     | spaCy usually tries to normalise words with different spellings to a single,
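Note on the norms example updated in patches 3-5: the norms asserted in the aside come from the language data rather than from a statistical model. A hypothetical sketch of the two kinds of entries that could produce them is shown below; the exact entries shipped in spaCy's English data may differ.

    # Illustrative entries only -- not copied from spaCy's English data
    from spacy.symbols import ORTH, NORM

    # tokenizer_exceptions.py: "gonna" is split into two tokens, each carrying
    # its own norm, which is why the example yields 'going' and 'to'
    TOKENIZER_EXCEPTIONS = {
        "gonna": [
            {ORTH: "gon", NORM: "going"},
            {ORTH: "na", NORM: "to"}
        ]
    }

    # norm_exceptions.py: spelling variants mapped to a single norm, which is
    # why "realise" is normalised to 'realize' while its lemma keeps the
    # original spelling
    NORM_EXCEPTIONS = {
        "realise": "realize",
        "colour": "color"
    }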