diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 443ee9a67..f3f996846 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -27,8 +27,7 @@ "GoldCorpus": "goldcorpus" }, "Other": { - "Annotation Specs": "annotation", - "Feature Scheme": "features" + "Annotation Specs": "annotation" } }, @@ -143,9 +142,5 @@ "annotation": { "title": "Annotation Specifications" - }, - - "features": { - "title": "Linear Model Feature Scheme" } } diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade deleted file mode 100644 index 018790145..000000000 --- a/website/docs/api/features.jade +++ /dev/null @@ -1,138 +0,0 @@ -//- 💫 DOCS > API > LINEAR MOEL FEATURES - -include ../../_includes/_mixins - -p - | There are two popular strategies for putting together machine learning - | models for NLP: sparse linear models, and neural networks. To solve NLP - | problems with linear models, feature templates need to be assembled that - | combine multiple atomic predictors. This page documents the atomic - | predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]], - | #[+api("tagger") #[code Tagger]] and - | #[+api("entityrecognizer") #[code EntityRecognizer]]. - -p - | To understand the scheme, recall that spaCy's #[code Parser] and - | #[code EntityRecognizer] are implemented as push-down automata. They - | maintain a "stack" that holds the current entity, and a "buffer" - | consisting of the words to be processed. - -p - | Each state consists of the words on the stack (if any), which consistute - | the current entity being constructed. We also have the current word, and - | the two subsequent words. Finally, we also have the entities previously - | built. - -p - | This gives us a number of tokens to ask questions about, to make the - | features. About each of these tokens, we can ask about a number of - | different properties. 
Each feature identifier asks about a specific - | property of a specific token of the context. - -+h(2, "tokens") Context tokens - -+table([ "ID", "Description" ]) - +row - +cell #[code S0] - +cell - | The first word on the stack, i.e. the token most recently added - | to the current entity. - - +row - +cell #[code S1] - +cell The second word on the stack, i.e. the second most recently added. - - +row - +cell #[code S2] - +cell The third word on the stack, i.e. the third most recently added. - - +row - +cell #[code N0] - +cell The first word of the buffer, i.e. the current word being tagged. - - +row - +cell #[code N1] - +cell The second word of the buffer. - - +row - +cell #[code N2] - +cell The third word of the buffer. - - +row - +cell #[code P1] - +cell The word immediately before #[code N0]. - - +row - +cell #[code P2] - +cell The second word before #[code N0]. - - +row - +cell #[code E0] - +cell The first word of the previously constructed entity. - - +row - +cell #[code E1] - +cell The first word of the second previously constructed entity. - -p About each of these tokens, we can ask: - -+table([ "ID", "Attribute", "Description" ]) - +row - +cell #[code N0w] - +cell #[code token.orth] - +cell The word form. - - +row - +cell #[code N0W] - +cell #[code token.lemma] - +cell The word's lemma. - - +row - +cell #[code N0p] - +cell #[code token.tag] - +cell The word's (full) POS tag. - - +row - +cell #[code N0c] - +cell #[code token.cluster] - +cell The word's (full) Brown cluster. - - +row - +cell #[code N0c4] - +cell - - +cell First four digit prefix of the word's Brown cluster. - - +row - +cell #[code N0c6] - +cell - - +cell First six digit prefix of the word's Brown cluster. - - +row - +cell #[code N0L] - +cell - - +cell The word's dependency label. Not used as a feature in the NER. - - +row - +cell #[code N0_prefix] - +cell #[code token.prefix] - +cell The first three characters of the word. 
- - +row - +cell #[code N0_suffix] - +cell #[code token.suffix] - +cell The last three characters of the word. - - +row - +cell #[code N0_shape] - +cell #[code token.shape] - +cell The word's shape, i.e. is it alphabetic, numeric, etc. - - +row - +cell #[code N0_ne_iob] - +cell #[code token.ent_iob] - +cell The Inside/Outside/Begin code of the word's NER tag. - - +row - +cell #[code N0_ne_type] - +cell #[code token.ent_type] - +cell The word's NER type. diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 4d065522b..3a24a38df 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -15,9 +15,9 @@ "Custom tokenization": "customizing-tokenizer", "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", - "Processing text": "processing-text", "NLP pipelines": "language-processing-pipeline", "Deep learning": "deep-learning", + "Production use": "production-use", "Training": "training", "Training NER": "training-ner", "Saving & loading": "saving-loading", @@ -99,11 +99,6 @@ "next": "training" }, - "processing-text": { - "title": "Processing text", - "next": "language-processing-pipeline" - }, - "language-processing-pipeline": { "title": "Language processing pipelines", "next": "deep-learning" @@ -111,9 +106,15 @@ "deep-learning": { "title": "Hooking a deep learning model into spaCy", + "next": "production-use" + }, + + "production-use": { + "title": "Production use", "next": "training" }, + "training": { "title": "Training spaCy's statistical models", "next": "saving-loading" diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/production-use.jade similarity index 58% rename from website/docs/usage/processing-text.jade rename to website/docs/usage/production-use.jade index 2562d9fc4..68a313d8a 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/production-use.jade @@ -6,69 +6,6 @@ p | Once you have loaded the #[code nlp] object, you can call it 
as though | it were a function. This allows you to process a single unicode string. -+code. - doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') - -p - | The library should perform equally well with #[strong short or long documents]. - | All algorithms are linear-time in the length of the string, and once the - | data is loaded, there's no significant start-up cost to consider. This - | means that you don't have to strategically merge or split your text — - | you should feel free to feed in either single tweets or whole novels. - -p - | If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will - | load the #[+a("/docs/usage/models") model] associated with the name - | #[code 'en']. Each model is a Python package containing an - | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py"))__init__.py] - -the #[code nlp] object will - | be an instance of #[code spacy.en.English]. This means that when you run - | #[code doc = nlp(text)], you're executing - | #[code spacy.en.English.__call__], which is implemented on its parent - | class, #[+api("language") #[code Language]]. - -+code. - doc = nlp.make_doc(text) - for proc in nlp.pipeline: - proc(doc) - -p - | I've tried to make sure that the #[code Language.__call__] function - | doesn't do any "heavy lifting", so that you won't have complicated logic - | to replicate if you need to make your own pipeline class. This is all it - | does. - -p - | The #[code .make_doc()] method and #[code .pipeline] attribute make it - | easier to customise spaCy's behaviour. If you're using the default - | pipeline, we can desugar one more time. - -+code. 
- doc = nlp.tokenizer(text) - nlp.tagger(doc) - nlp.parser(doc) - nlp.entity(doc) - -p Finally, here's where you can find out about each of those components: - -+table(["Name", "Source"]) - +row - +cell #[code tokenizer] - +cell #[+src(gh("spacy", "spacy/tokenizer.pyx")) spacy.tokenizer.Tokenizer] - - +row - +cell #[code tagger] - +cell #[+src(gh("spacy", "spacy/tagger.pyx")) spacy.pipeline.Tagger] - - +row - +cell #[code parser] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.DependencyParser] - - +row - +cell #[code entity] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.EntityRecognizer] - +h(2, "multithreading") Multi-threading with #[code .pipe()] p