From 63247cbe8776426382de789e55af75cbe0986fb4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 5 Jul 2020 16:11:16 +0200 Subject: [PATCH] Update v3 docs [ci skip] --- netlify.toml | 5 + website/docs/api/annotation.md | 621 ------------------ website/docs/api/cli.md | 12 +- website/docs/api/data-formats.md | 130 ++++ website/docs/api/goldparse.md | 206 ------ website/docs/api/top-level.md | 56 +- website/docs/models/index.md | 107 +-- website/docs/usage/101/_named-entities.md | 11 +- website/docs/usage/examples.md | 207 ------ website/docs/usage/index.md | 17 - website/docs/usage/linguistic-features.md | 62 +- website/docs/usage/processing-pipelines.md | 8 +- website/docs/usage/rule-based-matching.md | 7 +- website/docs/usage/spacy-101.md | 4 +- website/docs/usage/training.md | 45 +- website/docs/usage/v2-2.md | 4 +- website/docs/usage/v2.md | 6 +- ...rs-similarity.md => vectors-embeddings.md} | 19 +- website/docs/usage/visualizers.md | 7 +- website/meta/sidebars.json | 10 +- website/src/components/copy.js | 48 ++ website/src/components/icon.js | 2 + website/src/images/icons/clipboard.svg | 4 + website/src/styles/copy.module.sass | 21 + website/src/styles/infobox.module.sass | 6 +- website/src/templates/index.js | 2 + website/src/widgets/project.js | 32 + 27 files changed, 383 insertions(+), 1276 deletions(-) delete mode 100644 website/docs/api/annotation.md create mode 100644 website/docs/api/data-formats.md delete mode 100644 website/docs/api/goldparse.md delete mode 100644 website/docs/usage/examples.md rename website/docs/usage/{vectors-similarity.md => vectors-embeddings.md} (95%) create mode 100644 website/src/components/copy.js create mode 100644 website/src/images/icons/clipboard.svg create mode 100644 website/src/styles/copy.module.sass create mode 100644 website/src/widgets/project.js diff --git a/netlify.toml b/netlify.toml index b6e8df01f..6afa5ed7e 100644 --- a/netlify.toml +++ b/netlify.toml @@ -40,6 +40,11 @@ redirects = [ {from = "/tutorials", to = "/usage/examples", force = true}, # Old documentation pages (v2.x) {from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true}, + {from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true}, + {from = "/api/goldparse", to = "/api/top-level", force = true}, + {from = "/api/goldcorpus", to = "/api/corpus", force = true}, + {from = "/api/annotation", to = "/api/data-formats", force = true}, + {from = "/usage/examples", to = "/usage/projects", force = true}, # Rewrite all other docs pages to / {from = "/docs/*", to = "/:splat"}, # Updated documentation pages diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md deleted file mode 100644 index 5ca5e91d9..000000000 --- a/website/docs/api/annotation.md +++ /dev/null @@ -1,621 +0,0 @@ ---- -title: Annotation Specifications -teaser: Schemes used for labels, tags and training data -menu: - - ['Text Processing', 'text-processing'] - - ['POS Tagging', 'pos-tagging'] - - ['Dependencies', 'dependency-parsing'] - - ['Named Entities', 'named-entities'] - - ['Models & Training', 'training'] ---- - -## Text processing {#text-processing} - -> #### Example -> -> ```python -> from spacy.lang.en import English -> nlp = English() -> tokens = nlp("Some\\nspaces and\\ttab characters") -> tokens_text = [t.text for t in tokens] -> assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"] -> ``` - -Tokenization standards are based on the -[OntoNotes 5](https://catalog.ldc.upenn.edu/LDC2013T19) corpus. 
The tokenizer -differs from most by including **tokens for significant whitespace**. Any -sequence of whitespace characters beyond a single space (`' '`) is included as a -token. The whitespace tokens are useful for much the same reason punctuation is -– it's often an important delimiter in the text. By preserving it in the token -output, we are able to maintain a simple alignment between the tokens and the -original string, and we ensure that **no information is lost** during -processing. - -### Lemmatization {#lemmatization} - -> #### Examples -> -> In English, this means: -> -> - **Adjectives**: happier, happiest → happy -> - **Adverbs**: worse, worst → badly -> - **Nouns**: dogs, children → dog, child -> - **Verbs**: writes, writing, wrote, written → write - -As of v2.2, lemmatization data is stored in a separate package, -[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) that can -be installed if needed via `pip install spacy[lookups]`. Some languages provide -full lemmatization rules and exceptions, while other languages currently only -rely on simple lookup tables. - - - -spaCy adds a **special case for English pronouns**: all English pronouns are -lemmatized to the special token `-PRON-`. Unlike verbs and common nouns, -there's no clear base form of a personal pronoun. Should the lemma of "me" be -"I", or should we normalize person as well, giving "it" — or maybe "he"? -spaCy's solution is to introduce a novel symbol, `-PRON-`, which is used as the -lemma for all personal pronouns. - - - -### Sentence boundary detection {#sentence-boundary} - -Sentence boundaries are calculated from the syntactic parse tree, so features -such as punctuation and capitalization play an important but non-decisive role -in determining the sentence boundaries. Usually this means that the sentence -boundaries will at least coincide with clause boundaries, even given poorly -punctuated text. - -## Part-of-speech tagging {#pos-tagging} - -> #### Tip: Understanding tags -> -> You can also use `spacy.explain` to get the description for the string -> representation of a tag. For example, `spacy.explain("RB")` will return -> "adverb". - -This section lists the fine-grained and coarse-grained part-of-speech tags -assigned by spaCy's [models](/models). The individual mapping is specific to the -training corpus and can be defined in the respective language data's -[`tag_map.py`](/usage/adding-languages#tag-map). - - - -spaCy maps all language-specific part-of-speech tags to a small, fixed set of -word type tags following the -[Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The -universal tags don't code for any morphological features and only cover the word -type. They're available as the [`Token.pos`](/api/token#attributes) and -[`Token.pos_`](/api/token#attributes) attributes. 
- -| POS | Description | Examples | -| ------- | ------------------------- | --------------------------------------------- | -| `ADJ` | adjective | big, old, green, incomprehensible, first | -| `ADP` | adposition | in, to, during | -| `ADV` | adverb | very, tomorrow, down, where, there | -| `AUX` | auxiliary | is, has (done), will (do), should (do) | -| `CONJ` | conjunction | and, or, but | -| `CCONJ` | coordinating conjunction | and, or, but | -| `DET` | determiner | a, an, the | -| `INTJ` | interjection | psst, ouch, bravo, hello | -| `NOUN` | noun | girl, cat, tree, air, beauty | -| `NUM` | numeral | 1, 2017, one, seventy-seven, IV, MMXIV | -| `PART` | particle | 's, not, | -| `PRON` | pronoun | I, you, he, she, myself, themselves, somebody | -| `PROPN` | proper noun | Mary, John, London, NATO, HBO | -| `PUNCT` | punctuation | ., (, ), ? | -| `SCONJ` | subordinating conjunction | if, while, that | -| `SYM` | symbol | \$, %, §, ©, +, −, ×, ÷, =, :), 😝 | -| `VERB` | verb | run, runs, running, eat, ate, eating | -| `X` | other | sfpksdpsxmsa | -| `SPACE` | space | - - - - - -The English part-of-speech tagger uses the -[OntoNotes 5](https://catalog.ldc.upenn.edu/LDC2013T19) version of the Penn -Treebank tag set. We also map the tags to the simpler Universal Dependencies v2 -POS tag set. - -| Tag |  POS | Morphology | Description | -| ------------------------------------- | ------- | --------------------------------------- | ----------------------------------------- | -| `$` | `SYM` | | symbol, currency | -| `` | `PUNCT` | `PunctType=quot PunctSide=ini` | opening quotation mark | -| `''` | `PUNCT` | `PunctType=quot PunctSide=fin` | closing quotation mark | -| `,` | `PUNCT` | `PunctType=comm` | punctuation mark, comma | -| `-LRB-` | `PUNCT` | `PunctType=brck PunctSide=ini` | left round bracket | -| `-RRB-` | `PUNCT` | `PunctType=brck PunctSide=fin` | right round bracket | -| `.` | `PUNCT` | `PunctType=peri` | punctuation mark, sentence closer | -| `:` | `PUNCT` | | punctuation mark, colon or ellipsis | -| `ADD` | `X` | | email | -| `AFX` | `ADJ` | `Hyph=yes` | affix | -| `CC` | `CCONJ` | `ConjType=comp` | conjunction, coordinating | -| `CD` | `NUM` | `NumType=card` | cardinal number | -| `DT` | `DET` | | determiner | -| `EX` | `PRON` | `AdvType=ex` | existential there | -| `FW` | `X` | `Foreign=yes` | foreign word | -| `GW` | `X` | | additional word in multi-word expression | -| `HYPH` | `PUNCT` | `PunctType=dash` | punctuation mark, hyphen | -| `IN` | `ADP` | | conjunction, subordinating or preposition | -| `JJ` | `ADJ` | `Degree=pos` | adjective | -| `JJR` | `ADJ` | `Degree=comp` | adjective, comparative | -| `JJS` | `ADJ` | `Degree=sup` | adjective, superlative | -| `LS` | `X` | `NumType=ord` | list item marker | -| `MD` | `VERB` | `VerbType=mod` | verb, modal auxiliary | -| `NFP` | `PUNCT` | | superfluous punctuation | -| `NIL` | `X` | | missing tag | -| `NN` | `NOUN` | `Number=sing` | noun, singular or mass | -| `NNP` | `PROPN` | `NounType=prop Number=sing` | noun, proper singular | -| `NNPS` | `PROPN` | `NounType=prop Number=plur` | noun, proper plural | -| `NNS` | `NOUN` | `Number=plur` | noun, plural | -| `PDT` | `DET` | | predeterminer | -| `POS` | `PART` | `Poss=yes` | possessive ending | -| `PRP` | `PRON` | `PronType=prs` | pronoun, personal | -| `PRP$` | `DET` | `PronType=prs Poss=yes` | pronoun, possessive | -| `RB` | `ADV` | `Degree=pos` | adverb | -| `RBR` | `ADV` | `Degree=comp` | adverb, comparative | -| `RBS` | `ADV` | `Degree=sup` | adverb, superlative | -| `RP` | `ADP` | | 
adverb, particle | -| `SP` | `SPACE` | | space | -| `SYM` | `SYM` | | symbol | -| `TO` | `PART` | `PartType=inf VerbForm=inf` | infinitival "to" | -| `UH` | `INTJ` | | interjection | -| `VB` | `VERB` | `VerbForm=inf` | verb, base form | -| `VBD` | `VERB` | `VerbForm=fin Tense=past` | verb, past tense | -| `VBG` | `VERB` | `VerbForm=part Tense=pres Aspect=prog` | verb, gerund or present participle | -| `VBN` | `VERB` | `VerbForm=part Tense=past Aspect=perf` | verb, past participle | -| `VBP` | `VERB` | `VerbForm=fin Tense=pres` | verb, non-3rd person singular present | -| `VBZ` | `VERB` | `VerbForm=fin Tense=pres Number=sing Person=three` | verb, 3rd person singular present | -| `WDT` | `DET` | | wh-determiner | -| `WP` | `PRON` | | wh-pronoun, personal | -| `WP$` | `DET` | `Poss=yes` | wh-pronoun, possessive | -| `WRB` | `ADV` | | wh-adverb | -| `XX` | `X` | | unknown | -| `_SP` | `SPACE` | | | - - - - -The German part-of-speech tagger uses the -[TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html) -annotation scheme. We also map the tags to the simpler Universal Dependencies -v2 POS tag set. - -| Tag |  POS | Morphology | Description | -| --------- | ------- | ---------------------------------------- | ------------------------------------------------- | -| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark | -| `$,` | `PUNCT` | `PunctType=comm` | comma | -| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark | -| `ADJA` | `ADJ` | | adjective, attributive | -| `ADJD` | `ADJ` | | adjective, adverbial or predicative | -| `ADV` | `ADV` | | adverb | -| `APPO` | `ADP` | `AdpType=post` | postposition | -| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left | -| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article | -| `APZR` | `ADP` | `AdpType=circ` | circumposition right | -| `ART` | `DET` | `PronType=art` | definite or indefinite article | -| `CARD` | `NUM` | `NumType=card` | cardinal number | -| `FM` | `X` | `Foreign=yes` | foreign language material | -| `ITJ` | `INTJ` | | interjection | -| `KOKOM` | `CCONJ` | `ConjType=comp` | comparative conjunction | -| `KON` | `CCONJ` | | coordinate conjunction | -| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive | -| `KOUS` | `SCONJ` | | subordinate conjunction with sentence | -| `NE` | `PROPN` | | proper noun | -| `NN` | `NOUN` | | noun, singular or mass | -| `NNE` | `PROPN` | | proper noun | -| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun | -| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun | -| `PIAT` | `DET` | `PronType=ind|neg|tot` | attributive indefinite pronoun without determiner | -| `PIS` | `PRON` | `PronType=ind|neg|tot` | substituting indefinite pronoun | -| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun | -| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun | -| `PPOSS` | `PRON` | `Poss=yes PronType=prs` | substituting possessive pronoun | -| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun | -| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun | -| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun | -| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb | -| `PTKA` | `PART` | | particle with adjective or adverb | -| `PTKANT` | `PART` | `PartType=res` | answer particle | -| `PTKNEG` | `PART` | `Polarity=neg` | negative particle | -| `PTKVZ` | 
`ADP` | `PartType=vbp` | separable verbal particle | -| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive | -| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun | -| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun | -| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun | -| `TRUNC` | `X` | `Hyph=yes` | word remnant | -| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary | -| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary | -| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary | -| `VAPP` | `AUX` | `Aspect=perf VerbForm=part` | perfect participle, auxiliary | -| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal | -| `VMINF` | `VERB` | `VerbForm=inf VerbType=mod` | infinitive, modal | -| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal | -| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full | -| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full | -| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full | -| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full | -| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full | -| `XY` | `X` | | non-word containing non-letter | -| `_SP` | `SPACE` | | | - - ---- - - - -For the label schemes used by the other models, see the respective `tag_map.py` -in [`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang). - - - -## Syntactic Dependency Parsing {#dependency-parsing} - -> #### Tip: Understanding labels -> -> You can also use `spacy.explain` to get the description for the string -> representation of a label. For example, `spacy.explain("prt")` will return -> "particle". - -This section lists the syntactic dependency labels assigned by spaCy's -[models](/models). The individual labels are language-specific and depend on the -training corpus. - - - -The [Universal Dependencies scheme](http://universaldependencies.org/u/dep/) is -used in all languages trained on Universal Dependency Corpora. - -| Label | Description | -| ------------ | -------------------------------------------- | -| `acl` | clausal modifier of noun (adjectival clause) | -| `advcl` | adverbial clause modifier | -| `advmod` | adverbial modifier | -| `amod` | adjectival modifier | -| `appos` | appositional modifier | -| `aux` | auxiliary | -| `case` | case marking | -| `cc` | coordinating conjunction | -| `ccomp` | clausal complement | -| `clf` | classifier | -| `compound` | compound | -| `conj` | conjunct | -| `cop` | copula | -| `csubj` | clausal subject | -| `dep` | unspecified dependency | -| `det` | determiner | -| `discourse` | discourse element | -| `dislocated` | dislocated elements | -| `expl` | expletive | -| `fixed` | fixed multiword expression | -| `flat` | flat multiword expression | -| `goeswith` | goes with | -| `iobj` | indirect object | -| `list` | list | -| `mark` | marker | -| `nmod` | nominal modifier | -| `nsubj` | nominal subject | -| `nummod` | numeric modifier | -| `obj` | object | -| `obl` | oblique nominal | -| `orphan` | orphan | -| `parataxis` | parataxis | -| `punct` | punctuation | -| `reparandum` | overridden disfluency | -| `root` | root | -| `vocative` | vocative | -| `xcomp` | open clausal complement | - - - - - -The English dependency labels use the -[CLEAR Style](https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md) -by [ClearNLP](http://www.clearnlp.com). 
- -| Label | Description | -| ----------- | -------------------------------------------- | -| `acl` | clausal modifier of noun (adjectival clause) | -| `acomp` | adjectival complement | -| `advcl` | adverbial clause modifier | -| `advmod` | adverbial modifier | -| `agent` | agent | -| `amod` | adjectival modifier | -| `appos` | appositional modifier | -| `attr` | attribute | -| `aux` | auxiliary | -| `auxpass` | auxiliary (passive) | -| `case` | case marking | -| `cc` | coordinating conjunction | -| `ccomp` | clausal complement | -| `compound` | compound | -| `conj` | conjunct | -| `cop` | copula | -| `csubj` | clausal subject | -| `csubjpass` | clausal subject (passive) | -| `dative` | dative | -| `dep` | unclassified dependent | -| `det` | determiner | -| `dobj` | direct object | -| `expl` | expletive | -| `intj` | interjection | -| `mark` | marker | -| `meta` | meta modifier | -| `neg` | negation modifier | -| `nn` | noun compound modifier | -| `nounmod` | modifier of nominal | -| `npmod` | noun phrase as adverbial modifier | -| `nsubj` | nominal subject | -| `nsubjpass` | nominal subject (passive) | -| `nummod` | numeric modifier | -| `oprd` | object predicate | -| `obj` | object | -| `obl` | oblique nominal | -| `parataxis` | parataxis | -| `pcomp` | complement of preposition | -| `pobj` | object of preposition | -| `poss` | possession modifier | -| `preconj` | pre-correlative conjunction | -| `prep` | prepositional modifier | -| `prt` | particle | -| `punct` | punctuation | -| `quantmod` | modifier of quantifier | -| `relcl` | relative clause modifier | -| `root` | root | -| `xcomp` | open clausal complement | - - - - - -The German dependency labels use the -[TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html) -annotation scheme. - -| Label | Description | -| ------- | ------------------------------- | -| `ac` | adpositional case marker | -| `adc` | adjective component | -| `ag` | genitive attribute | -| `ams` | measure argument of adjective | -| `app` | apposition | -| `avc` | adverbial phrase component | -| `cc` | comparative complement | -| `cd` | coordinating conjunction | -| `cj` | conjunct | -| `cm` | comparative conjunction | -| `cp` | complementizer | -| `cvc` | collocational verb construction | -| `da` | dative | -| `dm` | discourse marker | -| `ep` | expletive es | -| `ju` | junctor | -| `mnr` | postnominal modifier | -| `mo` | modifier | -| `ng` | negation | -| `nk` | noun kernel element | -| `nmc` | numerical component | -| `oa` | accusative object | -| `oa2` | second accusative object | -| `oc` | clausal object | -| `og` | genitive object | -| `op` | prepositional object | -| `par` | parenthetical element | -| `pd` | predicate | -| `pg` | phrasal genitive | -| `ph` | placeholder | -| `pm` | morphological particle | -| `pnc` | proper noun component | -| `punct` | punctuation | -| `rc` | relative clause | -| `re` | repeated element | -| `rs` | reported speech | -| `sb` | subject | -| `sbp` | passivized subject (PP) | -| `sp` | subject or predicate | -| `svp` | separable verb prefix | -| `uc` | unit component | -| `vo` | vocative | -| `ROOT` | root | - - - -## Named Entity Recognition {#named-entities} - -> #### Tip: Understanding entity types -> -> You can also use `spacy.explain` to get the description for the string -> representation of an entity label. For example, `spacy.explain("LANGUAGE")` -> will return "any named language". 
- -Models trained on the [OntoNotes 5](https://catalog.ldc.upenn.edu/LDC2013T19) -corpus support the following entity types: - -| Type | Description | -| ------------- | ---------------------------------------------------- | -| `PERSON` | People, including fictional. | -| `NORP` | Nationalities or religious or political groups. | -| `FAC` | Buildings, airports, highways, bridges, etc. | -| `ORG` | Companies, agencies, institutions, etc. | -| `GPE` | Countries, cities, states. | -| `LOC` | Non-GPE locations, mountain ranges, bodies of water. | -| `PRODUCT` | Objects, vehicles, foods, etc. (Not services.) | -| `EVENT` | Named hurricanes, battles, wars, sports events, etc. | -| `WORK_OF_ART` | Titles of books, songs, etc. | -| `LAW` | Named documents made into laws. | -| `LANGUAGE` | Any named language. | -| `DATE` | Absolute or relative dates or periods. | -| `TIME` | Times smaller than a day. | -| `PERCENT` | Percentage, including "%". | -| `MONEY` | Monetary values, including unit. | -| `QUANTITY` | Measurements, as of weight or distance. | -| `ORDINAL` | "first", "second", etc. | -| `CARDINAL` | Numerals that do not fall under another type. | - -### Wikipedia scheme {#ner-wikipedia-scheme} - -Models trained on Wikipedia corpus -([Nothman et al., 2013](http://www.sciencedirect.com/science/article/pii/S0004370212000276)) -use a less fine-grained NER annotation scheme and recognise the following -entities: - -| Type | Description | -| ------ | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `PER` | Named person or family. | -| `LOC` | Name of politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains). | -| `ORG` | Named corporate, governmental, or other organizational entity. | -| `MISC` | Miscellaneous entities, e.g. events, nationalities, products or works of art. | - -### IOB Scheme {#iob} - -| Tag | ID | Description | -| ----- | --- | ------------------------------------- | -| `"I"` | `1` | Token is inside an entity. | -| `"O"` | `2` | Token is outside an entity. | -| `"B"` | `3` | Token begins an entity. | -| `""` | `0` | No entity tag is set (missing value). | - -### BILUO Scheme {#biluo} - -| Tag | Description | -| ----------- | ---------------------------------------- | -| **`B`**EGIN | The first token of a multi-token entity. | -| **`I`**N | An inner token of a multi-token entity. | -| **`L`**AST | The final token of a multi-token entity. | -| **`U`**NIT | A single-token entity. | -| **`O`**UT | A non-entity token. | - -> #### Why BILUO, not IOB? -> -> There are several coding schemes for encoding entity annotations as token -> tags. These coding schemes are equally expressive, but not necessarily equally -> learnable. [Ratinov and Roth](http://www.aclweb.org/anthology/W09-1119) showed -> that the minimal **Begin**, **In**, **Out** scheme was more difficult to learn -> than the **BILUO** scheme that we use, which explicitly marks boundary tokens. - -spaCy translates the character offsets into this scheme, in order to decide the -cost of each action given the current state of the entity recognizer. The costs -are then used to calculate the gradient of the loss, to train the model. The -exact algorithm is a pastiche of well-known methods, and is not currently -described in any single publication. 
The model is a greedy transition-based -parser guided by a linear model whose weights are learned using the averaged -perceptron loss, via the -[dynamic oracle](http://www.aclweb.org/anthology/C12-1059) imitation learning -strategy. The transition system is equivalent to the BILUO tagging scheme. - -## Models and training data {#training} - -### JSON input format for training {#json-input} - -spaCy takes training data in JSON format. The built-in -[`convert`](/api/cli#convert) command helps you convert the `.conllu` format -used by the -[Universal Dependencies corpora](https://github.com/UniversalDependencies) to -spaCy's training format. To convert one or more existing `Doc` objects to -spaCy's JSON format, you can use the -[`gold.docs_to_json`](/api/goldparse#docs_to_json) helper. - -> #### Annotating entities -> -> Named entities are provided in the [BILUO](#biluo) notation. Tokens outside an -> entity are set to `"O"` and tokens that are part of an entity are set to the -> entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes -> the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token -> representing a `PERSON` entity. The -> [`biluo_tags_from_offsets`](/api/goldparse#biluo_tags_from_offsets) function -> can help you convert entity offsets to the right format. - -```python -### Example structure -[{ - "id": int, # ID of the document within the corpus - "paragraphs": [{ # list of paragraphs in the corpus - "raw": string, # raw text of the paragraph - "sentences": [{ # list of sentences in the paragraph - "tokens": [{ # list of tokens in the sentence - "id": int, # index of the token in the document - "dep": string, # dependency label - "head": int, # offset of token head relative to token index - "tag": string, # part-of-speech tag - "orth": string, # verbatim text of the token - "ner": string # BILUO label, e.g. "O" or "B-ORG" - }], - "brackets": [{ # phrase structure (NOT USED by current models) - "first": int, # index of first token - "last": int, # index of last token - "label": string # phrase label - }] - }], - "cats": [{ # new in v2.2: categories for text classifier - "label": string, # text category label - "value": float / bool # label applies (1.0/true) or not (0.0/false) - }] - }] -}] -``` - -Here's an example of dependencies, part-of-speech tags and names entities, taken -from the English Wall Street Journal portion of the Penn Treebank: - -```json -https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json -``` - -### Lexical data for vocabulary {#vocab-jsonl new="2"} - -To populate a model's vocabulary, you can use the -[`spacy init-model`](/api/cli#init-model) command and load in a -[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one -lexical entry per line via the `--jsonl-loc` option. The first line defines the -language and vocabulary settings. All other lines are expected to be JSON -objects describing an individual lexeme. The lexical attributes will be then set -as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The `vocab` -command outputs a ready-to-use spaCy model with a `Vocab` containing the lexical -data. 
- -```python -### First line -{"lang": "en", "settings": {"oov_prob": -20.502029418945312}} -``` - -```python -### Entry structure -{ - "orth": string, # the word text - "id": int, # can correspond to row in vectors table - "lower": string, - "norm": string, - "shape": string - "prefix": string, - "suffix": string, - "length": int, - "cluster": string, - "prob": float, - "is_alpha": bool, - "is_ascii": bool, - "is_digit": bool, - "is_lower": bool, - "is_punct": bool, - "is_space": bool, - "is_title": bool, - "is_upper": bool, - "like_url": bool, - "like_num": bool, - "like_email": bool, - "is_stop": bool, - "is_oov": bool, - "is_quote": bool, - "is_left_punct": bool, - "is_right_punct": bool -} -``` - -Here's an example of the 20 most frequent lexemes in the English training data: - -```json -https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl -``` diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 8dccad165..e6036d5be 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -97,7 +97,7 @@ $ python -m spacy validate ## Convert {#convert} Convert files into spaCy's -[binary training data format](/usage/training#data-format), a serialized +[binary training data format](/api/data-formats#binary-training), a serialized [`DocBin`](/api/docbin), for use with the `train` command and other experiment management functions. The converter can be specified on the command line, or chosen based on the file extension of the input file. @@ -128,7 +128,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] | ID | Description | | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `auto` | Automatically pick converter based on file extension and file content (default). | -| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/goldparse#docs_to_json). | +| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/top-level#docs_to_json). | | `conll` | Universal Dependencies `.conllu` or `.conll` format. | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | | `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | @@ -300,8 +300,8 @@ will not be available. Train a model. Expects data in spaCy's -[JSON format](/api/annotation#json-input). On each epoch, a model will be saved -out to the directory. Accuracy scores and model details will be added to a +[JSON format](/api/data-formats#json-input). On each epoch, a model will be +saved out to the directory. 
Accuracy scores and model details will be added to a [`meta.json`](/usage/training#models-generating) to allow packaging the model using the [`package`](/api/cli#package) command. @@ -438,7 +438,7 @@ tokenization can be provided. Create a new model directory from raw data, like word frequencies, Brown clusters and word vectors. This command is similar to the `spacy model` command in v1.x. Note that in order to populate the model's vocab, you need to pass in a -JSONL-formatted [vocabulary file](<(/api/annotation#vocab-jsonl)>) as +JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional `id` values that correspond to the vectors table. Just loading in vectors will not automatically populate the vocab. @@ -451,7 +451,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] | ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | | `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. | | `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | | `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | | `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md new file mode 100644 index 000000000..5b122a2e2 --- /dev/null +++ b/website/docs/api/data-formats.md @@ -0,0 +1,130 @@ +--- +title: Data formats +teaser: Details on spaCy's input and output data formats +menu: + - ['Training data', 'training'] + - ['Vocabulary', 'vocab'] +--- + +This section documents input and output formats of data used by spaCy, including +training data and lexical vocabulary data. For an overview of label schemes used +by the models, see the [models directory](/models). Each model documents the +label schemes used in its components, depending on the data it was trained on. + +## Training data {#training} + +### Binary training format {#binary-training new="3"} + + + +### JSON input format for training {#json-input} + +spaCy takes training data in JSON format. The built-in +[`convert`](/api/cli#convert) command helps you convert the `.conllu` format +used by the +[Universal Dependencies corpora](https://github.com/UniversalDependencies) to +spaCy's training format. To convert one or more existing `Doc` objects to +spaCy's JSON format, you can use the +[`gold.docs_to_json`](/api/top-level#docs_to_json) helper. 
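+
+For instance, assuming `nlp` is a loaded pipeline, a single `Doc` can be
+converted like this (a minimal sketch; the same helper is documented under
+[`gold.docs_to_json`](/api/top-level#docs_to_json)):
+
+```python
+from spacy.gold import docs_to_json
+
+doc = nlp("I like London")
+# docs_to_json returns a dict in the JSON training format described below
+json_data = docs_to_json([doc])
+```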
+
+> #### Annotating entities
+>
+> Named entities are provided in the
+> [BILUO](/usage/linguistic-features#accessing-ner) notation. Tokens outside an
+> entity are set to `"O"` and tokens that are part of an entity are set to the
+> entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
+> the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
+> representing a `PERSON` entity. The
+> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function
+> can help you convert entity offsets to the right format.
+
+```python
+### Example structure
+[{
+    "id": int,                      # ID of the document within the corpus
+    "paragraphs": [{                # list of paragraphs in the corpus
+        "raw": string,              # raw text of the paragraph
+        "sentences": [{             # list of sentences in the paragraph
+            "tokens": [{            # list of tokens in the sentence
+                "id": int,          # index of the token in the document
+                "dep": string,      # dependency label
+                "head": int,        # offset of token head relative to token index
+                "tag": string,      # part-of-speech tag
+                "orth": string,     # verbatim text of the token
+                "ner": string       # BILUO label, e.g. "O" or "B-ORG"
+            }],
+            "brackets": [{          # phrase structure (NOT USED by current models)
+                "first": int,       # index of first token
+                "last": int,        # index of last token
+                "label": string     # phrase label
+            }]
+        }],
+        "cats": [{                  # new in v2.2: categories for text classifier
+            "label": string,        # text category label
+            "value": float / bool   # label applies (1.0/true) or not (0.0/false)
+        }]
+    }]
+}]
+```
+
+Here's an example of dependencies, part-of-speech tags and named entities, taken
+from the English Wall Street Journal portion of the Penn Treebank:
+
+```json
+https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json
+```
+
+## Lexical data for vocabulary {#vocab-jsonl new="2"}
+
+To populate a model's vocabulary, you can use the
+[`spacy init-model`](/api/cli#init-model) command and load in a
+[newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one
+lexical entry per line via the `--jsonl-loc` option. The first line defines the
+language and vocabulary settings. All other lines are expected to be JSON
+objects describing an individual lexeme. The lexical attributes will then be set
+as attributes on spaCy's [`Lexeme`](/api/lexeme#attributes) object. The
+`init-model` command outputs a ready-to-use spaCy model with a `Vocab`
+containing the lexical data.
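+
+As a rough sketch (with purely illustrative attribute values and file paths),
+such a file could be written with the standard library and then passed to
+[`spacy init-model`](/api/cli#init-model) via `--jsonl-loc`, following the
+format reference below:
+
+```python
+import json
+
+# First line: language and vocabulary settings; remaining lines: one lexeme each.
+# All values here are illustrative only.
+entries = [
+    {"lang": "en", "settings": {"oov_prob": -20.502029418945312}},
+    {"orth": "the", "id": 0, "lower": "the", "prob": -3.0, "is_alpha": True, "is_stop": True},
+    {"orth": ",", "id": 1, "lower": ",", "prob": -3.5, "is_punct": True},
+]
+with open("vocab.jsonl", "w", encoding="utf8") as f:
+    for entry in entries:
+        f.write(json.dumps(entry) + "\n")
+
+# e.g. python -m spacy init-model en ./output_dir --jsonl-loc ./vocab.jsonl
+```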
+ +```python +### First line +{"lang": "en", "settings": {"oov_prob": -20.502029418945312}} +``` + +```python +### Entry structure +{ + "orth": string, # the word text + "id": int, # can correspond to row in vectors table + "lower": string, + "norm": string, + "shape": string + "prefix": string, + "suffix": string, + "length": int, + "cluster": string, + "prob": float, + "is_alpha": bool, + "is_ascii": bool, + "is_digit": bool, + "is_lower": bool, + "is_punct": bool, + "is_space": bool, + "is_title": bool, + "is_upper": bool, + "like_url": bool, + "like_num": bool, + "like_email": bool, + "is_stop": bool, + "is_oov": bool, + "is_quote": bool, + "is_left_punct": bool, + "is_right_punct": bool +} +``` + +Here's an example of the 20 most frequent lexemes in the English training data: + +```json +https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl +``` diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md deleted file mode 100644 index 85b62e074..000000000 --- a/website/docs/api/goldparse.md +++ /dev/null @@ -1,206 +0,0 @@ ---- -title: GoldParse -teaser: A collection for training annotations -tag: class -source: spacy/gold.pyx ---- - -## GoldParse.\_\_init\_\_ {#init tag="method"} - -Create a `GoldParse`. The [`TextCategorizer`](/api/textcategorizer) component -expects true examples of a label to have the value `1.0`, and negative examples -of a label to have the value `0.0`. Labels not in the dictionary are treated as -missing – the gradient for those labels will be zero. - -| Name | Type | Description | -| ----------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document the annotations refer to. | -| `words` | iterable | A sequence of word strings. | -| `tags` | iterable | A sequence of strings, representing tag annotations. | -| `heads` | iterable | A sequence of integers, representing syntactic head offsets. | -| `deps` | iterable | A sequence of strings, representing the syntactic relation types. | -| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | -| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | -| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | -| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False`. | -| **RETURNS** | `GoldParse` | The newly constructed object. | - -## GoldParse.\_\_len\_\_ {#len tag="method"} - -Get the number of gold-standard tokens. - -| Name | Type | Description | -| ----------- | ---- | ----------------------------------- | -| **RETURNS** | int | The number of gold-standard tokens. | - -## GoldParse.is_projective {#is_projective tag="property"} - -Whether the provided syntactic annotations form a projective dependency tree. 
- -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------- | -| **RETURNS** | bool | Whether annotations form projective tree. | - -## Attributes {#attributes} - -| Name | Type | Description | -| ------------------------------------ | ---- | ------------------------------------------------------------------------------------------------------------------------ | -| `words` | list | The words. | -| `tags` | list | The part-of-speech tag annotations. | -| `heads` | list | The syntactic head annotations. | -| `labels` | list | The syntactic relation-type annotations. | -| `ner` | list | The named entity annotations as BILUO tags. | -| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | -| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | -| `cats` 2 | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. | -| `links` 2.2 | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. | - -## Utilities {#util} - -### gold.docs_to_json {#docs_to_json tag="function"} - -Convert a list of Doc objects into the -[JSON-serializable format](/api/annotation#json-input) used by the -[`spacy train`](/api/cli#train) command. Each input doc will be treated as a -'paragraph' in the output doc. - -> #### Example -> -> ```python -> from spacy.gold import docs_to_json -> -> doc = nlp("I like London") -> json_data = docs_to_json([doc]) -> ``` - -| Name | Type | Description | -| ----------- | ---------------- | ------------------------------------------ | -| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. | -| `id` | int | ID to assign to the JSON. Defaults to `0`. | -| **RETURNS** | dict | The data in spaCy's JSON format. | - -### gold.align {#align tag="function"} - -Calculate alignment tables between two tokenizations, using the Levenshtein -algorithm. The alignment is case-insensitive. - - - -The current implementation of the alignment algorithm assumes that both -tokenizations add up to the same string. For example, you'll be able to align -`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not -`["I", "'m"]` and `["I", "am"]`. - - - -> #### Example -> -> ```python -> from spacy.gold import align -> -> bert_tokens = ["obama", "'", "s", "podcast"] -> spacy_tokens = ["obama", "'s", "podcast"] -> alignment = align(bert_tokens, spacy_tokens) -> cost, a2b, b2a, a2b_multi, b2a_multi = alignment -> ``` - -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------- | -| `tokens_a` | list | String values of candidate tokens to align. | -| `tokens_b` | list | String values of reference tokens to align. | -| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. | - -The returned tuple contains the following alignment information: - -> #### Example -> -> ```python -> a2b = array([0, -1, -1, 2]) -> b2a = array([0, 2, 3]) -> a2b_multi = {1: 1, 2: 1} -> b2a_multi = {} -> ``` -> -> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If -> there's no one-to-one alignment for a token, it has the value `-1`. 
- -| Name | Type | Description | -| ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `cost` | int | The number of misaligned tokens. | -| `a2b` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`. | -| `b2a` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`. | -| `a2b_multi` | dict | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. | -| `b2a_multi` | dict | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. | - -### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} - -Encode labelled spans into per-token tags, using the -[BILUO scheme](/api/annotation#biluo) (Begin, In, Last, Unit, Out). Returns a -list of strings, describing the tags. Each tag string will be of the form of -either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, -`"L"`, `"U"`. The string `"-"` is used where the entity offsets don't align with -the tokenization in the `Doc` object. The training algorithm will view these as -missing values. `O` denotes a non-entity token. `B` denotes the beginning of a -multi-token entity, `I` the inside of an entity of three or more tokens, and `L` -the end of an entity of two or more tokens. `U` denotes a single-token entity. - -> #### Example -> -> ```python -> from spacy.gold import biluo_tags_from_offsets -> -> doc = nlp("I like London.") -> entities = [(7, 13, "LOC")] -> tags = biluo_tags_from_offsets(doc, entities) -> assert tags == ["O", "O", "U-LOC", "O"] -> ``` - -| Name | Type | Description | -| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | -| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | -| **RETURNS** | list | str strings, describing the [BILUO](/api/annotation#biluo) tags. | - -### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} - -Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into -entity offsets. - -> #### Example -> -> ```python -> from spacy.gold import offsets_from_biluo_tags -> -> doc = nlp("I like London.") -> tags = ["O", "O", "U-LOC", "O"] -> entities = offsets_from_biluo_tags(doc, tags) -> assert entities == [(7, 13, "LOC")] -> ``` - -| Name | Type | Description | -| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the BILUO tags refer to. | -| `entities` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. 
| -| **RETURNS** | list | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. | - -### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} - -Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into -[`Span`](/api/span) objects. This can be used to create entity spans from -token-based tags, e.g. to overwrite the `doc.ents`. - -> #### Example -> -> ```python -> from spacy.gold import spans_from_biluo_tags -> -> doc = nlp("I like London.") -> tags = ["O", "O", "U-LOC", "O"] -> doc.ents = spans_from_biluo_tags(doc, tags) -> ``` - -| Name | Type | Description | -| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the BILUO tags refer to. | -| `entities` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | -| **RETURNS** | list | A sequence of `Span` objects with added entity labels. | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 5f7130038..9094b46d3 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -252,10 +252,10 @@ If a setting is not present in the options, the default value will be used. | `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | | `template` 2.2 | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | -By default, displaCy comes with colors for all -[entity types supported by spaCy](/api/annotation#named-entities). If you're -using custom entity types, you can use the `colors` setting to add your own -colors for them. Your application or model package can also expose a +By default, displaCy comes with colors for all entity types used by +[spaCy models](/models). If you're using custom entity types, you can use the +`colors` setting to add your own colors for them. Your application or model +package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. @@ -264,7 +264,7 @@ to add custom labels and their colors automatically. ### gold.docs_to_json {#docs_to_json tag="function"} Convert a list of Doc objects into the -[JSON-serializable format](/api/annotation#json-input) used by the +[JSON-serializable format](/api/data-formats#json-input) used by the [`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc. @@ -339,14 +339,15 @@ The returned tuple contains the following alignment information: ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the -[BILUO scheme](/api/annotation#biluo) (Begin, In, Last, Unit, Out). Returns a -list of strings, describing the tags. Each tag string will be of the form of -either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, -`"L"`, `"U"`. 
The string `"-"` is used where the entity offsets don't align with -the tokenization in the `Doc` object. The training algorithm will view these as -missing values. `O` denotes a non-entity token. `B` denotes the beginning of a -multi-token entity, `I` the inside of an entity of three or more tokens, and `L` -the end of an entity of two or more tokens. `U` denotes a single-token entity. +[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, +Out). Returns a list of strings, describing the tags. Each tag string will be of +the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of +`"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets +don't align with the tokenization in the `Doc` object. The training algorithm +will view these as missing values. `O` denotes a non-entity token. `B` denotes +the beginning of a multi-token entity, `I` the inside of an entity of three or +more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a +single-token entity. > #### Example > @@ -363,12 +364,12 @@ the end of an entity of two or more tokens. `U` denotes a single-token entity. | ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | | `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | -| **RETURNS** | list | str strings, describing the [BILUO](/api/annotation#biluo) tags. | +| **RETURNS** | list | str strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. | ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} -Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into -entity offsets. +Encode per-token tags following the +[BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. > #### Example > @@ -381,15 +382,16 @@ entity offsets. > assert entities == [(7, 13, "LOC")] > ``` -| Name | Type | Description | -| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the BILUO tags refer to. | -| `entities` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | -| **RETURNS** | list | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. | +| Name | Type | Description | +| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the BILUO tags refer to. | +| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. 
Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | +| **RETURNS** | list | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. | ### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} -Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into +Encode per-token tags following the +[BILUO scheme](/usage/linguistic-features#accessing-ner) into [`Span`](/api/span) objects. This can be used to create entity spans from token-based tags, e.g. to overwrite the `doc.ents`. @@ -403,11 +405,11 @@ token-based tags, e.g. to overwrite the `doc.ents`. > doc.ents = spans_from_biluo_tags(doc, tags) > ``` -| Name | Type | Description | -| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the BILUO tags refer to. | -| `entities` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | -| **RETURNS** | list | A sequence of `Span` objects with added entity labels. | +| Name | Type | Description | +| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the BILUO tags refer to. | +| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | +| **RETURNS** | list | A sequence of `Span` objects with added entity labels. | ## Utility functions {#util source="spacy/util.py"} diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 31bc3c549..10910b93b 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -7,6 +7,8 @@ menu: - ['Conventions', 'conventions'] --- + + The models directory includes two types of pretrained models: 1. **Core models:** General-purpose pretrained models to predict named entities, @@ -32,111 +34,6 @@ For more details on how to use models with spaCy, see the -## Model architecture {#architecture} - -spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity -recognition**. The models have been designed and implemented from scratch -specifically for spaCy, to give you an unmatched balance of speed, size and -accuracy. A novel bloom embedding strategy with subword features is used to -support huge vocabularies in tiny tables. Convolutional layers with residual -connections, layer normalization and maxout non-linearity are used, giving much -better efficiency than the standard BiLSTM solution. - -The parser and NER use an imitation learning objective to deliver **accuracy -in-line with the latest research systems**, even when evaluated from raw text. 
-With these innovations, spaCy v2.0's models are **10× smaller**, **20% more -accurate**, and **even cheaper to run** than the previous generation. The -current architecture hasn't been published yet, but in the meantime we prepared -a video that explains how the models work, with particular focus on NER. - - - -The parsing model is a blend of recent results. The two recent inspirations have -been the work of Eli Klipperwasser and Yoav Goldberg at Bar Ilan[^1], and the -SyntaxNet team from Google. The foundation of the parser is still based on the -work of Joakim Nivre[^2], who introduced the transition-based framework[^3], the -arc-eager transition system, and the imitation learning objective. The model is -implemented using [Thinc](https://github.com/explosion/thinc), spaCy's machine -learning library. We first predict context-sensitive vectors for each word in -the input: - -```python -(embed_lower | embed_prefix | embed_suffix | embed_shape) - >> Maxout(token_width) - >> convolution ** 4 -``` - -This convolutional layer is shared between the tagger, parser and NER, and will -also be shared by the future neural lemmatizer. Because the parser shares these -layers with the tagger, the parser does not require tag features. I got this -trick from David Weiss's "Stack Combination" paper[^4]. - -To boost the representation, the tagger actually predicts a "super tag" with -POS, morphology and dependency label[^5]. The tagger predicts these supertags by -adding a softmax layer onto the convolutional layer – so, we're teaching the -convolutional layer to give us a representation that's one affine transform from -this informative lexical information. This is obviously good for the parser -(which backprops to the convolutions, too). The parser model makes a state -vector by concatenating the vector representations for its context tokens. The -current context tokens: - -| Context tokens | Description | -| ---------------------------------------------------------------------------------- | --------------------------------------------------------------------------- | -| `S0`, `S1`, `S2` | Top three words on the stack. | -| `B0`, `B1` | First two words of the buffer. | -| `S0L1`, `S1L1`, `S2L1`, `B0L1`, `B1L1`
`S0L2`, `S1L2`, `S2L2`, `B0L2`, `B1L2` | Leftmost and second leftmost children of `S0`, `S1`, `S2`, `B0` and `B1`. | -| `S0R1`, `S1R1`, `S2R1`, `B0R1`, `B1R1`
`S0R2`, `S1R2`, `S2R2`, `B0R2`, `B1R2` | Rightmost and second rightmost children of `S0`, `S1`, `S2`, `B0` and `B1`. | - -This makes the state vector quite long: `13*T`, where `T` is the token vector -width (128 is working well). Fortunately, there's a way to structure the -computation to save some expense (and make it more GPU-friendly). - -The parser typically visits `2*N` states for a sentence of length `N` (although -it may visit more, if it back-tracks with a non-monotonic transition[^4]). A -naive implementation would require `2*N (B, 13*T) @ (13*T, H)` matrix -multiplications for a batch of size `B`. We can instead perform one -`(B*N, T) @ (T, 13*H)` multiplication, to pre-compute the hidden weights for -each positional feature with respect to the words in the batch. (Note that our -token vectors come from the CNN — so we can't play this trick over the -vocabulary. That's how Stanford's NN parser[^3] works — and why its model is so -big.) - -This pre-computation strategy allows a nice compromise between GPU-friendliness -and implementation simplicity. The CNN and the wide lower layer are computed on -the GPU, and then the precomputed hidden weights are moved to the CPU, before we -start the transition-based parsing process. This makes a lot of things much -easier. We don't have to worry about variable-length batch sizes, and we don't -have to implement the dynamic oracle in CUDA to train. - -Currently the parser's loss function is multi-label log loss[^6], as the dynamic -oracle allows multiple states to be 0 cost. This is defined as follows, where -`gZ` is the sum of the scores assigned to gold classes: - -```python -(exp(score) / Z) - (exp(score) / gZ) -``` - - - -1. [Simple and Accurate Dependency Parsing Using Bidirectional LSTM Feature Representations {#fn-1}](https://www.semanticscholar.org/paper/Simple-and-Accurate-Dependency-Parsing-Using-Bidir-Kiperwasser-Goldberg/3cf31ecb2724b5088783d7c96a5fc0d5604cbf41). - Eliyahu Kiperwasser, Yoav Goldberg. (2016) -2. [A Dynamic Oracle for Arc-Eager Dependency Parsing {#fn-2}](https://www.semanticscholar.org/paper/A-Dynamic-Oracle-for-Arc-Eager-Dependency-Parsing-Goldberg-Nivre/22697256ec19ecc3e14fcfc63624a44cf9c22df4). - Yoav Goldberg, Joakim Nivre (2012) -3. [Parsing English in 500 Lines of Python {#fn-3}](https://explosion.ai/blog/parsing-english-in-python). - Matthew Honnibal (2013) -4. [Stack-propagation: Improved Representation Learning for Syntax {#fn-4}](https://www.semanticscholar.org/paper/Stack-propagation-Improved-Representation-Learning-Zhang-Weiss/0c133f79b23e8c680891d2e49a66f0e3d37f1466). - Yuan Zhang, David Weiss (2016) -5. [Deep multi-task learning with low level tasks supervised at lower layers {#fn-5}](https://www.semanticscholar.org/paper/Deep-multi-task-learning-with-low-level-tasks-supe-S%C3%B8gaard-Goldberg/03ad06583c9721855ccd82c3d969a01360218d86). - Anders Søgaard, Yoav Goldberg (2016) -6. [An Improved Non-monotonic Transition System for Dependency Parsing {#fn-6}](https://www.semanticscholar.org/paper/An-Improved-Non-monotonic-Transition-System-for-De-Honnibal-Johnson/4094cee47ade13b77b5ab4d2e6cb9dd2b8a2917c). - Matthew Honnibal, Mark Johnson (2015) -7. [A Fast and Accurate Dependency Parser using Neural Networks {#fn-7}](http://cs.stanford.edu/people/danqi/papers/emnlp2014.pdf). - Danqi Cheng, Christopher D. Manning (2014) -8. 
[Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques {#fn-8}](https://www.semanticscholar.org/paper/Parsing-the-Wall-Street-Journal-using-a-Lexical-Fu-Riezler-King/0ad07862a91cd59b7eb5de38267e47725a62b8b2).
-   Stefan Riezler et al. (2002)
-
-
-
 ## Model naming conventions {#conventions}
 
 In general, spaCy expects all model packages to follow the naming convention of
diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md
index 0dfee8636..36ef07111 100644
--- a/website/docs/usage/101/_named-entities.md
+++ b/website/docs/usage/101/_named-entities.md
@@ -1,10 +1,9 @@
 A named entity is a "real-world object" that's assigned a name – for example, a
-person, a country, a product or a book title. spaCy can **recognize
-[various types](/api/annotation#named-entities)** of named entities in a
-document, by asking the model for a **prediction**. Because models are
-statistical and strongly depend on the examples they were trained on, this
-doesn't always work _perfectly_ and might need some tuning later, depending on
-your use case.
+person, a country, a product or a book title. spaCy can **recognize various
+types of named entities** in a document, by asking the model for a
+**prediction**. Because models are statistical and strongly depend on the
+examples they were trained on, this doesn't always work _perfectly_ and might
+need some tuning later, depending on your use case.
 
 Named entities are available as the `ents` property of a `Doc`:
 
diff --git a/website/docs/usage/examples.md b/website/docs/usage/examples.md
deleted file mode 100644
index 854b2d42b..000000000
--- a/website/docs/usage/examples.md
+++ /dev/null
@@ -1,207 +0,0 @@
----
-title: Examples
-teaser: Full code examples you can modify and run
-menu:
-  - ['Information Extraction', 'information-extraction']
-  - ['Pipeline', 'pipeline']
-  - ['Training', 'training']
-  - ['Vectors & Similarity', 'vectors']
-  - ['Deep Learning', 'deep-learning']
----
-
-## Information Extraction {#information-extraction hidden="true"}
-
-### Using spaCy's phrase matcher {#phrase-matcher new="2"}
-
-This example shows how to use the new [`PhraseMatcher`](/api/phrasematcher) to
-efficiently find entities from a large terminology list.
-
-```python
-https://github.com/explosion/spaCy/tree/master/examples/information_extraction/phrase_matcher.py
-```
-
-### Extracting entity relations {#entity-relations}
-
-A simple example of extracting relations between phrases and entities using
-spaCy's named entity recognizer and the dependency parse. Here, we extract money
-and currency values (entities labelled as `MONEY`) and then check the dependency
-tree to find the noun phrase they are referring to – for example:
-`"$9.4 million"` → `"Net income"`.
-
-```python
-https://github.com/explosion/spaCy/tree/master/examples/information_extraction/entity_relations.py
-```
-
-### Navigating the parse tree and subtrees {#subtrees}
-
-This example shows how to navigate the parse tree including subtrees attached to
-a word.
- -```python -https://github.com/explosion/spaCy/tree/master/examples/information_extraction/parse_subtrees.py -``` - -## Pipeline {#pipeline hidden="true"} - -### Custom pipeline components and attribute extensions {#custom-components-entities new="2"} - -This example shows the implementation of a pipeline component that sets entity -annotations based on a list of single or multiple-word company names, merges -entities into one token and sets custom attributes on the `Doc`, `Span` and -`Token`. - -```python -https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_entities.py -``` - -### Custom pipeline components and attribute extensions via a REST API {#custom-components-api new="2"} - -This example shows the implementation of a pipeline component that fetches -country meta data via the [REST Countries API](https://restcountries.eu) sets -entity annotations for countries, merges entities into one token and sets custom -attributes on the `Doc`, `Span` and `Token` – for example, the capital, -latitude/longitude coordinates and the country flag. - -```python -https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_countries_api.py -``` - -### Custom method extensions {#custom-components-attr-methods new="2"} - -A collection of snippets showing examples of extensions adding custom methods to -the `Doc`, `Token` and `Span`. - -```python -https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_attr_methods.py -``` - -### Multi-processing with Joblib {#multi-processing} - -This example shows how to use multiple cores to process text using spaCy and -[Joblib](https://joblib.readthedocs.io/en/latest/). We're exporting -part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with -each "sentence" on a newline, and spaces between tokens. Data is loaded from the -IMDB movie reviews dataset and will be loaded automatically via Thinc's built-in -dataset loader. - -```python -https://github.com/explosion/spaCy/tree/master/examples/pipeline/multi_processing.py -``` - -## Training {#training hidden="true"} - -### Training spaCy's Named Entity Recognizer {#training-ner} - -This example shows how to update spaCy's entity recognizer with your own -examples, starting off with an existing, pretrained model, or from scratch -using a blank `Language` class. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py -``` - -### Training an additional entity type {#new-entity-type} - -This script shows how to add a new entity type to an existing pretrained NER -model. To keep the example short and simple, only four sentences are provided as -examples. In practice, you'll need many more — a few hundred would be a good -start. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entity_type.py -``` - -### Creating a Knowledge Base for Named Entity Linking {#kb} - -This example shows how to create a knowledge base in spaCy, -which is needed to implement entity linking functionality. -It requires as input a spaCy model with pretrained word vectors, -and it stores the KB to file (if an `output_dir` is provided). - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py -``` - -### Training spaCy's Named Entity Linker {#nel} - -This example shows how to train spaCy's entity linker with your own custom -examples, starting off with a predefined knowledge base and its vocab, -and using a blank `English` class. 
- -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py -``` - -### Training spaCy's Dependency Parser {#parser} - -This example shows how to update spaCy's dependency parser, starting off with an -existing, pretrained model, or from scratch using a blank `Language` class. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py -``` - -### Training spaCy's Part-of-speech Tagger {#tagger} - -In this example, we're training spaCy's part-of-speech tagger with a custom tag -map, mapping our own tags to the mapping those tags to the -[Universal Dependencies scheme](http://universaldependencies.github.io/docs/u/pos/index.html). - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_tagger.py -``` - -### Training a custom parser for chat intent semantics {#intent-parser} - -spaCy's parser component can be used to trained to predict any type of tree -structure over your input text. You can also predict trees over whole documents -or chat logs, with connections between the sentence-roots used to annotate -discourse structure. In this example, we'll build a message parser for a common -"chat intent": finding local businesses. Our message semantics will have the -following types of relations: `ROOT`, `PLACE`, `QUALITY`, `ATTRIBUTE`, `TIME` -and `LOCATION`. - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_intent_parser.py -``` - -### Training spaCy's text classifier {#textcat new="2"} - -This example shows how to train a multi-label convolutional neural network text -classifier on IMDB movie reviews, using spaCy's new -[`TextCategorizer`](/api/textcategorizer) component. The dataset will be loaded -automatically via Thinc's built-in dataset loader. Predictions are available via -[`Doc.cats`](/api/doc#attributes). - -```python -https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.py -``` - -## Vectors {#vectors hidden="true"} - -### Visualizing spaCy vectors in TensorBoard {#tensorboard} - -This script lets you load any spaCy model containing word vectors into -[TensorBoard](https://projector.tensorflow.org/) to create an -[embedding visualization](https://github.com/tensorflow/tensorboard/blob/master/docs/tensorboard_projector_plugin.ipynb). - -```python -https://github.com/explosion/spaCy/tree/master/examples/vectors_tensorboard.py -``` - -## Deep Learning {#deep-learning hidden="true"} - -### Text classification with Keras {#keras} - -This example shows how to use a [Keras](https://keras.io) LSTM sentiment -classification model in spaCy. spaCy splits the document into sentences, and -each sentence is classified using the LSTM. The scores for the sentences are -then aggregated to give the document score. This kind of hierarchical model is -quite difficult in "pure" Keras or TensorFlow, but it's very effective. The -Keras example on this dataset performs quite poorly, because it cuts off the -documents so that they're a fixed size. This hurts review accuracy a lot, -because people often summarize their rating in the final sentence. - -```python -https://github.com/explosion/spaCy/tree/master/examples/deep_learning_keras.py -``` diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 051d6a060..bda9f76d6 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -370,23 +370,6 @@ from is called `spacy`. So, when using spaCy, never call anything else `spacy`. 
-
-
-```python
-doc = nlp("They are")
-print(doc[0].lemma_)
-# -PRON-
-```
-
-This is in fact expected behavior and not a bug. Unlike verbs and common nouns,
-there's no clear base form of a personal pronoun. Should the lemma of "me" be
-"I", or should we normalize person as well, giving "it" — or maybe "he"? spaCy's
-solution is to introduce a novel symbol, `-PRON-`, which is used as the lemma
-for all personal pronouns. For more info on this, see the
-[lemmatization specs](/api/annotation#lemmatization).
-
-
-
 
 
 If your training data only contained new entities and you didn't mix in any
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index faa6dc850..99612a6bb 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -2,11 +2,11 @@
 title: Linguistic Features
 next: /usage/rule-based-matching
 menu:
-  - ['Tokenization', 'tokenization']
   - ['POS Tagging', 'pos-tagging']
   - ['Dependency Parse', 'dependency-parse']
   - ['Named Entities', 'named-entities']
   - ['Entity Linking', 'entity-linking']
+  - ['Tokenization', 'tokenization']
   - ['Merging & Splitting', 'retokenization']
   - ['Sentence Segmentation', 'sbd']
   - ['Language data', 'language-data']
@@ -31,8 +31,8 @@ import PosDeps101 from 'usage/101/\_pos-deps.md'
 
 
 For a list of the fine-grained and coarse-grained part-of-speech tags assigned
-by spaCy's models across different languages, see the
-[POS tag scheme documentation](/api/annotation#pos-tagging).
+by spaCy's models across different languages, see the label schemes documented
+in the [models directory](/models).
 
 
 
@@ -290,8 +290,8 @@ for token in doc:
 
 
 For a list of the syntactic dependency labels assigned by spaCy's models across
-different languages, see the
-[dependency label scheme documentation](/api/annotation#dependency-parsing).
+different languages, see the label schemes documented in the
+[models directory](/models).
 
 
 
@@ -354,7 +354,7 @@ import NER101 from 'usage/101/\_named-entities.md'
 
 
 
-### Accessing entity annotations {#accessing}
+### Accessing entity annotations and labels {#accessing-ner}
 
 The standard way to access entity annotations is the [`doc.ents`](/api/doc#ents)
 property, which produces a sequence of [`Span`](/api/span) objects. The entity
@@ -371,9 +371,17 @@ on a token, it will return an empty string.
 
 > #### IOB Scheme
 >
-> - `I` – Token is inside an entity.
-> - `O` – Token is outside an entity.
-> - `B` – Token is the beginning of an entity.
+> - `I` – Token is **inside** an entity.
+> - `O` – Token is **outside** an entity.
+> - `B` – Token is the **beginning** of an entity.
+>
+> #### BILUO Scheme
+>
+> - `B` – Token is the **beginning** of an entity.
+> - `I` – Token is **inside** a multi-token entity.
+> - `L` – Token is the **last** token of a multi-token entity.
+> - `U` – Token is a single-token **unit** entity.
+> - `O` – Token is **outside** an entity.
 
 ```python
 ### {executable="true"}
@@ -492,38 +500,8 @@ responsibility for ensuring that the data is left in a consistent state.
 
 
 For details on the entity types available in spaCy's pretrained models, see the
-[NER annotation scheme](/api/annotation#named-entities).
-
-
-
-### Training and updating {#updating}
-
-To provide training examples to the entity recognizer, you'll first need to
-create an instance of the [`GoldParse`](/api/goldparse) class. You can specify
-your annotations in a stand-off format or as token tags.
If a character offset -in your entity annotations doesn't fall on a token boundary, the `GoldParse` -class will treat that annotation as a missing value. This allows for more -realistic training, because the entity recognizer is allowed to learn from -examples that may feature tokenizer errors. - -```python -train_data = [ - ("Who is Chaka Khan?", [(7, 17, "PERSON")]), - ("I like London and Berlin.", [(7, 13, "LOC"), (18, 24, "LOC")]), -] -``` - -```python -doc = Doc(nlp.vocab, ["rats", "make", "good", "pets"]) -gold = GoldParse(doc, entities=["U-ANIMAL", "O", "O", "O"]) -``` - - - -For more details on **training and updating** the named entity recognizer, see -the usage guides on [training](/usage/training) or check out the runnable -[training script](https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py) -on GitHub. +"label scheme" sections of the individual models in the +[models directory](/models). @@ -1103,7 +1081,7 @@ In situations like that, you often want to align the tokenization so that you can merge annotations from different sources together, or take vectors predicted by a [pretrained BERT model](https://github.com/huggingface/pytorch-transformers) and -apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align) helper +apply them to spaCy tokens. spaCy's [`gold.align`](/api/top-level#align) helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the number of misaligned tokens, the one-to-one mappings of token indices in both directions and the indices where multiple tokens align to one single token. diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 6b32dc422..32d6bf7a2 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1,6 +1,6 @@ --- title: Language Processing Pipelines -next: vectors-similarity +next: /usage/vectors-embeddings menu: - ['Processing Text', 'processing'] - ['How Pipelines Work', 'pipelines'] @@ -818,14 +818,14 @@ function that takes a `Doc`, modifies it and returns it. ### Wrapping other models and libraries {#wrapping-models-libraries} Let's say you have a custom entity recognizer that takes a list of strings and -returns their [BILUO tags](/api/annotation#biluo). Given an input like -`["A", "text", "about", "Facebook"]`, it will predict and return +returns their [BILUO tags](/usage/linguistic-features#accessing-ner). Given an +input like `["A", "text", "about", "Facebook"]`, it will predict and return `["O", "O", "O", "U-ORG"]`. To integrate it into your spaCy pipeline and make it add those entities to the `doc.ents`, you can wrap it in a custom pipeline component function and pass it the token texts from the `Doc` object received by the component. -The [`gold.spans_from_biluo_tags`](/api/goldparse#spans_from_biluo_tags) is very +The [`gold.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very helpful here, because it takes a `Doc` object and token-based BILUO tags and returns a sequence of `Span` objects in the `Doc` with added labels. So all your wrapper has to do is compute the entity spans and overwrite the `doc.ents`. 
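
For example, a minimal version of such a wrapper component might look like the
following sketch. It assumes the v2-style `nlp.add_pipe` that accepts a
component function, and `your_custom_model` is only a placeholder for whatever
model you're actually wrapping:

```python
from spacy.lang.en import English
from spacy.gold import spans_from_biluo_tags

def your_custom_model(words):
    # Placeholder for the wrapped model: takes a list of token texts and
    # returns one BILUO tag per token.
    return ["U-ORG" if word == "Facebook" else "O" for word in words]

def custom_ner_wrapper(doc):
    words = [token.text for token in doc]
    custom_entities = your_custom_model(words)
    # Convert the token-based BILUO tags into Span objects and overwrite doc.ents
    doc.ents = spans_from_biluo_tags(doc, custom_entities)
    return doc

nlp = English()
nlp.add_pipe(custom_ner_wrapper, last=True)
doc = nlp("A text about Facebook")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Facebook', 'ORG')]
```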
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index d0ee44e49..e89e41586 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -1289,10 +1289,9 @@ print([(ent.text, ent.label_, ent._.person_title) for ent in doc.ents]) > > This example makes extensive use of part-of-speech tag and dependency > attributes and related `Doc`, `Token` and `Span` methods. For an introduction -> on this, see the guide on -> [linguistic features](http://localhost:8000/usage/linguistic-features/). Also -> see the [annotation specs](/api/annotation#pos-tagging) for details on the -> label schemes. +> on this, see the guide on [linguistic features](/usage/linguistic-features/). +> Also see the label schemes in the [models directory](/models) for details on +> the labels. Let's say you want to parse professional biographies and extract the person names and company names, and whether it's a company they're _currently_ working diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index aa8aa59af..3c4e85a7d 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -249,7 +249,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md' To learn more about word vectors, how to **customize them** and how to load **your own vectors** into spaCy, see the usage guide on -[using word vectors and semantic similarities](/usage/vectors-similarity). +[using word vectors and semantic similarities](/usage/vectors-embeddings). @@ -712,7 +712,7 @@ not available in the live demo). -**Usage:** [Word vectors and similarity](/usage/vectors-similarity) +**Usage:** [Word vectors and similarity](/usage/vectors-embeddings) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 6fa0b3d8e..53b713f98 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -10,9 +10,7 @@ menu: - ['Internal API', 'api'] --- - - -## Introduction to training models {#basics} +## Introduction to training models {#basics hidden="true"} import Training101 from 'usage/101/\_training.md' @@ -33,10 +31,13 @@ ready-to-use spaCy models. ## Training CLI & config {#cli-config} + + The recommended way to train your spaCy models is via the [`spacy train`](/api/cli#train) command on the command line. -1. The **training data** in spaCy's binary format created using +1. The **training data** in spaCy's + [binary format](/api/data-formats#binary-training) created using [`spacy convert`](/api/cli#convert). 2. A `config.cfg` **configuration file** with all settings and hyperparameters. 3. An optional **Python file** to register @@ -44,9 +45,13 @@ The recommended way to train your spaCy models is via the -### Training data format {#data-format} + - +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum +sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat +mattis pretium. + + > #### Tip: Debug your data > @@ -158,6 +163,14 @@ dropout = null + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum +sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat +mattis pretium. + + + ### Training with custom code @@ -168,6 +181,14 @@ dropout = null + + +Try out a BERT-based model pipeline using this project template: swap in your +data, edit the settings and hyperparameters and train, evaluate, package and +visualize your model. 
+ + + ### Pretraining with spaCy {#pretraining} @@ -176,6 +197,14 @@ dropout = null + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum +sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat +mattis pretium. + + + ## Internal training API {#api} @@ -259,5 +288,5 @@ The [`nlp.update`](/api/language#update) method takes the following arguments: Instead of writing your own training loop, you can also use the built-in [`train`](/api/cli#train) command, which expects data in spaCy's -[JSON format](/api/annotation#json-input). On each epoch, a model will be saved -out to the directory. +[JSON format](/api/data-formats#json-input). On each epoch, a model will be +saved out to the directory. diff --git a/website/docs/usage/v2-2.md b/website/docs/usage/v2-2.md index 19a0434fb..dd7325a9c 100644 --- a/website/docs/usage/v2-2.md +++ b/website/docs/usage/v2-2.md @@ -351,7 +351,7 @@ check if all of your models are up to date, you can run the automatically to prevent spaCy from being downloaded and installed again from pip. - The built-in - [`biluo_tags_from_offsets`](/api/goldparse#biluo_tags_from_offsets) converter + [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) converter is now stricter and will raise an error if entities are overlapping (instead of silently skipping them). If your data contains invalid entity annotations, make sure to clean it and resolve conflicts. You can now also use the new @@ -430,7 +430,7 @@ lemma_rules = {"verb": [["ing", ""]]} #### Converting entity offsets to BILUO tags If you've been using the -[`biluo_tags_from_offsets`](/api/goldparse#biluo_tags_from_offsets) helper to +[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) helper to convert character offsets into token-based BILUO tags, you may now see an error if the offsets contain overlapping tokens and make it impossible to create a valid BILUO sequence. This is helpful, because it lets you spot potential diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index a2322c3be..59a842968 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -169,8 +169,8 @@ network to assign position-sensitive vectors to each word in the document. **API:** [`TextCategorizer`](/api/textcategorizer), -[`Doc.cats`](/api/doc#attributes), [`GoldParse.cats`](/api/goldparse#attributes) -**Usage:** [Training a text classification model](/usage/training#textcat) +[`Doc.cats`](/api/doc#attributes), `GoldParse.cats` **Usage:** +[Training a text classification model](/usage/training#textcat) @@ -218,7 +218,7 @@ available via `token.orth`. The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors assigned to strings, and lets you assign vectors individually, or -[load in GloVe vectors](/usage/vectors-similarity#custom-loading-glove) from a +[load in GloVe vectors](/usage/vectors-embeddings#custom-loading-glove) from a directory. To help you strike a good balance between coverage and memory usage, the `Vectors` class lets you map **multiple keys** to the **same row** of the table. 
If you're using the [`spacy init-model`](/api/cli#init-model) command to
diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-embeddings.md
similarity index 95%
rename from website/docs/usage/vectors-similarity.md
rename to website/docs/usage/vectors-embeddings.md
index 9b65bb80a..49b651d9e 100644
--- a/website/docs/usage/vectors-similarity.md
+++ b/website/docs/usage/vectors-embeddings.md
@@ -1,12 +1,13 @@
 ---
-title: Word Vectors and Semantic Similarity
+title: Word Vectors and Embeddings
 menu:
-  - ['Basics', 'basics']
-  - ['Custom Vectors', 'custom']
-  - ['GPU Usage', 'gpu']
+  - ['Word Vectors', 'vectors']
+  - ['Other Embeddings', 'embeddings']
 ---
 
-## Basics {#basics hidden="true"}
+
+
+## Word vectors and similarity {#vectors}
 
 > #### Training word vectors
 >
@@ -21,7 +22,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md'
 
 
 
-## Customizing word vectors {#custom}
+### Customizing word vectors {#custom}
 
 Word vectors let you import knowledge from raw text into your model. The
 knowledge is represented as a table of numbers, with one row per term in your
@@ -193,7 +194,7 @@ For more details on **adding hooks** and **overwriting** the built-in `Doc`,
 
 
 
-## Storing vectors on a GPU {#gpu}
+### Storing vectors on a GPU {#gpu}
 
 If you're using a GPU, it's much more efficient to keep the word vectors on the
 device. You can do that by setting the [`Vectors.data`](/api/vectors#attributes)
@@ -224,3 +225,7 @@ vector_table = numpy.zeros((3, 300), dtype="f")
 vectors = Vectors(["dog", "cat", "orange"], vector_table)
 vectors.data = torch.Tensor(vectors.data).cuda(0)
 ```
+
+## Other embeddings {#embeddings}
+
+
diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md
index df4987a62..6b533b739 100644
--- a/website/docs/usage/visualizers.md
+++ b/website/docs/usage/visualizers.md
@@ -130,10 +130,9 @@ If you specify a list of `ents`, only those entity types will be rendered – fo
 example, you can choose to display `PERSON` entities. Internally, the visualizer
 knows nothing about available entity types and will render whichever spans and
 labels it receives. This makes it especially easy to work with custom entity
-types. By default, displaCy comes with colors for all
-[entity types supported by spaCy](/api/annotation#named-entities). If you're
-using custom entity types, you can use the `colors` setting to add your own
-colors for them.
+types. By default, displaCy comes with colors for all entity types used by
+[spaCy models](/models). If you're using custom entity types, you can use the
+`colors` setting to add your own colors for them.
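
For example, a custom color can be wired up to a custom label like this – a
minimal sketch that uses `BEVERAGE` as a made-up entity type and manually
created entity spans, just for illustration:

```python
from spacy.lang.en import English
from spacy.tokens import Span
from spacy import displacy

nlp = English()
doc = nlp("I ordered a flat white at the new cafe")
# Manually add a span with the made-up custom label "BEVERAGE"
doc.ents = [Span(doc, 3, 5, label="BEVERAGE")]

colors = {"BEVERAGE": "#aa9cfc"}
options = {"ents": ["BEVERAGE"], "colors": colors}
displacy.render(doc, style="ent", options=options)
```

In a standalone script, `displacy.render` returns the markup as a string – use
`displacy.serve` instead to view the result in your browser.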
> #### Options example > diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 9a0d0fb05..18b14751e 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -18,7 +18,7 @@ { "text": "Linguistic Features", "url": "/usage/linguistic-features" }, { "text": "Rule-based Matching", "url": "/usage/rule-based-matching" }, { "text": "Processing Pipelines", "url": "/usage/processing-pipelines" }, - { "text": "Vectors & Similarity", "url": "/usage/vectors-similarity" }, + { "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" }, { "text": "Training Models", "url": "/usage/training", "tag": "new" }, { "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" }, { "text": "Saving & Loading", "url": "/usage/saving-loading" }, @@ -26,8 +26,10 @@ ] }, { - "label": "In-depth", - "items": [{ "text": "Code Examples", "url": "/usage/examples" }] + "label": "Resources", + "items": [ + { "text": "Project Templates", "url": "https://github.com/explosion/projects" } + ] } ] }, @@ -56,7 +58,7 @@ "items": [ { "text": "Library Architecture", "url": "/api" }, { "text": "Model Architectures", "url": "/api/architectures" }, - { "text": "Annotation Specs", "url": "/api/annotation" }, + { "text": "Data Formats", "url": "/api/data-formats" }, { "text": "Command Line", "url": "/api/cli" }, { "text": "Functions", "url": "/api/top-level" } ] diff --git a/website/src/components/copy.js b/website/src/components/copy.js new file mode 100644 index 000000000..4392273e2 --- /dev/null +++ b/website/src/components/copy.js @@ -0,0 +1,48 @@ +import React, { useState, useRef } from 'react' + +import Icon from './icon' +import classes from '../styles/copy.module.sass' + +const CopyInput = ({ text, prefix }) => { + const isClient = typeof window !== 'undefined' + const supportsCopy = isClient && document.queryCommandSupported('copy') + const textareaRef = useRef() + const [copySuccess, setCopySuccess] = useState(false) + + function copyToClipboard() { + if (textareaRef.current && isClient) { + textareaRef.current.select() + document.execCommand('copy') + setCopySuccess(true) + textareaRef.current.blur() + setTimeout(() => setCopySuccess(false), 1000) + } + } + + function selectText() { + if (textareaRef.current && isClient) { + textareaRef.current.select() + } + } + + return ( +
+ {prefix && {prefix}} +