diff --git a/.gitignore b/.gitignore index 136a8f26d..4dbcd67f7 100644 --- a/.gitignore +++ b/.gitignore @@ -18,8 +18,6 @@ website/.npm website/logs *.log npm-debug.log* -website/www/ -website/_deploy.sh quickstart-training-generator.js # Cython / C extensions diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 81cfbf8cb..0abde2abf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,7 +5,7 @@ Thanks for your interest in contributing to spaCy 🎉 The project is maintained by [@honnibal](https://github.com/honnibal) and [@ines](https://github.com/ines), and we'll do our best to help you get started. This page will give you a quick -overview of how things are organised and most importantly, how to get involved. +overview of how things are organized and most importantly, how to get involved. ## Table of contents @@ -195,7 +195,7 @@ modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** ### Code formatting [`black`](https://github.com/ambv/black) is an opinionated Python code -formatter, optimised to produce readable code and small diffs. You can run +formatter, optimized to produce readable code and small diffs. You can run `black` from the command-line, or via your code editor. For example, if you're using [Visual Studio Code](https://code.visualstudio.com/), you can add the following to your `settings.json` to use `black` for formatting and auto-format @@ -286,7 +286,7 @@ Code that interacts with the file-system should accept objects that follow the If the function is user-facing and takes a path as an argument, it should check whether the path is provided as a string. Strings should be converted to `pathlib.Path` objects. Serialization and deserialization functions should always -accept **file-like objects**, as it makes the library io-agnostic. Working on +accept **file-like objects**, as it makes the library IO-agnostic. Working on buffers makes the code more general, easier to test, and compatible with Python 3's asynchronous IO. 
@@ -384,7 +384,7 @@ of Python and C++, with additional complexity and syntax from numpy. The many "traps for new players". Working in Cython is very rewarding once you're over the initial learning curve. As with C and C++, the first way you write something in Cython will often be the performance-optimal approach. In contrast, -Python optimisation generally requires a lot of experimentation. Is it faster to +Python optimization generally requires a lot of experimentation. Is it faster to have an `if item in my_dict` check, or to use `.get()`? What about `try`/`except`? Does this numpy operation create a copy? There's no way to guess the answers to these questions, and you'll usually be dissatisfied with your results — so @@ -400,7 +400,7 @@ Python. If it's not fast enough the first time, just switch to Cython. - [PEP 8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/) (python.org) - [Official Cython documentation](http://docs.cython.org/en/latest/) (cython.org) - [Writing C in Cython](https://explosion.ai/blog/writing-c-in-cython) (explosion.ai) -- [Multi-threading spaCy’s parser and named entity recogniser](https://explosion.ai/blog/multithreading-with-cython) (explosion.ai) +- [Multi-threading spaCy’s parser and named entity recognizer](https://explosion.ai/blog/multithreading-with-cython) (explosion.ai) ## Adding tests @@ -412,7 +412,7 @@ name. For example, tests for the `Tokenizer` can be found in all test files and test functions need to be prefixed with `test_`. When adding tests, make sure to use descriptive names, keep the code short and -concise and only test for one behaviour at a time. Try to `parametrize` test +concise and only test for one behavior at a time. Try to `parametrize` test cases wherever possible, use our pre-defined fixtures for spaCy components and avoid unnecessary imports. 
diff --git a/README.md b/README.md index 1fece1e5a..cef2a1fdd 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,8 @@ It's commercial open-source software, released under the MIT license. ## 💬 Where to ask questions -The spaCy project is maintained by [@honnibal](https://github.com/honnibal) and -[@ines](https://github.com/ines), along with core contributors -[@svlandeg](https://github.com/svlandeg) and +The spaCy project is maintained by [@honnibal](https://github.com/honnibal), +[@ines](https://github.com/ines), [@svlandeg](https://github.com/svlandeg) and [@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't be able to provide individual support via email. We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 9b47dea14..94e0bd6fc 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -24,7 +24,7 @@ class Optimizations(str, Enum): @init_cli.command("config") def init_config_cli( # fmt: off - output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), + output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True), lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"), pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). 
This will impact the choice of architecture, pretrained weights and related hyperparameters."), @@ -110,6 +110,13 @@ def init_config( "word_vectors": reco["word_vectors"], "has_letters": reco["has_letters"], } + if variables["transformer_data"] and not has_spacy_transformers(): + msg.warn( + "To generate a more effective transformer-based config (GPU-only), " + "install the spacy-transformers package and re-run this command. " + "The config generated now does not use transformers." + ) + variables["transformer_data"] = None base_template = template.render(variables).strip() # Giving up on getting the newlines right in jinja for now base_template = re.sub(r"\n\n\n+", "\n\n", base_template) @@ -126,8 +133,6 @@ def init_config( for label, value in use_case.items(): msg.text(f"- {label}: {value}") use_transformer = bool(template_vars.use_transformer) - if use_transformer: - require_spacy_transformers(msg) with show_validation_error(hint_fill=False): config = util.load_config_from_str(base_template) nlp, _ = util.load_model_from_config(config, auto_fill=True) @@ -149,12 +154,10 @@ def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> N print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}") -def require_spacy_transformers(msg: Printer) -> None: +def has_spacy_transformers() -> bool: try: import spacy_transformers # noqa: F401 + + return True except ImportError: - msg.fail( - "Using a transformer-based pipeline requires spacy-transformers " - "to be installed.", - exits=1, - ) + return False diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 674099abc..0071f1b1a 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -107,8 +107,8 @@ factory = "tok2vec" @architectures = "spacy.MultiHashEmbed.v1" width = ${components.tok2vec.model.encode.width} rows = {{ 2000 if optimize == "efficiency" else 7000 }} 
-also_embed_subwords = {{ true if has_letters else false }} -also_use_static_vectors = {{ true if optimize == "accuracy" else false }} +also_embed_subwords = {{ "true" if has_letters else "false" }} +also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" @@ -195,7 +195,7 @@ initial_rate = 5e-5 [training.train_corpus] @readers = "spacy.Corpus.v1" path = ${paths.train} -max_length = {{ 500 if hardware == "gpu" else 0 }} +max_length = {{ 500 if hardware == "gpu" else 2000 }} [training.dev_corpus] @readers = "spacy.Corpus.v1" diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 69f6df8f0..07550f9aa 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -252,8 +252,10 @@ class EntityRenderer: colors.update(user_color) colors.update(options.get("colors", {})) self.default_color = DEFAULT_ENTITY_COLOR - self.colors = colors + self.colors = {label.upper(): color for label, color in colors.items()} self.ents = options.get("ents", None) + if self.ents is not None: + self.ents = [ent.upper() for ent in self.ents] self.direction = DEFAULT_DIR self.lang = DEFAULT_LANG template = options.get("template") diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ff99000f4..b9cbf717b 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -51,14 +51,14 @@ TPL_ENTS = """ TPL_ENT = """ {text} - {label} + {label} """ TPL_ENT_RTL = """ {text} - {label} + {label} """ diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index adac0f7c3..1fa0eeaa1 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,6 +1,6 @@ import pytest from spacy import displacy -from spacy.displacy.render import DependencyRenderer +from spacy.displacy.render import DependencyRenderer, EntityRenderer from spacy.tokens import Span from spacy.lang.fa import Persian @@ -97,3 +97,17 @@ def 
test_displacy_render_wrapper(en_vocab): assert html.endswith("/div>TEST") # Restore displacy.set_render_wrapper(lambda html: html) + + +def test_displacy_options_case(): + ents = ["foo", "BAR"] + colors = {"FOO": "red", "bar": "green"} + renderer = EntityRenderer({"ents": ents, "colors": colors}) + text = "abcd" + labels = ["foo", "bar", "FOO", "BAR"] + spans = [{"start": i, "end": i + 1, "label": labels[i]} for i in range(len(text))] + result = renderer.render_ents("abcde", spans, None).split("\n\n") + assert "red" in result[0] and "foo" in result[0] + assert "green" in result[1] and "bar" in result[1] + assert "red" in result[2] and "FOO" in result[2] + assert "green" in result[3] and "BAR" in result[3] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index a13299fff..9fda1800b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -47,9 +47,9 @@ cdef class Tokenizer: `infix_finditer` (callable): A function matching the signature of `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be - recognised as tokens. + recognized as tokens. url_match (callable): A boolean function matching strings to be - recognised as tokens after considering prefixes and suffixes. + recognized as tokens after considering prefixes and suffixes. EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index acdf4cb19..835815496 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -399,7 +399,7 @@ one component. > subword_features = true > ``` -Build a transition-based parser model. Can apply to NER or dependency-parsing. +Build a transition-based parser model. Can apply to NER or dependency parsing. Transition-based parsing is an approach to structured prediction where the task of predicting the structure is mapped to a series of state transitions. 
You might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python) @@ -416,8 +416,6 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. - - | Name | Description | | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | @@ -426,7 +424,7 @@ consists of either two or three subnetworks: | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | | `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | -| **CREATES** | The model using the architecture. ~~Model~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[List[Floats2d]]]~~ | ### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"} diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index 1b2e159d0..5d5324061 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -7,7 +7,7 @@ source: spacy/morphology.pyx Store the possible morphological analyses for a language, and index them by hash. To save space on each token, tokens only know the hash of their morphological analysis, so queries of morphological attributes are delegated to -this class. See [`MorphAnalysis`](/api/morphology#morphansalysis) for the +this class. See [`MorphAnalysis`](/api/morphology#morphanalysis) for the container storing a single morphological analysis. ## Morphology.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 4a8e6eba7..0860797aa 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -450,8 +450,8 @@ The L2 norm of the token's vector representation. | `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | | `tag` | Fine-grained part-of-speech. ~~int~~ | | `tag_` | Fine-grained part-of-speech. ~~str~~ | -| `morph` | Morphological analysis. ~~MorphAnalysis~~ | -| `morph_` | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~ | +| `morph` 3 | Morphological analysis. ~~MorphAnalysis~~ | +| `morph_` 3 | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | | `dep` | Syntactic dependency relation. ~~int~~ | | `dep_` | Syntactic dependency relation. ~~str~~ | | `lang` | Language of the parent document's vocabulary. 
~~int~~ | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 61fca6ec5..9c65b2982 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -257,7 +257,7 @@ If a setting is not present in the options, the default value will be used. | Name | Description | | --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types in uppercase should be mapped to color names or values. ~~Dict[str, str]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | | `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by @@ -632,6 +632,23 @@ validate its contents. | `path` | Path to the model's `meta.json`. ~~Union[str, Path]~~ | | **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ | +### util.get_installed_models {#util.get_installed_models tag="function" new="3"} + +List all model packages installed in the current environment. This will include +any spaCy model that was packaged with [`spacy package`](/api/cli#package). +Under the hood, model packages expose a Python entry point that spaCy can check, +without having to load the model. 
+ +> #### Example +> +> ```python +> model_names = util.get_installed_models() +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------- | +| **RETURNS** | The string names of the models installed in the current environment. ~~List[str]~~ | + ### util.is_package {#util.is_package tag="function"} Check if string maps to a package installed via pip. Mainly used to validate diff --git a/website/docs/images/sense2vec.jpg b/website/docs/images/sense2vec.jpg new file mode 100644 index 000000000..3a1772582 Binary files /dev/null and b/website/docs/images/sense2vec.jpg differ diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index a04c96236..92df1b331 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -80,25 +80,73 @@ duplicate if it's very similar to an already existing one. Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and [`Lexeme`](/api/lexeme) comes with a [`.similarity`](/api/token#similarity) method that lets you compare it with another object, and determine the -similarity. Of course similarity is always subjective – whether "dog" and "cat" -are similar really depends on how you're looking at it. spaCy's similarity model -usually assumes a pretty general-purpose definition of similarity. +similarity. Of course similarity is always subjective – whether two words, spans +or documents are similar really depends on how you're looking at it. spaCy's +similarity model usually assumes a pretty general-purpose definition of +similarity. - +> #### 📝 Things to try +> +> 1. Compare two different tokens and try to find the two most _dissimilar_ +> tokens in the texts with the lowest similarity score (according to the +> vectors). +> 2. Compare the similarity of two [`Lexeme`](/api/lexeme) objects, entries in +> the vocabulary. 
You can get a lexeme via the `.lex` attribute of a token. +> You should see that the similarity results are identical to the token +> similarity. ```python ### {executable="true"} import spacy nlp = spacy.load("en_core_web_md") # make sure to use larger model! -tokens = nlp("dog cat banana") +doc1 = nlp("I like salty fries and hamburgers.") +doc2 = nlp("Fast food tastes very good.") -for token1 in tokens: - for token2 in tokens: - print(token1.text, token2.text, token1.similarity(token2)) +# Similarity of two documents +print(doc1, "<->", doc2, doc1.similarity(doc2)) +# Similarity of tokens and spans +french_fries = doc1[2:4] +burgers = doc1[5] +print(french_fries, "<->", burgers, french_fries.similarity(burgers)) ``` -In this case, the model's predictions are pretty on point. A dog is very similar -to a cat, whereas a banana is not very similar to either of them. Identical -tokens are obviously 100% similar to each other (just not always exactly `1.0`, -because of vector math and floating point imprecisions). +### What to expect from similarity results {#similarity-expectations} + +Computing similarity scores can be helpful in many situations, but it's also +important to maintain **realistic expectations** about what information it can +provide. Words can be related to each other in many ways, so a single +"similarity" score will always be a **mix of different signals**, and vectors +trained on different data can produce very different results that may not be +useful for your purpose. Here are some important considerations to keep in mind: + +- There's no objective definition of similarity. Whether "I like burgers" and "I + like pasta" are similar **depends on your application**. Both talk about food + preferences, which makes them very similar – but if you're analyzing mentions + of food, those sentences are pretty dissimilar, because they talk about very + different foods. 
+- The similarity of [`Doc`](/api/doc) and [`Span`](/api/span) objects defaults + to the **average** of the token vectors. This means that the vector for "fast + food" is the average of the vectors for "fast" and "food", which isn't + necessarily representative of the phrase "fast food". +- Vector averaging means that the vector of multiple tokens is **insensitive to + the order** of the words. Two documents expressing the same meaning with + dissimilar wording will return a lower similarity score than two documents + that happen to contain the same words while expressing different meanings. + + + +[![](../../images/sense2vec.jpg)](https://github.com/explosion/sense2vec) + +[`sense2vec`](https://github.com/explosion/sense2vec) is a library developed by +us that builds on top of spaCy and lets you train and query more interesting and +detailed word vectors. It combines noun phrases like "fast food" or "fair game" +and includes the part-of-speech tags and entity labels. The library also +includes annotation recipes for our annotation tool [Prodigy](https://prodi.gy) +that let you evaluate vector models and create terminology lists. For more +details, check out +[our blog post](https://explosion.ai/blog/sense2vec-reloaded). To explore the +semantic similarities across all Reddit comments of 2015 and 2019, see the +[interactive demo](https://explosion.ai/demos/sense2vec). + + diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index c2727f5b1..33385ff51 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -11,6 +11,10 @@ next: /usage/training +If you're looking for details on using word vectors and semantic similarity, +check out the +[linguistic features docs](/usage/linguistic-features#vectors-similarity). + The key difference between [word vectors](#word-vectors) and contextual language @@ -180,7 +184,7 @@ yourself. 
For details on how to get started with training your own model, check out the [training quickstart](/usage/training#quickstart). Training config files include all **settings and hyperparameters** for training your model. Instead of providing lots of arguments on the command line, you only @@ -404,11 +404,15 @@ recipe once the dish has already been prepared. You have to make a new one. spaCy includes a variety of built-in [architectures](/api/architectures) for different tasks. For example: - + -| Architecture | Description | -| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [HashEmbedCNN](/api/architectures#HashEmbedCNN) | Build spaCy’s “standard” embedding layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout. ~~Model[List[Doc], List[Floats2d]]~~ | +| Architecture | Description | +| ----------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [HashEmbedCNN](/api/architectures#HashEmbedCNN) | Build spaCy’s "standard" embedding layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout. ~~Model[List[Doc], List[Floats2d]]~~ | +| [TransitionBasedParser](/api/architectures#TransitionBasedParser) | Build a [transition-based parser](https://explosion.ai/blog/parsing-english-in-python) model used in the default [`EntityRecognizer`](/api/entityrecognizer) and [`DependencyParser`](/api/dependencyparser). 
~~Model[List[Doc], List[List[Floats2d]]]~~ | +| [TextCatEnsemble](/api/architectures#TextCatEnsemble) | Stacked ensemble of a bag-of-words model and a neural network model with an internal CNN embedding layer. Used in the default [`TextCategorizer`](/api/textcategorizer). ~~Model~~ | + + ### Metrics, training output and weighted scores {#metrics} @@ -788,7 +792,7 @@ you save the transformer outputs for later use. + +- **Thinc:** + [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks) +- **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe) + + + ### Manage end-to-end workflows with projects {#features-projects} + + +> #### Example +> +> ```cli +> # Clone a project template +> $ python -m spacy project clone example +> $ cd example +> # Download data assets +> $ python -m spacy project assets +> # Run a workflow +> $ python -m spacy project run train +> ``` + +spaCy projects let you manage and share **end-to-end spaCy workflows** for +different **use cases and domains**, and orchestrate training, packaging and +serving your custom models. You can start off by cloning a pre-defined project +template, adjust it to fit your needs, load in your data, train a model, export +it as a Python package and share the project templates with your team. spaCy +projects also make it easy to **integrate with other tools** in the data science +and machine learning ecosystem, including [DVC](/usage/projects#dvc) for data +version control, [Prodigy](/usage/projects#prodigy) for creating labeled data, +[Streamlit](/usage/projects#streamlit) for building interactive apps, +[FastAPI](/usage/projects#fastapi) for serving models in production, +[Ray](/usage/projects#ray) for parallel training, +[Weights & Biases](/usage/projects#wandb) for experiment tracking, and more! 
+ + + - **Usage:** [spaCy projects](/usage/projects), @@ -59,6 +132,16 @@ menu: ### New built-in pipeline components {#features-pipeline-components} +spaCy v3.0 includes several new trainable and rule-based components that you can +add to your pipeline and customize for your use case: + +> #### Example +> +> ```python +> nlp = spacy.blank("en") +> nlp.add_pipe("lemmatizer") +> ``` + | Name | Description | | ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | @@ -78,15 +161,37 @@ menu: ### New and improved pipeline component APIs {#features-components} -- `Language.factory`, `Language.component` -- `Language.analyze_pipes` -- Adding components from other models +> #### Example +> +> ```python +> @Language.component("my_component") +> def my_component(doc): +> return doc +> +> nlp.add_pipe("my_component") +> nlp.add_pipe("ner", source=other_nlp) +> nlp.analyze_pipes(pretty=True) +> ``` + +Defining, configuring, reusing, training and analyzing pipeline components is +now easier and more convenient. The `@Language.component` and +`@Language.factory` decorators let you register your component, define its +default configuration and meta data, like the attribute values it assigns and +requires. Any custom component can be included during training, and sourcing +components from existing pretrained models lets you **mix and match custom +pipelines**. The `nlp.analyze_pipes` method outputs structured information about +the current pipeline and its components, including the attributes they assign, +the scores they compute during training and whether any required attributes +aren't set. 
- **Usage:** [Custom components](/usage/processing-pipelines#custom_components), - [Defining components during training](/usage/training#config-components) -- **API:** [`Language`](/api/language) + [Defining components for training](/usage/training#config-components) +- **API:** [`@Language.component`](/api/language#component), + [`@Language.factory`](/api/language#factory), + [`Language.add_pipe`](/api/language#add_pipe), + [`Language.analyze_pipes`](/api/language#analyze_pipes) - **Implementation:** [`spacy/language.py`](https://github.com/explosion/spaCy/tree/develop/spacy/language.py) @@ -136,13 +241,14 @@ in your config and see validation errors if the argument values don't match. -### New methods, attributes and commands +### New methods, attributes and commands {#new-methods} The following methods, attributes and commands are new in spaCy v3.0. | Name | Description | | ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | +| [`Token.morph`](/api/token#attributes) [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | | [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. | | [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. | | [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. 
| @@ -153,9 +259,53 @@ The following methods, attributes and commands are new in spaCy v3.0. | [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. | | [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). | | [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). | +| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all models installed in the environment. | | [`init config`](/api/cli#init-config) [`init fill-config`](/api/cli#init-fill-config) [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | | [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | +### New and updated documentation {#new-docs} + + + +
+ +To help you get started with spaCy v3.0 and the new features, we've added +several new or rewritten documentation pages, including a new usage guide on +[embeddings, transformers and transfer learning](/usage/embeddings-transformers), +a guide on [training models](/usage/training) rewritten from scratch, a page +explaining the new [spaCy projects](/usage/projects) and updated usage +documentation on +[custom pipeline components](/usage/processing-pipelines#custom-components). +We've also added a bunch of new illustrations and new API reference pages +documenting spaCy's machine learning [model architectures](/api/architectures) +and the expected [data formats](/api/data-formats). API pages about +[pipeline components](/api/#architecture-pipeline) now include more information, +like the default config and implementation, and we've adopted a more detailed +format for documenting argument and return types. + +
+ +[![Library architecture](../images/architecture.svg)](/api) + +
+ + + +- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers), + [Training models](/usage/training), [Projects](/usage/projects), + [Custom pipeline components](/usage/processing-pipelines#custom-components), + [Custom tokenizers](/usage/linguistic-features#custom-tokenizer) +- **API Reference:** [Library architecture](/api), + [Model architectures](/api/architectures), [Data formats](/api/data-formats) +- **New Classes:** [`Example`](/api/example), [`Tok2Vec`](/api/tok2vec), + [`Transformer`](/api/transformer), [`Lemmatizer`](/api/lemmatizer), + [`Morphologizer`](/api/morphologizer), + [`AttributeRuler`](/api/attributeruler), + [`SentenceRecognizer`](/api/sentencerecognizer), [`Pipe`](/api/pipe), + [`Corpus`](/api/corpus) + + + ## Backwards Incompatibilities {#incompat} As always, we've tried to keep the breaking changes to a minimum and focus on @@ -212,15 +362,16 @@ Note that spaCy v3.0 now requires **Python 3.6+**. ### Removed or renamed API {#incompat-removed} -| Removed | Replacement | -| ------------------------------------------------------ | ----------------------------------------------------------------------------------------- | -| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) | -| `GoldParse` | [`Example`](/api/example) | -| `GoldCorpus` | [`Corpus`](/api/corpus) | -| `KnowledgeBase.load_bulk` `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk) [`KnowledgeBase.to_disk`](/api/kb#to_disk) | -| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | -| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | -| `spacy link` `util.set_data_path` `util.get_data_path` | not needed, model symlinks are deprecated | +| Removed | Replacement | +| -------------------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `Language.disable_pipes` | 
[`Language.select_pipes`](/api/language#select_pipes) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `KnowledgeBase.load_bulk` `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk) [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `spacy init-model` | [`spacy init model`](/api/cli#init-model) | +| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | +| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -236,7 +387,7 @@ on them. | `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | | keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | | `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | -| `verbose` argument on [`Language.evaluate`] | logging | +| `verbose` argument on [`Language.evaluate`](/api/language#evaluate) | logging (`DEBUG`) | | `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) | ## Migrating from v2.x {#migrating} diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index f33340063..4ba0112b6 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -121,10 +121,10 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' The entity visualizer lets you customize the following `options`: -| Argument | Description | -| -------- | 
-------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight (`None` for all types). Defaults to `None`. ~~Optional[List[str]]~~ | `None` | -| `colors` | Color overrides. Entity types in uppercase should be mapped to color names or values. Defaults to `{}`. ~~Dict[str, str]~~ | +| Argument | Description | +| -------- | ------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight (`None` for all types). Defaults to `None`. ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. Defaults to `{}`. ~~Dict[str, str]~~ | If you specify a list of `ents`, only those entity types will be rendered – for example, you can choose to display `PERSON` entities. Internally, the visualizer diff --git a/website/src/components/link.js b/website/src/components/link.js index 3644479c5..acded7d0d 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -6,7 +6,7 @@ import classNames from 'classnames' import Icon from './icon' import classes from '../styles/link.module.sass' -import { isString } from './util' +import { isString, isImage } from './util' const internalRegex = /(http(s?)):\/\/(prodi.gy|spacy.io|irl.spacy.io|explosion.ai|course.spacy.io)/gi @@ -39,7 +39,7 @@ export default function Link({ const dest = to || href const external = forceExternal || /(http(s?)):\/\//gi.test(dest) const icon = getIcon(dest) - const withIcon = !hidden && !hideIcon && !!icon + const withIcon = !hidden && !hideIcon && !!icon && !isImage(children) const sourceWithText = withIcon && isString(children) const linkClassNames = classNames(classes.root, className, { [classes.hidden]: hidden, diff --git a/website/src/components/util.js b/website/src/components/util.js index 844f2c133..a9c6efcf5 100644 --- 
a/website/src/components/util.js +++ b/website/src/components/util.js @@ -46,6 +46,17 @@ export function isString(obj) { return typeof obj === 'string' || obj instanceof String } +/** + * @param obj - The object to check. + * @returns {boolean} – Whether the object is an image + */ +export function isImage(obj) { + if (!obj || !React.isValidElement(obj)) { + return false + } + return obj.props.name == 'img' || obj.props.className == 'gatsby-resp-image-wrapper' +} + /** * @param obj - The object to check. * @returns {boolean} - Whether the object is empty. diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index 775523190..b71eccd80 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -363,7 +363,7 @@ body [id]:target color: var(--color-red-medium) background: var(--color-red-transparent) - &.italic + &.italic, &.comment font-style: italic @@ -384,9 +384,11 @@ body [id]:target // Settings for ini syntax (config files) [class*="language-ini"] color: var(--syntax-comment) + font-style: italic !important .token color: var(--color-subtle) + font-style: normal !important .gatsby-highlight-code-line @@ -424,6 +426,7 @@ body [id]:target .cm-comment color: var(--syntax-comment) + font-style: italic .cm-keyword color: var(--syntax-keyword) diff --git a/website/src/widgets/quickstart-training-generator.js b/website/src/widgets/quickstart-training-generator.js index f70aedc8c..e69de29bb 100644 --- a/website/src/widgets/quickstart-training-generator.js +++ b/website/src/widgets/quickstart-training-generator.js @@ -1,12 +0,0 @@ -// This file was auto-generated by jinja_to_js.py based on quickstart_training.jinja -import jinjaToJS from "jinja-to-js";export default function templateQuickstartTraining(ctx) { - var __result = ""; - var __tmp; - var __runtime = jinjaToJS.runtime; - var __filters = jinjaToJS.filters; - var __globals = jinjaToJS.globals; - var context = jinjaToJS.createContext(ctx); - var use_transformer = 
context.transformer_data && context.hardware!=="cpu";var transformer = (use_transformer ? context.transformer_data[context.optimize] : {});__result += "[paths]\ntrain = \"\"\ndev = \"\"\n\n[system]\nuse_pytorch_for_gpu_memory = ";__result += "" + __runtime.escape((__tmp = ((use_transformer ? "true" : "false"))) == null ? "" : __tmp);__result += "\n\n[nlp]\nlang = \"";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);__result += "\"";var full_pipeline = [(use_transformer ? "transformer" : "tok2vec")].concat(context.components);__result += "\npipeline = ";__result += "" + ((__tmp = (JSON.stringify(full_pipeline).split("'").join("\""))) == null ? "" : __tmp);__result += "\ntokenizer = {\"@tokenizers\": \"spacy.Tokenizer.v1\"}\n\n[components]\n\n";if(__runtime.boolean(use_transformer)){__result += "[components.transformer]\nfactory = \"transformer\"\n\n[components.transformer.model]\n@architectures = \"spacy-transformers.TransformerModel.v1\"\nname = \"";__result += "" + __runtime.escape((__tmp = (transformer["name"])) == null ? 
"" : __tmp);__result += "\"\ntokenizer_config = {\"use_fast\": true}\n\n[components.transformer.model.get_spans]\n@span_getters = \"strided_spans.v1\"\nwindow = 128\nstride = 96\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.tagger.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = false\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.parser.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"";}__result += "\n\n";if(context.components.includes("ner")){__result += "[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 3\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = false\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.ner.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"\n";}__result += "\n";} else {if(context.hardware==="gpu"){__result += "# There are no recommended transformer weights available for language '";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? 
"" : __tmp);__result += "'\n# yet, so the pipeline described here is not transformer-based.";}__result += "\n\n[components.tok2vec]\nfactory = \"tok2vec\"\n\n[components.tok2vec.model]\n@architectures = \"spacy.Tok2Vec.v1\"\n\n[components.tok2vec.model.embed]\n@architectures = \"spacy.MultiHashEmbed.v1\"\nwidth = ${components.tok2vec.model.encode.width}\nrows = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 2000 : 7000))) == null ? "" : __tmp);__result += "\nalso_embed_subwords = ";__result += "" + __runtime.escape((__tmp = ((context.has_letters ? true : false))) == null ? "" : __tmp);__result += "\nalso_use_static_vectors = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="accuracy" ? true : false))) == null ? "" : __tmp);__result += "\n\n[components.tok2vec.model.encode]\n@architectures = \"spacy.MaxoutWindowEncoder.v1\"\nwidth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 96 : 256))) == null ? "" : __tmp);__result += "\ndepth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 4 : 8))) == null ? 
"" : __tmp);__result += "\nwindow_size = 1\nmaxout_pieces = 3\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode.width}";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode.width}";}__result += "\n\n";if(context.components.includes("ner")){__result += "\n[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 6\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = true\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode.width}\n";}__result += "\n";}__result += "\n\n";__runtime.each(context.components,function(pipe){var __$0 = context.pipe;context.pipe = pipe;__result += "\n";if(!["tagger","parser","ner"].includes(pipe)){__result += "\n";__result += "\n[components.";__result += "" + __runtime.escape((__tmp = (pipe)) == null ? "" : __tmp);__result += "]\nfactory = \"";__result += "" + __runtime.escape((__tmp = (pipe)) == null ? "" : __tmp);__result += "\"\n";}__result += "\n";context.pipe = __$0;});__result += "\n\n[training]\n";if(__runtime.boolean(use_transformer) || context.optimize==="efficiency" || !__runtime.boolean(context.word_vectors)){__result += "vectors = null\n";} else {__result += "vectors = \"";__result += "" + __runtime.escape((__tmp = (context.word_vectors)) == null ? 
"" : __tmp);__result += "\"\n";}if(__runtime.boolean(use_transformer)){__result += "accumulate_gradient = ";__result += "" + __runtime.escape((__tmp = (transformer["size_factor"])) == null ? "" : __tmp);__result += "\n";}__result += "\n\n[training.optimizer]\n@optimizers = \"Adam.v1\"\n\n[training.optimizer.learn_rate]\n@schedules = \"warmup_linear.v1\"\nwarmup_steps = 250\ntotal_steps = 20000\ninitial_rate = 5e-5\n\n[training.train_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths.train}\nmax_length = ";__result += "" + __runtime.escape((__tmp = ((context.hardware==="gpu" ? 500 : 0))) == null ? "" : __tmp);__result += "\n\n[training.dev_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths.dev}\nmax_length = 0\n\n";if(__runtime.boolean(use_transformer)){__result += "\n[training.batcher]\n@batchers = \"batch_by_padded.v1\"\ndiscard_oversize = true\nsize = 2000\nbuffer = 256";} else {__result += "\n[training.batcher]\n@batchers = \"batch_by_words.v1\"\ndiscard_oversize = false\ntolerance = 0.2\n\n[training.batcher.size]\n@schedules = \"compounding.v1\"\nstart = 100\nstop = 1000\ncompound = 1.001\n";}__result += "\n\n[training.score_weights]";if(context.components.includes("tagger")){__result += "\ntag_acc = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? "" : __tmp);}if(context.components.includes("parser")){__result += "\ndep_uas = 0.0\ndep_las = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? "" : __tmp);__result += "\nsents_f = 0.0";}if(context.components.includes("ner")){__result += "\nents_f = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? 
"" : __tmp);__result += "\nents_p = 0.0\nents_r = 0.0";} - return __result; -} -export const DATA = {"en":{"word_vectors":"en_vectors_web_lg","transformer":{"efficiency":{"name":"roberta-base","size_factor":3},"accuracy":{"name":"roberta-base","size_factor":3}}},"de":{"word_vectors":null,"transformer":{"efficiency":{"name":"bert-base-german-cased","size_factor":3},"accuracy":{"name":"bert-base-german-cased","size_factor":3}}},"fr":{"word_vectors":null,"transformer":{"efficiency":{"name":"camembert-base","size_factor":3},"accuracy":{"name":"camembert-base","size_factor":3}}},"es":{"word_vectors":null,"transformer":{"efficiency":{"name":"mrm8488/RuPERTa-base","size_factor":3},"accuracy":{"name":"mrm8488/RuPERTa-base","size_factor":3}}},"sv":{"word_vectors":null,"transformer":{"efficiency":{"name":"KB/bert-base-swedish-cased","size_factor":3},"accuracy":{"name":"KB/bert-base-swedish-cased","size_factor":3}}},"fi":{"word_vectors":null,"transformer":{"efficiency":{"name":"TurkuNLP/bert-base-finnish-cased-v1","size_factor":3},"accuracy":{"name":"TurkuNLP/bert-base-finnish-cased-v1","size_factor":3}}},"el":{"word_vectors":null,"transformer":{"efficiency":{"name":"nlpaueb/bert-base-greek-uncased-v1","size_factor":3},"accuracy":{"name":"nlpaueb/bert-base-greek-uncased-v1","size_factor":3}}},"tr":{"word_vectors":null,"transformer":{"efficiency":{"name":"dbmdz/bert-base-turkish-cased","size_factor":3},"accuracy":{"name":"dbmdz/bert-base-turkish-cased","size_factor":3}}},"zh":{"word_vectors":null,"transformer":{"efficiency":{"name":"bert-base-chinese","size_factor":3},"accuracy":{"name":"bert-base-chinese","size_factor":3}},"has_letters":false},"ar":{"word_vectors":null,"transformer":{"efficiency":{"name":"asafaya/bert-base-arabic","size_factor":3},"accuracy":{"name":"asafaya/bert-base-arabic","size_factor":3}}},"pl":{"word_vectors":null,"transformer":{"efficiency":{"name":"dkleczek/bert-base-polish-cased-v1","size_factor":3},"accuracy":{"name":"dkleczek/bert-base-polish-cased-
v1","size_factor":3}}}} \ No newline at end of file