2017-04-15 10:05:47 +00:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import absolute_import, unicode_literals
|
2015-08-27 07:16:11 +00:00
|
|
|
|
2017-05-25 01:10:54 +00:00
|
|
|
import random
|
2017-05-29 11:42:55 +00:00
|
|
|
import ujson
|
2017-07-25 16:57:59 +00:00
|
|
|
import itertools
|
2017-10-16 17:22:40 +00:00
|
|
|
import weakref
|
2017-10-17 16:18:10 +00:00
|
|
|
import functools
|
2017-10-27 19:07:59 +00:00
|
|
|
from collections import OrderedDict
|
|
|
|
from contextlib import contextmanager
|
|
|
|
from copy import copy
|
|
|
|
from thinc.neural import Model
|
2017-05-18 09:25:19 +00:00
|
|
|
|
2015-08-26 17:16:09 +00:00
|
|
|
from .tokenizer import Tokenizer
|
|
|
|
from .vocab import Vocab
|
2016-09-25 13:37:33 +00:00
|
|
|
from .lemmatizer import Lemmatizer
|
2017-10-27 19:07:59 +00:00
|
|
|
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
2017-11-05 17:45:57 +00:00
|
|
|
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
2018-03-27 17:23:02 +00:00
|
|
|
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
2018-07-18 17:43:16 +00:00
|
|
|
from .pipeline import EntityRuler
|
2017-11-06 21:07:38 +00:00
|
|
|
from .compat import json_dumps, izip, basestring_
|
|
|
|
from .gold import GoldParse
|
2017-10-06 22:26:05 +00:00
|
|
|
from .scorer import Scorer
|
2017-11-06 14:06:27 +00:00
|
|
|
from ._ml import link_vectors_to_models, create_default_optimizer
|
2017-04-15 10:05:47 +00:00
|
|
|
from .attrs import IS_STOP
|
2017-10-27 12:40:14 +00:00
|
|
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
|
|
|
from .lang.punctuation import TOKENIZER_INFIXES
|
2017-05-08 21:58:31 +00:00
|
|
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
|
|
|
from .lang.tag_map import TAG_MAP
|
2017-10-17 16:18:10 +00:00
|
|
|
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
2018-05-22 16:29:45 +00:00
|
|
|
from .errors import Errors, Warnings, user_warning
|
2017-04-15 10:05:47 +00:00
|
|
|
from . import util
|
2017-10-06 22:26:05 +00:00
|
|
|
from . import about
|
2016-10-09 10:24:24 +00:00
|
|
|
|
2015-08-27 07:16:11 +00:00
|
|
|
|
2016-09-24 18:26:17 +00:00
|
|
|
class BaseDefaults(object):
    """Default language data plus the factory hooks that consume it.

    The class attributes at the bottom hold tokenizer patterns, the tag map,
    stop words, lemmatizer tables, morphology rules and lexical attribute
    getters. The classmethods read those attributes to build a `Lemmatizer`,
    a `Vocab` and a `Tokenizer`.
    """

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        """Build a `Lemmatizer` from the lemma tables stored on the class.

        nlp: Accepted so all factory hooks share one signature; not read here.
        RETURNS (Lemmatizer): Built from `lemma_index`, `lemma_exc`,
            `lemma_rules` and `lemma_lookup`, in that positional order.
        """
        tables = (cls.lemma_index, cls.lemma_exc, cls.lemma_rules, cls.lemma_lookup)
        return Lemmatizer(*tables)

    @classmethod
    def create_vocab(cls, nlp=None):
        """Build a `Vocab` from the class-level language data.

        nlp: Forwarded to `create_lemmatizer`; not otherwise read here.
        RETURNS (Vocab): Vocab with the class's lexical attribute getters,
            tag map and lemmatizer, and with every entry of `morph_rules`
            registered as a morphology special case.
        """
        lemmatizer = cls.create_lemmatizer(nlp)
        # Work on a copy so the IS_STOP override below does not touch the
        # getters dict stored on the class.
        getters = dict(cls.lex_attr_getters)
        # This is messy, but it's the minimal working fix to Issue #639.
        getters[IS_STOP] = functools.partial(is_stop, stops=cls.stop_words)
        vocab = Vocab(
            lex_attr_getters=getters, tag_map=cls.tag_map, lemmatizer=lemmatizer
        )
        # morph_rules is nested as {tag: {orth: attrs}}.
        for tag_str, exceptions in cls.morph_rules.items():
            for orth_str, attrs in exceptions.items():
                vocab.morphology.add_special_case(tag_str, orth_str, attrs)
        return vocab

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build a `Tokenizer` from the class-level tokenizer settings.

        nlp: If given, its `vocab` is reused; otherwise a new vocab is made
            via `create_vocab`.
        RETURNS (Tokenizer): Tokenizer configured with the class's exception
            rules, token match function and compiled prefix/suffix/infix
            patterns. A pattern callback is `None` when the corresponding
            pattern collection on the class is empty.
        """
        rules = cls.tokenizer_exceptions
        token_match = cls.token_match

        prefix_search = None
        if cls.prefixes:
            prefix_search = util.compile_prefix_regex(cls.prefixes).search

        suffix_search = None
        if cls.suffixes:
            suffix_search = util.compile_suffix_regex(cls.suffixes).search

        infix_finditer = None
        if cls.infixes:
            infix_finditer = util.compile_infix_regex(cls.infixes).finditer

        if nlp is not None:
            vocab = nlp.vocab
        else:
            vocab = cls.create_vocab(nlp)

        return Tokenizer(
            vocab,
            rules=rules,
            prefix_search=prefix_search,
            suffix_search=suffix_search,
            infix_finditer=infix_finditer,
            token_match=token_match,
        )

    # Component names; presumably the default pipeline order -- confirm
    # against the code that reads `pipe_names`.
    pipe_names = ["tagger", "parser", "ner"]
    # Tokenizer settings, copied from the shared language data modules.
    token_match = TOKEN_MATCH
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    # Empty placeholders; presumably overridden by language-specific
    # subclasses -- confirm against those subclasses.
    tokenizer_exceptions = {}
    stop_words = set()
    lemma_rules = {}
    lemma_exc = {}
    lemma_index = {}
    lemma_lookup = {}
    morph_rules = {}
    # NOTE: this is the shared LEX_ATTRS object itself, not a copy.
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = {}
|
2015-09-14 07:48:51 +00:00
|
|
|
|
2015-08-26 17:16:09 +00:00
|
|
|
|
2016-09-24 12:08:53 +00:00
|
|
|
class Language(object):
|
2017-05-18 21:57:38 +00:00
|
|
|
"""A text-processing pipeline. Usually you'll load this once per process,
|
|
|
|
and pass the instance around your application.
|
2017-05-19 16:47:24 +00:00
|
|
|
|
|
|
|
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
|
|
|
object and processing pipeline.
|
|
|
|
lang (unicode): Two-letter language ID, i.e. ISO code.
|
2017-04-15 09:59:21 +00:00
|
|
|
"""
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
|
2016-09-24 18:26:17 +00:00
|
|
|
Defaults = BaseDefaults
|
2016-09-24 12:08:53 +00:00
|
|
|
lang = None
|
2015-08-25 13:37:17 +00:00
|
|
|
|
2017-10-06 22:25:54 +00:00
|
|
|
# Registry mapping a component name to a callable that builds it. Every
# callable takes the `nlp` object first; all but "tokenizer" also accept
# arbitrary keyword config.
factories = {
    "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
    # Components constructed from the shared vocab.
    "tensorizer": lambda nlp, **config: Tensorizer(nlp.vocab, **config),
    "tagger": lambda nlp, **config: Tagger(nlp.vocab, **config),
    "parser": lambda nlp, **config: DependencyParser(nlp.vocab, **config),
    "ner": lambda nlp, **config: EntityRecognizer(nlp.vocab, **config),
    "similarity": lambda nlp, **config: SimilarityHook(nlp.vocab, **config),
    "textcat": lambda nlp, **config: TextCategorizer(nlp.vocab, **config),
    # "sbd" and "sentencizer" both build a SentenceSegmenter.
    "sbd": lambda nlp, **config: SentenceSegmenter(nlp.vocab, **config),
    "sentencizer": lambda nlp, **config: SentenceSegmenter(nlp.vocab, **config),
    # The merge_* entries return the imported function itself and ignore
    # both `nlp` and the config.
    "merge_noun_chunks": lambda nlp, **config: merge_noun_chunks,
    "merge_entities": lambda nlp, **config: merge_entities,
    "merge_subtokens": lambda nlp, **config: merge_subtokens,
    # EntityRuler receives the whole `nlp` object rather than just the vocab.
    "entity_ruler": lambda nlp, **config: EntityRuler(nlp, **config),
}
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
def __init__(
    self, vocab=True, make_doc=True, max_length=10 ** 6, meta=None, **kwargs
):
    """Initialise a Language object.

    vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
        `Language.Defaults.create_vocab`.
    make_doc (callable): A function that takes text and returns a `Doc`
        object. Usually a `Tokenizer`. If `True`, one is created via
        `Language.Defaults.create_tokenizer`.
    meta (dict): Custom meta data for the Language class. Is written to by
        models to add model meta data. Treated as an empty dict when
        omitted or `None`; the instance stores a shallow copy.
    max_length (int):
        Maximum number of characters in a single text. The current v2 models
        may run out of memory on extremely long texts, due to large internal
        allocations. You should segment these texts into meaningful units,
        e.g. paragraphs, subsections etc, before passing them to spaCy.
        Default maximum length is 1,000,000 characters (1mb). As a rule of
        thumb, if all pipeline components are enabled, spaCy's default
        models currently require roughly 1GB of temporary memory per
        100,000 characters in one text.
    **kwargs: Accepted for compatibility; not read by this method.
    RETURNS (Language): The newly constructed object.
    """
    # Use a None sentinel instead of a shared mutable `{}` default.
    if meta is None:
        meta = {}
    user_factories = util.get_entry_points("spacy_factories")
    # Warn (W009) about every entry-point name that is already present in
    # the factories registry before the update below overwrites it.
    for factory in user_factories.keys():
        if factory in self.factories:
            user_warning(Warnings.W009.format(name=factory))
    # NOTE: `factories` is defined on the class, so this update is visible
    # to every object that shares that dict, not just this instance.
    self.factories.update(user_factories)
    self._meta = dict(meta)
    self._path = None
    if vocab is True:
        factory = self.Defaults.create_vocab
        vocab = factory(self, **meta.get("vocab", {}))
        # Only fill in the vectors name when the new vocab has none.
        if vocab.vectors.name is None:
            vocab.vectors.name = meta.get("vectors", {}).get("name")
    self.vocab = vocab
    if make_doc is True:
        factory = self.Defaults.create_tokenizer
        make_doc = factory(self, **meta.get("tokenizer", {}))
    self.tokenizer = make_doc
    self.pipeline = []
    self.max_length = max_length
    self._optimizer = None
|
2015-10-12 08:33:11 +00:00
|
|
|
|
2017-10-25 09:57:43 +00:00
|
|
|
@property
|
|
|
|
def path(self):
|
|
|
|
return self._path
|
|
|
|
|
2017-07-22 22:50:18 +00:00
|
|
|
@property
def meta(self):
    """Return the meta dict, filling in defaults and refreshing derived keys.

    Missing descriptive keys get default values. The "vectors" and
    "pipeline" entries are overwritten on every access from the current
    vocab and `self.pipe_names`. The stored `self._meta` dict is updated in
    place and is the object returned.

    RETURNS (dict): The instance's meta data.
    """
    data = self._meta
    vocab = self.vocab
    data.setdefault("lang", vocab.lang)
    data.setdefault("name", "model")
    data.setdefault("version", "0.0.0")
    data.setdefault("spacy_version", ">={}".format(about.__version__))
    # The remaining descriptive fields all default to the empty string.
    for key in ("description", "author", "email", "url", "license"):
        data.setdefault(key, "")
    data["vectors"] = {
        "width": vocab.vectors_length,
        "vectors": len(vocab.vectors),
        "keys": vocab.vectors.n_keys,
        "name": vocab.vectors.name,
    }
    data["pipeline"] = self.pipe_names
    return data
|
|
|
|
|
|
|
|
@meta.setter
|
|
|
|
def meta(self, value):
|
|
|
|
self._meta = value
|
|
|
|
|
2017-06-04 20:52:09 +00:00
|
|
|
# Conveniences to access pipeline components
|
|
|
|
@property
|
|
|
|
def tensorizer(self):
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
return self.get_pipe("tensorizer")
|
2017-06-04 20:52:09 +00:00
|
|
|
|
|
|
|
@property
|
|
|
|
def tagger(self):
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
return self.get_pipe("tagger")
|
2017-06-04 20:52:09 +00:00
|
|
|
|
|
|
|
@property
|
|
|
|
def parser(self):
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
return self.get_pipe("parser")
|
2017-06-04 20:52:09 +00:00
|
|
|
|
|
|
|
@property
def entity(self):
    """Shortcut for the pipeline component registered as "ner".

    RETURNS (callable): Whatever `self.get_pipe` returns for that name.
    """
    component = self.get_pipe("ner")
    return component
|
2017-06-04 20:52:09 +00:00
|
|
|
|
|
|
|
@property
def matcher(self):
    """Shortcut for the pipeline component registered as "matcher".

    RETURNS (callable): Whatever `self.get_pipe` returns for that name.
    """
    component = self.get_pipe("matcher")
    return component
|
2017-10-06 22:25:54 +00:00
|
|
|
|
|
|
|
@property
def pipe_names(self):
    """Collect the names of all components currently in the pipeline.

    RETURNS (list): The component name strings, in pipeline order.
    """
    names = []
    # Each pipeline entry is a (name, component) pair; only the name is kept.
    for pipe_name, _component in self.pipeline:
        names.append(pipe_name)
    return names
|
|
|
|
|
|
|
|
def get_pipe(self, name):
    """Look up a pipeline component by its name.

    name (unicode): Name of the pipeline component to fetch.
    RETURNS (callable): The first component registered under that name.
    """
    # Private sentinel so that a component which happens to be falsy or None
    # is still distinguished from "no match".
    not_found = object()
    matches = (
        component for pipe_name, component in self.pipeline if pipe_name == name
    )
    found = next(matches, not_found)
    if found is not_found:
        raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names))
    return found
|
2017-10-06 22:25:54 +00:00
|
|
|
|
|
|
|
def create_pipe(self, name, config=dict()):
    """Build a pipeline component by calling a registered factory.

    name (unicode): Key of the factory in `self.factories`.
    config (dict): Keyword arguments forwarded to the factory.
    RETURNS (callable): The object returned by the factory.
    """
    if name in self.factories:
        build = self.factories[name]
        # The factory receives this object first, then the unpacked config.
        return build(self, **config)
    raise KeyError(Errors.E002.format(name=name))
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
def add_pipe(
    self, component, name=None, before=None, after=None, first=None, last=None
):
    """Insert a component into the processing pipeline.

    At most one of before/after/first/last may be truthy. When none of them
    is set, the component is appended to the end of the pipeline.

    component (callable): The pipeline component to add.
    name (unicode): Name to register the component under. If omitted, the
        name is taken from component.name, then component.__name__, then
        the component's class name, and finally repr(component). A name
        that is already in the pipeline raises an error.
    before (unicode): Name of the component to insert directly before.
    after (unicode): Name of the component to insert directly after.
    first (bool): Insert the component at the start of the pipeline.
    last (bool): Insert the component at the end of the pipeline.

    EXAMPLE:
        >>> nlp.add_pipe(component, before='ner')
        >>> nlp.add_pipe(component, name='custom_name', last=True)
    """
    is_callable = hasattr(component, "__call__")
    if not is_callable:
        parts = [Errors.E003.format(component=repr(component), name=name)]
        # Extra hint when a factory name string was passed instead of a
        # component object.
        if isinstance(component, basestring_) and component in self.factories:
            parts.append(Errors.E004.format(component=component))
        raise ValueError("".join(parts))

    if name is None:
        # Fall back through progressively less specific sources for a name.
        for attr in ("name", "__name__"):
            if hasattr(component, attr):
                name = getattr(component, attr)
                break
        else:
            has_class_name = hasattr(component, "__class__") and hasattr(
                component.__class__, "__name__"
            )
            if has_class_name:
                name = component.__class__.__name__
            else:
                name = repr(component)

    if name in self.pipe_names:
        raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))

    # Positional options are mutually exclusive.
    active_options = [opt for opt in (before, after, first, last) if opt]
    if len(active_options) > 1:
        raise ValueError(Errors.E006)

    entry = (name, component)
    if last or not (first or before or after):
        self.pipeline.append(entry)
        return
    if first:
        self.pipeline.insert(0, entry)
        return
    if before and before in self.pipe_names:
        self.pipeline.insert(self.pipe_names.index(before), entry)
        return
    if after and after in self.pipe_names:
        self.pipeline.insert(self.pipe_names.index(after) + 1, entry)
        return
    # The before/after reference names a component that is not in the pipeline.
    raise ValueError(Errors.E001.format(name=before or after, opts=self.pipe_names))
|
2017-06-04 20:52:09 +00:00
|
|
|
|
2017-10-17 09:20:07 +00:00
|
|
|
def has_pipe(self, name):
    """Tell whether the pipeline contains a component with the given name.

    Same result as `name in nlp.pipe_names`.

    name (unicode): Component name to look for.
    RETURNS (bool): True if the name is among the pipeline's component names.
    """
    registered = self.pipe_names
    return name in registered
|
|
|
|
|
2017-10-06 22:25:54 +00:00
|
|
|
def replace_pipe(self, name, component):
    """Swap the component stored under an existing name for another one.

    name (unicode): Name of the component to replace; must already be in
        the pipeline, otherwise a ValueError is raised.
    component (callable): The new pipeline component.
    """
    if name not in self.pipe_names:
        raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
    # The slot keeps its position and name; only the component changes.
    position = self.pipe_names.index(name)
    self.pipeline[position] = (name, component)
|
|
|
|
|
|
|
|
def rename_pipe(self, old_name, new_name):
    """Give an existing pipeline component a different name.

    old_name (unicode): Current name of the component; must be in the
        pipeline, otherwise a ValueError is raised.
    new_name (unicode): Name to assign; must not already be in the
        pipeline, otherwise a ValueError is raised.
    """
    if old_name not in self.pipe_names:
        raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names))
    if new_name in self.pipe_names:
        raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names))
    position = self.pipe_names.index(old_name)
    # Keep the component object and its position; only the label changes.
    component = self.pipeline[position][1]
    self.pipeline[position] = (new_name, component)
|
|
|
|
|
|
|
|
def remove_pipe(self, name):
    """Take a component out of the pipeline and hand it back.

    name (unicode): Name of the component to remove; must be in the
        pipeline, otherwise a ValueError is raised.
    RETURNS (tuple): The removed `(name, component)` entry.
    """
    if name not in self.pipe_names:
        raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
    position = self.pipe_names.index(name)
    removed = self.pipeline.pop(position)
    return removed
|
2017-06-04 20:52:09 +00:00
|
|
|
|
2017-05-26 10:33:54 +00:00
|
|
|
def __call__(self, text, disable=[]):
|
2017-10-06 22:26:05 +00:00
|
|
|
"""Apply the pipeline to some text. The text can span multiple sentences,
|
2017-05-18 21:57:38 +00:00
|
|
|
and can contain arbtrary whitespace. Alignment into the original string
|
2015-08-25 13:37:17 +00:00
|
|
|
is preserved.
|
2016-12-18 15:54:52 +00:00
|
|
|
|
2017-05-18 21:57:38 +00:00
|
|
|
text (unicode): The text to be processed.
|
2017-05-26 10:33:54 +00:00
|
|
|
disable (list): Names of the pipeline components to disable.
|
2017-05-18 21:57:38 +00:00
|
|
|
RETURNS (Doc): A container for accessing the annotations.
|
2016-11-01 11:25:36 +00:00
|
|
|
|
2017-05-18 21:57:38 +00:00
|
|
|
EXAMPLE:
|
2016-11-01 11:25:36 +00:00
|
|
|
>>> tokens = nlp('An example sentence. Another example sentence.')
|
2017-05-18 21:57:38 +00:00
|
|
|
>>> tokens[0].text, tokens[0].head.tag_
|
2016-11-01 11:25:36 +00:00
|
|
|
('An', 'NN')
|
2015-08-25 13:37:17 +00:00
|
|
|
"""
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, I don't mean to be nitpicky; I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intentional, then please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readibility
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grevious error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
* Update requirements and setup.py
* Workaround bug in thinc require_gpu
* Set version to v2.0.14
* Update push-tag script
* Unhack prefer_gpu
* Require thinc 6.10.6
* Update prefer_gpu and require_gpu docs [ci skip]
* Fix specifiers for GPU
* Set version to 2.0.14.dev1
* Set version to 2.0.14
* Update Thinc version pin
* Increment version
* Fix msgpack-numpy version pin
* Increment version
* Update version to 2.0.16
* Update version [ci skip]
* Redundant ')' in the Stop words' example (#2856)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Documentation improvement regarding joblib and SO (#2867)
Some documentation improvements
## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)
### Types of change
Documentation
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* raise error when setting overlapping entities as doc.ents (#2880)
* Fix out-of-bounds access in NER training
The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so much be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor indo (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes a problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if the effect of an ID clash isn't noticeable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Adding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 15:30:29 +00:00
|
|
|
if len(text) > self.max_length:
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
raise ValueError(
|
|
|
|
Errors.E088.format(length=len(text), max_length=self.max_length)
|
|
|
|
)
|
2016-10-14 15:38:29 +00:00
|
|
|
doc = self.make_doc(text)
|
2017-10-06 22:25:54 +00:00
|
|
|
for name, proc in self.pipeline:
|
2017-05-26 10:33:54 +00:00
|
|
|
if name in disable:
|
2017-05-16 09:21:59 +00:00
|
|
|
continue
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
if not hasattr(proc, "__call__"):
|
2018-04-03 13:50:31 +00:00
|
|
|
raise ValueError(Errors.E003.format(component=type(proc), name=name))
|
2017-05-28 13:11:58 +00:00
|
|
|
doc = proc(doc)
|
2018-04-03 13:50:31 +00:00
|
|
|
if doc is None:
|
|
|
|
raise ValueError(Errors.E005.format(name=name))
|
2016-05-17 14:55:42 +00:00
|
|
|
return doc
|
2015-08-25 13:37:17 +00:00
|
|
|
|
2017-10-25 11:46:41 +00:00
|
|
|
def disable_pipes(self, *names):
    """Switch off the named pipeline components.

    The call hands back a `DisabledPipes` object built from this instance
    and the given names. Used in a `with` statement, the pipeline is put
    back into its initial state when the block ends. Used as a plain call,
    invoke `.restore()` on the returned object to undo the change yourself.

    *names (unicode): Names of the components to disable.
    RETURNS (DisabledPipes): Handle for the disabled components.

    EXAMPLE:
        >>> nlp.add_pipe('parser')
        >>> nlp.add_pipe('tagger')
        >>> with nlp.disable_pipes('parser', 'tagger'):
        >>>     assert not nlp.has_pipe('parser')
        >>> assert nlp.has_pipe('parser')
        >>> disabled = nlp.disable_pipes('parser')
        >>> assert len(disabled) == 1
        >>> assert not nlp.has_pipe('parser')
        >>> disabled.restore()
        >>> assert nlp.has_pipe('parser')
    """
    disabled = DisabledPipes(self, *names)
    return disabled
|
|
|
|
|
2017-05-29 13:40:45 +00:00
|
|
|
def make_doc(self, text):
    """Run only the tokenizer over a text, skipping the pipeline components.

    text (unicode): The text to tokenize.
    RETURNS: Whatever `self.tokenizer` produces for the text.
    """
    tokenize = self.tokenizer
    return tokenize(text)
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
    """Run one training step over a batch for every trainable component.

    docs (iterable): A batch of `Doc` objects, or raw texts, which are
        tokenized with `make_doc` first. Must support `len()`.
    golds (iterable): A batch of `GoldParse` objects, or dicts of keyword
        arguments from which a `GoldParse` is built. Must support `len()`
        and have the same length as `docs`.
    drop (float): The dropout rate.
    sgd (callable): An optimizer. When omitted, the optimizer stored on
        this instance is used, creating a default one on first use.
    losses: Passed through unchanged to each component's `update` method.
    RETURNS: None. An empty batch returns immediately.

    EXAMPLE:
        >>> optimizer = nlp.begin_training(get_gold_tuples)
        >>> nlp.update(docs, golds, sgd=optimizer)
    """
    n_docs = len(docs)
    n_golds = len(golds)
    if n_docs != n_golds:
        raise IndexError(Errors.E009.format(n_docs=n_docs, n_golds=n_golds))
    if n_docs == 0:
        return
    if sgd is None:
        if self._optimizer is None:
            self._optimizer = create_default_optimizer(Model.ops)
        sgd = self._optimizer

    # Allow raw texts instead of Doc objects, and dicts of GoldParse
    # arguments instead of GoldParse objects.
    pairs = []
    for doc, gold in zip(docs, golds):
        if isinstance(doc, basestring_):
            doc = self.make_doc(doc)
        if not isinstance(gold, GoldParse):
            gold = GoldParse(doc, **gold)
        pairs.append((doc, gold))
    docs = [doc for doc, _ in pairs]
    golds = [gold for _, gold in pairs]

    pending = {}

    def get_grads(W, dW, key=None):
        # Stand-in optimizer handed to the components: it only records the
        # gradients, which are applied with the real optimizer below.
        pending[key] = (W, dW)

    # Mirror the real optimizer's hyper-parameters on the stand-in.
    get_grads.alpha = sgd.alpha
    get_grads.b1 = sgd.b1
    get_grads.b2 = sgd.b2

    components = list(self.pipeline)
    # Components are visited in a fresh random order on every call.
    random.shuffle(components)
    for _, component in components:
        if not hasattr(component, "update"):
            continue
        # Start from an empty record so each component's gradients are
        # applied exactly once.
        pending = {}
        component.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
        for key, (W, dW) in pending.items():
            sgd(W, dW, key=key)
|
2017-05-16 14:17:30 +00:00
|
|
|
|
2017-05-21 14:07:06 +00:00
|
|
|
def preprocess_gold(self, docs_golds):
    """Pre-process gold data before training.

    The stream is handed, in pipeline order, to every component that
    defines a `preprocess_gold` method; each one receives the output of the
    previous one. This is a generator, so nothing runs until it is iterated.

    docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
    YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
    """
    stream = docs_golds
    for _, component in self.pipeline:
        if not hasattr(component, "preprocess_gold"):
            continue
        stream = component.preprocess_gold(stream)
    for doc, gold in stream:
        yield doc, gold
|
|
|
|
|
2017-11-06 13:26:00 +00:00
|
|
|
def begin_training(self, get_gold_tuples=None, sgd=None, **cfg):
    """Prepare the vocab, the vectors and every component for training.

    get_gold_tuples (function): Function returning gold data. Each item it
        returns unpacks to `(_, annots_brackets)`, and each entry of
        `annots_brackets` to `(annots, _)` with the words at `annots[1]`.
        When omitted, a function returning an empty list is used.
    sgd (callable): An optimizer. A default one is created when omitted.
    **cfg: Config parameters, forwarded to each component's
        `begin_training`. A `device` >= 0 switches to the GPU.
    RETURNS: The optimizer, which is also stored on this instance.
    """
    if get_gold_tuples is None:
        get_gold_tuples = lambda: []
    else:
        # Populate vocab: looking a word up is enough to register it.
        for _, annots_brackets in get_gold_tuples():
            for annots, _ in annots_brackets:
                for word in annots[1]:
                    _ = self.vocab[word]  # noqa: F841

    device = cfg.get("device", -1)
    if device >= 0:
        util.use_gpu(device)
        if self.vocab.vectors.data.shape[1] >= 1:
            self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
    link_vectors_to_models(self.vocab)
    if self.vocab.vectors.data.shape[1]:
        # Tell the components which vectors table to use.
        cfg["pretrained_vectors"] = self.vocab.vectors.name

    if sgd is None:
        sgd = create_default_optimizer(Model.ops)
    self._optimizer = sgd

    for _, component in self.pipeline:
        if not hasattr(component, "begin_training"):
            continue
        component.begin_training(
            get_gold_tuples, pipeline=self.pipeline, sgd=self._optimizer, **cfg
        )
    return self._optimizer
|
2017-05-21 14:07:06 +00:00
|
|
|
|
2017-10-03 14:14:57 +00:00
|
|
|
def evaluate(self, docs_golds, verbose=False):
    """Run the pipeline over the docs and score them against the golds.

    docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. Must
        contain at least one pair, because the pairs are unzipped into two
        sequences.
    verbose (bool): Print each processed doc and pass the flag on to the
        scorer.
    RETURNS (Scorer): The scorer after every pair has been scored.
    """
    scorer = Scorer()
    docs, golds = zip(*docs_golds)
    docs = list(docs)
    golds = list(golds)
    for name, pipe in self.pipeline:
        if not hasattr(pipe, "pipe"):
            # Bind the component now. A generator expression here would
            # look `pipe` up lazily, i.e. only once this loop has finished,
            # and would then call the *last* pipeline component instead.
            docs = _pipe(pipe, docs)
        else:
            docs = pipe.pipe(docs, batch_size=256)
    for doc, gold in zip(docs, golds):
        if verbose:
            print(doc)
        scorer.score(doc, gold, verbose=verbose)
    return scorer
|
2017-05-16 09:21:59 +00:00
|
|
|
|
2017-05-18 09:25:19 +00:00
|
|
|
@contextmanager
def use_params(self, params, **cfg):
    """Replace weights of models in the pipeline with those provided in the
    params dictionary. Can be used as a contextmanager, in which case,
    models go back to their original weights after the block, including
    when the block exits with an exception.

    params (dict): A dictionary of parameters keyed by model ID.
    **cfg: Config parameters. Not used by this method itself.

    EXAMPLE:
        >>> with nlp.use_params(optimizer.averages):
        >>>     nlp.to_disk('/tmp/checkpoint')
    """
    contexts = [
        pipe.use_params(params)
        for name, pipe in self.pipeline
        if hasattr(pipe, "use_params")
    ]

    def advance_all():
        # TODO: Having trouble with contextlib
        # Workaround: these aren't actually context managers atm, so each
        # one is stepped by hand: the first step enters it, the second
        # leaves it.
        for context in contexts:
            try:
                next(context)
            except StopIteration:
                pass

    advance_all()
    try:
        yield
    finally:
        # Always take the second step, so an exception raised inside the
        # `with` block cannot leave the swapped-in weights in place.
        advance_all()
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
def pipe(
    self,
    texts,
    as_tuples=False,
    n_threads=2,
    batch_size=1000,
    disable=[],
    cleanup=False,
):
    """Process texts as a stream, and yield `Doc` objects in order.

    texts (iterator): A sequence of texts to process.
    as_tuples (bool):
        If set to True, inputs should be a sequence of
        (text, context) tuples. Output will then be a sequence of
        (doc, context) tuples. Defaults to False.
    n_threads (int): Currently inactive. It is still forwarded to the
        `pipe` method of each component.
    batch_size (int): The number of texts to buffer.
    disable (list): Names of the pipeline components to disable. The
        default list is only read, never modified.
    cleanup (bool): If True, unneeded strings are freed,
        to control memory use. Experimental. Also honoured when
        `as_tuples` is set.
    YIELDS (Doc): Documents in the order of the original text.

    EXAMPLE:
        >>> texts = [u'One document.', u'...', u'Lots of documents']
        >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
        >>>     assert doc.is_parsed
    """
    if as_tuples:
        # Split the (text, context) stream in two, process the texts with
        # a recursive call and re-attach each context afterwards.
        text_context1, text_context2 = itertools.tee(texts)
        texts = (tc[0] for tc in text_context1)
        contexts = (tc[1] for tc in text_context2)
        docs = self.pipe(
            texts,
            n_threads=n_threads,
            batch_size=batch_size,
            disable=disable,
            # Forward the flag; otherwise cleanup would silently be
            # switched off for tuple input.
            cleanup=cleanup,
        )
        for doc, context in izip(docs, contexts):
            yield (doc, context)
        return
    docs = (self.make_doc(text) for text in texts)
    for name, proc in self.pipeline:
        if name in disable:
            continue
        if hasattr(proc, "pipe"):
            docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
        else:
            # Apply the function, but yield the doc
            docs = _pipe(proc, docs)
    # Track weakrefs of "recent" documents, so that we can see when they
    # expire from memory. When they do, we know we don't need old strings.
    # This way, we avoid maintaining an unbounded growth in string entries
    # in the string store.
    recent_refs = weakref.WeakSet()
    old_refs = weakref.WeakSet()
    # Keep track of the original string data, so that if we flush old strings,
    # we can recover the original ones. However, we only want to do this if we're
    # really adding strings, to save up-front costs.
    original_strings_data = None
    nr_seen = 0
    for doc in docs:
        yield doc
        if cleanup:
            recent_refs.add(doc)
            if nr_seen < 10000:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # Every doc of the older generation has been garbage
                # collected: swap the generations and flush stale strings.
                old_refs, recent_refs = recent_refs, old_refs
                if original_strings_data is None:
                    original_strings_data = list(self.vocab.strings)
                else:
                    keys, strings = self.vocab.strings._cleanup_stale_strings(
                        original_strings_data
                    )
                    self.vocab._reset_cache(keys, strings)
                    self.tokenizer._reset_cache(keys)
                nr_seen = 0
|
2016-02-01 08:01:13 +00:00
|
|
|
|
2017-05-31 11:42:39 +00:00
|
|
|
def to_disk(self, path, disable=tuple()):
    """Save the current state to a directory. If a model is loaded, this
    will include the model.

    path (unicode or Path): A path to a directory, which will be created if
        it doesn't exist. Paths may be strings or `Path`-like objects.
    disable (list): Names of pipeline components to disable and prevent
        from being saved.

    EXAMPLE:
        >>> nlp.to_disk('/path/to/models')
    """
    path = util.ensure_path(path)

    def write_meta(p):
        # Write the meta inside a context manager so the file handle is
        # closed (and its buffer flushed) as soon as the write finishes,
        # rather than whenever the handle happens to be garbage-collected.
        with p.open("w") as file_:
            return file_.write(json_dumps(self.meta))

    serializers = OrderedDict(
        (
            ("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
            ("meta.json", write_meta),
        )
    )
    for name, proc in self.pipeline:
        # Only components that expose a `name` attribute and a `to_disk`
        # method, and that were not explicitly disabled, are serialized.
        if not hasattr(proc, "name"):
            continue
        if name in disable:
            continue
        if not hasattr(proc, "to_disk"):
            continue
        # Bind `proc` as a default argument so each callback keeps its own
        # component instead of the loop's last value.
        serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
    # The vocab is registered last, after all pipeline components.
    serializers["vocab"] = lambda p: self.vocab.to_disk(p)
    # NOTE(review): the disabled names are handed to util.to_disk as the
    # exclude mapping -- presumably keys present in it are skipped; confirm
    # against util.to_disk.
    util.to_disk(path, serializers, {p: False for p in disable})
|
|
|
|
|
|
|
|
def from_disk(self, path, disable=tuple()):
    """Loads state from a directory. Modifies the object in place and
    returns it. If the saved `Language` object contains a model, the
    model will be loaded.

    path (unicode or Path): A path to a directory. Paths may be either
        strings or `Path`-like objects.
    disable (list): Names of the pipeline components to disable.
    RETURNS (Language): The modified `Language` object.

    EXAMPLE:
        >>> from spacy.language import Language
        >>> nlp = Language().from_disk('/path/to/models')
    """
    path = util.ensure_path(path)

    def load_meta(p):
        return self.meta.update(util.read_json(p))

    def load_vocab(p):
        # The vectors-name fix-up only runs when `from_disk` returns a
        # truthy value (short-circuit `and`).
        return self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)

    def load_tokenizer(p):
        return self.tokenizer.from_disk(p, vocab=False)

    # Insertion order matters: meta first, then vocab, then tokenizer,
    # followed by the pipeline components in pipeline order.
    deserializers = OrderedDict()
    deserializers["meta.json"] = load_meta
    deserializers["vocab"] = load_vocab
    deserializers["tokenizer"] = load_tokenizer
    for name, proc in self.pipeline:
        if name in disable or not hasattr(proc, "from_disk"):
            continue
        # `proc` is bound as a default so each loader keeps its component.
        deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
    exclude = dict.fromkeys(disable, False)
    vocab_dir = path / "vocab"
    if not vocab_dir.exists():
        # No vocab directory on disk: mark the vocab entry as excluded.
        exclude["vocab"] = True
    util.from_disk(path, deserializers, exclude)
    self._path = path
    return self
|
2017-05-17 10:04:50 +00:00
|
|
|
|
2017-10-17 16:18:10 +00:00
|
|
|
def to_bytes(self, disable=tuple(), **exclude):
    """Serialize the current state to a binary string.

    disable (list): Names of pipeline components to disable and prevent
        from being serialized.
    **exclude: Keyword arguments forwarded unchanged to `util.to_bytes` as
        its exclude mapping.
    RETURNS (bytes): The serialized form of the `Language` object.
    """
    serializers = OrderedDict(
        (
            ("vocab", lambda: self.vocab.to_bytes()),
            ("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
            ("meta", lambda: json_dumps(self.meta)),
        )
    )
    # Pipeline components are keyed by their position in the pipeline (not
    # by name); `from_bytes` uses the same positional keys.
    for i, (name, proc) in enumerate(self.pipeline):
        if name in disable:
            continue
        if not hasattr(proc, "to_bytes"):
            continue
        # Bind `proc` as a default argument so each callback keeps its own
        # component instead of the loop's last value.
        serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
    return util.to_bytes(serializers, exclude)
|
2017-04-15 10:05:47 +00:00
|
|
|
|
2017-05-26 10:33:54 +00:00
|
|
|
def from_bytes(self, bytes_data, disable=tuple()):
    """Load state from a binary string.

    bytes_data (bytes): The data to load from.
    disable (list): Names of the pipeline components to disable.
    RETURNS (Language): The `Language` object.
    """
    deserializers = OrderedDict(
        (
            ("meta", lambda b: self.meta.update(ujson.loads(b))),
            (
                "vocab",
                # The vectors-name fix-up only runs when `from_bytes`
                # returns a truthy value (short-circuit `and`).
                lambda b: (
                    self.vocab.from_bytes(b) and _fix_pretrained_vectors_name(self)
                ),
            ),
            ("tokenizer", lambda b: self.tokenizer.from_bytes(b, vocab=False)),
        )
    )
    # Pipeline components are keyed by their position in the pipeline (not
    # by name), matching the keys written by `to_bytes`.
    for i, (name, proc) in enumerate(self.pipeline):
        if name in disable:
            continue
        if not hasattr(proc, "from_bytes"):
            continue
        # Bind `proc` as a default argument so each callback keeps its own
        # component instead of the loop's last value.
        deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
    util.from_bytes(bytes_data, deserializers, {})
    return self
|
2017-05-21 23:43:31 +00:00
|
|
|
|
2017-05-29 09:45:45 +00:00
|
|
|
|
2018-03-28 14:02:59 +00:00
|
|
|
def _fix_pretrained_vectors_name(nlp):
    """Set `nlp.vocab.vectors.name` and record it on pipeline components.

    The name is chosen in this order: the `name` entry of
    `nlp.meta["vectors"]` if present and truthy; `None` if the vectors table
    has no size; otherwise "<lang>_<name>.vectors" built from `nlp.meta`.
    Raises ValueError (Errors.E092) if none of these apply. When the vectors
    have a non-zero size, `link_vectors_to_models` is called on the vocab.
    Finally the resulting name is stored under
    `cfg["deprecation_fixes"]["vectors_name"]` of every component that has a
    `cfg` attribute.

    nlp (Language): The object whose vocab and pipeline are updated in place.
    """
    # TODO: Replace this once we handle vectors consistently as static
    # data
    if "vectors" in nlp.meta and nlp.meta["vectors"].get("name"):
        resolved_name = nlp.meta["vectors"]["name"]
    elif not nlp.vocab.vectors.size:
        resolved_name = None
    elif "name" in nlp.meta and "lang" in nlp.meta:
        resolved_name = "%s_%s.vectors" % (nlp.meta["lang"], nlp.meta["name"])
    else:
        raise ValueError(Errors.E092)
    nlp.vocab.vectors.name = resolved_name

    if nlp.vocab.vectors.size != 0:
        link_vectors_to_models(nlp.vocab)

    for _, component in nlp.pipeline:
        if hasattr(component, "cfg"):
            fixes = component.cfg.setdefault("deprecation_fixes", {})
            fixes["vectors_name"] = nlp.vocab.vectors.name
|
2018-03-28 14:02:59 +00:00
|
|
|
|
2017-05-29 09:45:45 +00:00
|
|
|
|
2017-10-25 11:46:41 +00:00
|
|
|
class DisabledPipes(list):
|
2017-10-27 12:40:14 +00:00
|
|
|
"""Manager for temporary pipeline disabling."""
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 16:03:03 +00:00
|
|
|
|
2017-10-25 11:46:41 +00:00
|
|
|
def __init__(self, nlp, *names):
|
|
|
|
self.nlp = nlp
|
|
|
|
self.names = names
|
|
|
|
# Important! Not deep copy -- we just want the container (but we also
|
|
|
|
# want to support people providing arbitrarily typed nlp.pipeline
|
|
|
|
# objects.)
|
2017-10-27 19:07:59 +00:00
|
|
|
self.original_pipeline = copy(nlp.pipeline)
|
2017-10-25 11:46:41 +00:00
|
|
|
list.__init__(self)
|
|
|
|
self.extend(nlp.remove_pipe(name) for name in names)
|
|
|
|
|
|
|
|
def __enter__(self):
|
2017-10-25 12:56:16 +00:00
|
|
|
return self
|
2017-10-25 11:46:41 +00:00
|
|
|
|
|
|
|
def __exit__(self, *args):
|
|
|
|
self.restore()
|
|
|
|
|
|
|
|
def restore(self):
    """Restore the pipeline to its state when DisabledPipes was created.

    Puts `self.original_pipeline` back on `self.nlp` and empties this list.

    RAISES (ValueError): If the pipeline being replaced contains a name for
        which `self.nlp.has_pipe(name)` is false once the original pipeline
        is back in place. In that case the pipeline is left exactly as it
        was before the call and this list is not emptied.
    """
    # Swap first: `has_pipe` below is asked about the *restored* pipeline
    # (presumably it consults `nlp.pipeline` -- confirm in `has_pipe`).
    current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
    unexpected = [name for name, _ in current if not self.nlp.has_pipe(name)]
    if unexpected:
        # Don't change the pipeline if we're raising an error.
        self.nlp.pipeline = current
        raise ValueError(Errors.E008.format(names=unexpected))
    self[:] = []
|
|
|
|
|
|
|
|
|
2017-05-21 23:43:31 +00:00
|
|
|
def _pipe(func, docs):
    """Lazily apply `func` to every item of `docs`.

    func: Callable taking one item and returning the value to yield.
    docs: Iterable of items; consumed one at a time as the generator is
        advanced.
    YIELDS: `func(doc)` for each `doc`, in input order.
    """
    for doc in docs:
        yield func(doc)
|