# spaCy/spacy/lang/en/__init__.py

# coding: utf8
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .morph_rules import MORPH_RULES
from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language
from ...attrs import LANG
from ...util import update_exc


def _return_en(_):
    return "en"


class EnglishDefaults(Language.Defaults):
    """Default language data for English.

    Bundles the lexical attribute getters, tokenizer exceptions, tag map,
    stop words, morphology rules and syntax iterators defined in the sibling
    modules of this package.
    """

    # Start from a copy of the shared getters so the base defaults are not
    # mutated, apply the English overrides, and set LANG last so that nothing
    # in LEX_ATTRS can clobber it.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = _return_en

    # English-specific exceptions are layered on top of the base exceptions.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)

    # The language data objects are referenced directly, not copied.
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    syntax_iterators = SYNTAX_ITERATORS

    # Interchangeable spellings for single tokens, grouped by tag.
    # NOTE(review): presumably consumed by orth-variant data augmentation --
    # confirm against the callers.
    single_orth_variants = [
        {"tags": ["NFP"], "variants": ["…", "..."]},
        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
    ]
    # Interchangeable (opening, closing) quote pairs, grouped by tag.
    paired_orth_variants = [
        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
    ]

    @classmethod
    def is_base_form(cls, univ_pos, morphology=None):
        """
        Decide whether a token is already in its uninflected form, in which
        case lemmatization can be skipped entirely.

        univ_pos (unicode / int): The token's universal part-of-speech tag.
            Only the lowercase strings "noun" and "verb" are given special
            treatment here.
        morphology (dict): The token's morphological features following the
            Universal Dependencies scheme. None is treated as no features.
        RETURNS (bool): True if the features describe a base form.
        """
        feats = {} if morphology is None else morphology
        verb_form = feats.get("VerbForm")
        degree = feats.get("Degree")
        number = feats.get("Number")
        tense = feats.get("Tense")

        base_form_signals = (
            # Singular nouns.
            univ_pos == "noun" and number == "sing",
            # Present-tense finite verbs with no Number feature. This maps
            # 'VBP' to base form -- probably just need 'IS_BASE' morphology.
            univ_pos == "verb"
            and verb_form == "fin"
            and tense == "pres"
            and number is None,
            # The remaining signals apply whatever the part-of-speech is, so
            # they also cover infinitive verbs and positive-degree adjectives.
            verb_form == "inf",
            verb_form == "none",
            degree == "pos",
        )
        return any(base_form_signals)


class English(Language):
    """Language subclass for English: code "en", using EnglishDefaults."""

    Defaults = EnglishDefaults
    lang = "en"


# Public API of this module: only the English language class is exported.
__all__ = ["English"]
-												Use consistent unicode declarations

											
										
										
											2017-03-12 12:07:28 +00:00
+								# coding: utf8
-												Fix formatting and remove unused imports

											
										
										
											2017-03-15 16:33:39 +00:00
+								from __future__ import unicode_literals
-												* Begin refactor

											
										
										
											2015-07-07 12:00:07 +00:00
-												Reorganise English language data

											
										
										
											2017-05-08 13:47:25 +00:00
+								from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 								from .tag_map import TAG_MAP
 								from .stop_words import STOP_WORDS
-												Add English lex_attrs overrides

											
										
										
											2017-05-08 23:09:52 +00:00
+								from .lex_attrs import LEX_ATTRS
-												Reorganise English language data

											
										
										
											2017-05-08 13:47:25 +00:00
+								from .morph_rules import MORPH_RULES
-												Add language-specific syntax iterators to en and de

											
										
										
											2017-05-17 09:37:48 +00:00
+								from .syntax_iterators import SYNTAX_ITERATORS
-												Reorganise English language data

											
										
										
											2017-05-08 13:47:25 +00:00
-												Fix relative imports

											
										
										
											2017-05-08 20:29:04 +00:00
+								from ..tokenizer_exceptions import BASE_EXCEPTIONS
 								from ...language import Language
-												Reduce stored lexemes data, move feats to lookups (#5238)

* Reduce stored lexemes data, move feats to lookups

* Move non-derivable lexemes features (`norm / cluster / prob`) to
`spacy-lookups-data` as lookups
  * Get/set `norm` in both lookups and `LexemeC`, serialize in lookups
  * Remove `cluster` and `prob` from `LexemesC`, get/set/serialize in
    lookups only
* Remove serialization of lexemes data as `vocab/lexemes.bin`
  * Remove `SerializedLexemeC`
  * Remove `Lexeme.to_bytes/from_bytes`
* Modify normalization exception loading:
  * Always create `Vocab.lookups` table `lexeme_norm` for
    normalization exceptions
  * Load base exceptions from `lang.norm_exceptions`, but load
    language-specific exceptions from lookups
  * Set `lex_attr_getter[NORM]` including new lookups table in
    `BaseDefaults.create_vocab()` and when deserializing `Vocab`
* Remove all cached lexemes when deserializing vocab to override
  existing normalizations with the new normalizations (as a replacement
  for the previous step that replaced all lexemes data with the
  deserialized data)

* Skip English normalization test

Skip English normalization test because the data is now in
`spacy-lookups-data`.

* Remove norm exceptions

Moved to spacy-lookups-data.

* Move norm exceptions test to spacy-lookups-data

* Load extra lookups from spacy-lookups-data lazily

Load extra lookups (currently for cluster and prob) lazily from the
entry point `lg_extra` as `Vocab.lookups_extra`.

* Skip creating lexeme cache on load

To improve model loading times, do not create the full lexeme cache when
loading. The lexemes will be created on demand when processing.

* Identify numeric values in Lexeme.set_attrs()

With the removal of a special case for `PROB`, also identify `float` to
avoid trying to convert it with the `StringStore`.

* Skip lexeme cache init in from_bytes

* Unskip and update lookups tests for python3.6+

* Update vocab pickle to include lookups_extra

* Update vocab serialization tests

Check strings rather than lexemes since lexemes aren't initialized
automatically, account for addition of "_SP".

* Re-skip lookups test because of python3.5

* Skip PROB/float values in Lexeme.set_attrs

* Convert is_oov from lexeme flag to lex in vectors

Instead of storing `is_oov` as a lexeme flag, `is_oov` reports whether
the lexeme has a vector.

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
											
										
										
											2020-05-19 13:59:14 +00:00
+								from ...attrs import LANG
 								from ...util import update_exc
-												* Restore the LOCAL_DATA_DIR global in spacy/en/__init__.py, although this is now deprecated

											
										
										
											2016-01-19 01:54:56 +00:00
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
-												Make lambda func a named function, for pickling

											
										
										
											2017-10-17 16:21:20 +00:00
+								def _return_en(_):
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    return "en"
-												Fix formatting and remove unused imports

											
										
										
											2017-03-15 16:33:39 +00:00
-												Move EnglishDefaults class out of English

											
										
										
											2017-05-20 07:18:19 +00:00
+								class EnglishDefaults(Language.Defaults):
 								    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-												Reorder setting of lex attrs, to avoid clobbering

											
										
										
											2017-06-03 19:47:55 +00:00
+								    lex_attr_getters.update(LEX_ATTRS)
-												Make lambda func a named function, for pickling

											
										
										
											2017-10-17 16:21:20 +00:00
+								    lex_attr_getters[LANG] = _return_en
-												Move EnglishDefaults class out of English

											
										
										
											2017-05-20 07:18:19 +00:00
+								    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-												Don't make copies of language data components

											
										
										
											2017-10-11 13:34:55 +00:00
+								    tag_map = TAG_MAP
 								    stop_words = STOP_WORDS
 								    morph_rules = MORPH_RULES
 								    syntax_iterators = SYNTAX_ITERATORS
-												Tidy up and auto-format

											
										
										
											2019-09-11 12:00:36 +00:00
+								    single_orth_variants = [
 								        {"tags": ["NFP"], "variants": ["…", "..."]},
 								        {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
 								    ]
 								    paired_orth_variants = [
 								        {"tags": ["``", "''"], "variants": [("'", "'"), ("‘", "’")]},
 								        {"tags": ["``", "''"], "variants": [('"', '"'), ("“", "”")]},
 								    ]
-												Move EnglishDefaults class out of English

											
										
										
											2017-05-20 07:18:19 +00:00
-												Fix lemmatizer is_base_form for python2.7 (#5734)

* Fix lemmatizer init args for python2.7

* Move English is_base_form to a class method

* Skip test pickling PhraseMatcher for python2
											
										
										
											2020-07-09 20:11:24 +00:00
+								    @classmethod
 								    def is_base_form(cls, univ_pos, morphology=None):
 								        """
 								        Check whether we're dealing with an uninflected paradigm, so we can
 								        avoid lemmatization entirely.
 								        univ_pos (unicode / int): The token's universal part-of-speech tag.
 								        morphology (dict): The token's morphological features following the
 								            Universal Dependencies scheme.
 								        """
 								        if morphology is None:
 								            morphology = {}
 								        if univ_pos == "noun" and morphology.get("Number") == "sing":
 								            return True
 								        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
 								            return True
 								        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
 								        # morphology
 								        elif univ_pos == "verb" and (
 								            morphology.get("VerbForm") == "fin"
 								            and morphology.get("Tense") == "pres"
 								            and morphology.get("Number") is None
 								        ):
 								            return True
 								        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
 								            return True
 								        elif morphology.get("VerbForm") == "inf":
 								            return True
 								        elif morphology.get("VerbForm") == "none":
 								            return True
 								        elif morphology.get("Degree") == "pos":
 								            return True
 								        else:
 								            return False
-												Move EnglishDefaults class out of English

											
										
										
											2017-05-20 07:18:19 +00:00
-												* Use language base class

											
										
										
											2015-08-25 13:37:30 +00:00
+								class English(Language):
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								    lang = "en"
-												Move EnglishDefaults class out of English

											
										
										
											2017-05-20 07:18:19 +00:00
+								    Defaults = EnglishDefaults
-												Use lemmatizer in code, not from downloaded model.

											
										
										
											2017-03-15 09:52:50 +00:00
-												Untested fix for issue #684: GloVe vectors hack should be inserted in English, not in spacy.load.

											
										
										
											2016-12-18 21:29:31 +00:00
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.

At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
+								__all__ = ["English"]