2018-07-24 21:38:44 +00:00
|
|
|
|
import pytest
|
|
|
|
|
import random
|
2020-07-20 12:49:54 +00:00
|
|
|
|
|
|
|
|
|
from spacy import util
|
|
|
|
|
from spacy.gold import Example
|
2018-07-24 21:38:44 +00:00
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
|
from spacy.attrs import IS_PUNCT, ORTH, LOWER
|
2020-01-23 21:01:54 +00:00
|
|
|
|
from spacy.symbols import POS, VERB
|
2018-07-24 21:38:44 +00:00
|
|
|
|
from spacy.vocab import Vocab
|
2020-07-20 12:49:54 +00:00
|
|
|
|
from spacy.lang.en import English
|
2018-07-24 21:38:44 +00:00
|
|
|
|
from spacy.lemmatizer import Lemmatizer
|
2019-10-01 19:36:04 +00:00
|
|
|
|
from spacy.lookups import Lookups
|
2019-02-15 09:29:44 +00:00
|
|
|
|
from spacy.tokens import Doc, Span
|
2020-07-22 11:42:59 +00:00
|
|
|
|
from spacy.lang.en.lemmatizer import is_base_form
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
from ..util import get_doc, make_tempdir
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize(
    "patterns",
    [
        [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
        [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]],
    ],
)
def test_issue118(en_tokenizer, patterns):
    """Overlapping matches that share an end token: only the first is set as an entity.

    Both orderings of the two patterns are expected to yield the same matches.
    """
    sentence = (
        "how many points did lebron james score against the boston celtics last night"
    )
    doc = en_tokenizer(sentence)
    org_label = doc.vocab.strings["ORG"]
    matcher = Matcher(doc.vocab)
    matcher.add("BostonCeltics", patterns)
    # No entities have been set on the freshly tokenized doc.
    assert not list(doc.ents)
    found = []
    for _, start, end in matcher(doc):
        # Relabel every hit with the ORG id so it can be used as an entity.
        found.append((org_label, start, end))
    assert found == [(org_label, 9, 11), (org_label, 10, 11)]
    # Only the first match is assigned; the second overlaps it on token 10.
    doc.ents = found[:1]
    entities = list(doc.ents)
    assert len(entities) == 1
    only_entity = entities[0]
    assert only_entity.label == org_label
    assert only_entity.start == 9
    assert only_entity.end == 11
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize(
    "patterns",
    [
        [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
        [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]],
    ],
)
def test_issue118_prefix_reorder(en_tokenizer, patterns):
    """Overlapping matches that share a start token: all but the first are added as entities.

    Both orderings of the two patterns are expected to yield the same matches.
    """
    sentence = (
        "how many points did lebron james score against the boston celtics last night"
    )
    doc = en_tokenizer(sentence)
    org_label = doc.vocab.strings["ORG"]
    matcher = Matcher(doc.vocab)
    matcher.add("BostonCeltics", patterns)
    # No entities have been set on the freshly tokenized doc.
    assert not list(doc.ents)
    hits = [(org_label, begin, stop) for _, begin, stop in matcher(doc)]
    # Drop the first hit and append the remainder to the (empty) entities.
    doc.ents = doc.ents + tuple(hits)[1:]
    assert hits == [(org_label, 9, 10), (org_label, 9, 11)]
    entities = doc.ents
    assert len(entities) == 1
    only_entity = entities[0]
    assert only_entity.label == org_label
    assert only_entity.start == 9
    assert only_entity.end == 11
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue242(en_tokenizer):
    """Test overlapping multi-word phrases.

    Two two-token patterns overlap on "safety"; both must match, but setting
    both matches as entities must raise ValueError.
    """
    doc = en_tokenizer(
        "There are different food safety standards in different countries."
    )
    matcher = Matcher(doc.vocab)
    matcher.add(
        "FOOD",
        [
            [{"LOWER": "food"}, {"LOWER": "safety"}],
            [{"LOWER": "safety"}, {"LOWER": "standards"}],
        ],
    )
    matches = [(key, begin, stop) for key, begin, stop in matcher(doc)]
    # Exactly two matches are expected; unpacking fails otherwise.
    first, second = matches
    assert first[1] == 3
    assert first[2] == 5
    assert second[1] == 4
    assert second[2] == 6
    with pytest.raises(ValueError):
        # One token can only be part of one entity, so test that the matches
        # can't be added as entities
        doc.ents = doc.ents + tuple(matches)
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue309(en_tokenizer):
    """Test Issue #309: sentence iteration on a doc built from a single-space text."""
    tokens = en_tokenizer(" ")
    words = [tok.text for tok in tokens]
    doc = get_doc(tokens.vocab, words=words, heads=[0], deps=["ROOT"])
    doc.is_parsed = True
    assert len(doc) == 1
    sentences = list(doc.sents)
    assert len(sentences) == 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue351(en_tokenizer):
    """Test that leading whitespace forms the first token and shifts later offsets.

    The first token must start at character 0 and be three characters long,
    and the second token must start at character offset 3.
    """
    # The text has to begin with exactly three spaces: the assertions below
    # require a 3-character first token followed by a token at offset 3.
    doc = en_tokenizer("   This is a cat.")
    leading = doc[0]
    assert leading.idx == 0
    assert len(leading) == 3
    assert doc[1].idx == 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue360(en_tokenizer):
    """Test tokenization of big ellipsis: the text must split into more than two tokens."""
    doc = en_tokenizer("$45...............Asking")
    token_count = len(doc)
    assert token_count > 2
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
def test_issue361(en_vocab, text1, text2):
    """Test Issue #361: Equality of lexemes"""
    # Two lookups of the same string must compare equal ...
    first_lookup = en_vocab[text1]
    second_lookup = en_vocab[text1]
    assert first_lookup == second_lookup
    # ... while lookups of different strings must not.
    assert en_vocab[text1] != en_vocab[text2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue587(en_tokenizer):
    """Test that Matcher doesn't segfault on particular input"""
    doc = en_tokenizer("a b; c")
    matcher = Matcher(doc.vocab)
    # Each stage registers one more pattern and states how many matches the
    # matcher is expected to report afterwards.
    stages = [
        ("TEST1", [{ORTH: "a"}, {ORTH: "b"}], 1),
        ("TEST2", [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}], 2),
        ("TEST3", [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}], 2),
    ]
    for key, pattern, expected_count in stages:
        matcher.add(key, [pattern])
        matches = matcher(doc)
        assert len(matches) == expected_count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue588(en_vocab):
    """Adding an empty token pattern to the Matcher must raise ValueError."""
    matcher = Matcher(en_vocab)
    empty_pattern = []
    with pytest.raises(ValueError):
        matcher.add("TEST", [empty_pattern])
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue590(en_vocab):
    """Test overlapping matches"""
    words = ["n", "=", "1", ";", "a", ":", "5", "%"]
    doc = Doc(en_vocab, words=words)
    matcher = Matcher(en_vocab)
    percent_pattern = [
        {"IS_ALPHA": True},
        {"ORTH": ":"},
        {"LIKE_NUM": True},
        {"ORTH": "%"},
    ]
    assignment_pattern = [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]
    # Both patterns are registered under the same key, in this order.
    matcher.add("ab", [percent_pattern])
    matcher.add("ab", [assignment_pattern])
    assert len(matcher(doc)) == 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue595():
    """Test lemmatization of base forms"""
    # Lookup tables: a single verb rule ("ed" -> "e"), empty index and exceptions.
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]})
    lookups.add_table("lemma_index", {"verb": {}})
    lookups.add_table("lemma_exc", {"verb": {}})
    lemmatizer = Lemmatizer(lookups, is_base_form=is_base_form)
    vocab = Vocab(
        lemmatizer=lemmatizer,
        tag_map={"VB": {POS: VERB, "VerbForm": "inf"}},
    )
    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
    verb = doc[2]
    verb.tag_ = "VB"
    assert verb.text == "feed"
    # Despite the "ed" -> "e" rule, the lemma is expected to stay "feed".
    assert verb.lemma_ == "feed"
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue599(en_vocab):
    """An empty doc flagged as tagged and parsed keeps is_parsed after a bytes round trip."""
    original = Doc(en_vocab)
    original.is_tagged = True
    original.is_parsed = True
    serialized = original.to_bytes()
    restored = Doc(original.vocab)
    restored.from_bytes(serialized)
    assert restored.is_parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue600():
    """Assign a tag defined in a custom tag map; the test passes if nothing raises."""
    tag_map = {"NN": {"pos": "NOUN"}}
    vocab = Vocab(tag_map=tag_map)
    doc = Doc(vocab, words=["hello"])
    first_token = doc[0]
    first_token.tag_ = "NN"
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue615(en_tokenizer):
    """Merging matched phrases from an on_match callback must leave labelled entities on the doc."""

    def merge_phrases(matcher, doc, i, matches):
        """Merge a phrase. We have to be careful here because we'll change the
        token indices. To avoid problems, merge all the phrases once we're called
        on the last match."""
        is_last_match = i == len(matches) - 1
        if not is_last_match:
            return None
        spans = [Span(doc, begin, stop, label=key) for key, begin, stop in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                # Labelled spans are tagged NNP; otherwise reuse the root's tag.
                if span.label_:
                    tag = "NNP"
                else:
                    tag = span.root.tag_
                retokenizer.merge(span, attrs={"tag": tag, "lemma": span.text})
                doc.ents = doc.ents + (span,)

    text = "The golf club is broken"
    pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
    label = "Sport_Equipment"
    doc = en_tokenizer(text)
    matcher = Matcher(doc.vocab)
    matcher.add(label, [pattern], on_match=merge_phrases)
    # Running the matcher triggers the callback, which sets doc.ents.
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    first_entity = entities[0]
    assert first_entity.label != 0
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
def test_issue736(en_tokenizer, text, number):
    """Test that times like "7am" are split into two tokens, the first of
    which has the number as its text."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    leading_token = doc[0]
    assert leading_token.text == number
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
def test_issue740(en_tokenizer, text):
    """Test that dates are not split and kept as one token. This behaviour is
    currently inconsistent, since dates separated by hyphens are still split.
    This will be hard to prevent without causing clashes with numeric ranges."""
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue743():
    """A token stored in a set must come back out as the very same object."""
    doc = Doc(Vocab(), ["hello", "world"])
    token = doc[0]
    token_set = {token}
    retrieved = list(token_set)
    assert retrieved[0] is token
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
def test_issue744(en_tokenizer, text):
    """Test that 'were' and 'Were' are excluded from the contractions
    generated by the English tokenizer exceptions."""
    doc = en_tokenizer(text)
    assert len(doc) == 3
    middle_token = doc[1]
    assert middle_token.text.lower() == "were"
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize(
    "text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
)
def test_issue759(en_tokenizer, text, is_num):
    """The first token's like_num flag must equal the expected value for each text."""
    doc = en_tokenizer(text)
    first_token = doc[0]
    assert first_token.like_num == is_num
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
def test_issue775(en_tokenizer, text):
    """Test that 'Shell' and 'shell' are excluded from the contractions
    generated by the English tokenizer exceptions."""
    doc = en_tokenizer(text)
    assert len(doc) == 1
    sole_token = doc[0]
    assert sole_token.text == text
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
    doc = en_tokenizer(text)
    # Concatenating every token with its trailing whitespace must rebuild the input.
    rebuilt = "".join(token.text_with_ws for token in doc)
    assert rebuilt == text
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Test base case for Issue #792: Non-trailing whitespace"""
    doc = en_tokenizer(text)
    # Concatenating every token with its trailing whitespace must rebuild the input.
    rebuilt = "".join(token.text_with_ws for token in doc)
    assert rebuilt == text
|
|
|
|
|
|
|
|
|
|
|
2020-07-22 11:42:59 +00:00
|
|
|
|
@pytest.mark.skip(
    reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218"
)
@pytest.mark.parametrize(
    "text,tokens",
    [
        ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
        ("exception;--exclusive", ["exception", ";--", "exclusive"]),
        ("day.--Is", ["day", ".--", "Is"]),
        ("refinement:--just", ["refinement", ":--", "just"]),
        ("memories?--To", ["memories", "?--", "To"]),
        ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
        ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
    ],
)
def test_issue801(en_tokenizer, text, tokens):
    """Test that special characters + hyphens are split correctly."""
    doc = en_tokenizer(text)
    # Check the count first, then the exact token texts.
    assert len(doc) == len(tokens)
    produced = [token.text for token in doc]
    assert produced == tokens
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize(
    "text,expected_tokens",
    [
        (
            "Smörsåsen används bl.a. till fisk",
            ["Smörsåsen", "används", "bl.a.", "till", "fisk"],
        ),
        (
            "Jag kommer först kl. 13 p.g.a. diverse förseningar",
            ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
        ),
    ],
)
def test_issue805(sv_tokenizer, text, expected_tokens):
    """The Swedish tokenizer must keep abbreviations such as "bl.a." as single tokens."""
    doc = sv_tokenizer(text)
    # Whitespace tokens are ignored when comparing against the expectation.
    non_space_texts = []
    for token in doc:
        if not token.is_space:
            non_space_texts.append(token.text)
    assert expected_tokens == non_space_texts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue850():
    """The variable-length pattern matches the succeeding token. Check we
    handle the ambiguity correctly."""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    # The middle element is an unconstrained wildcard with the "*" operator.
    matcher.add("FarAway", [[{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]])
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    results = matcher(doc)
    assert len(results) == 1
    _, begin, stop = results[0]
    assert begin == 0
    assert stop == 4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue850_basic():
    """Check the `*` operator on a token spec constrained by LOWER.

    The pattern is "bob", any number of "and" tokens, then "frank".
    Exactly one match is expected, spanning tokens 0 to 4 of the doc.
    """
    words = ["bob", "and", "and", "frank"]
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    matcher.add(
        "FarAway",
        [[{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]],
    )
    matches = matcher(Doc(matcher.vocab, words=words))
    assert len(matches) == 1
    # The match ID is not checked here, only the span boundaries.
    _, start, end = matches[0]
    assert start == 0
    assert end == 4
|
|
|
|
|
|
|
|
|
|
|
2019-08-20 15:36:34 +00:00
|
|
|
|
@pytest.mark.skip(
    reason="French exception list is not enabled in the default tokenizer anymore"
)
@pytest.mark.parametrize(
    "text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
)
def test_issue852(fr_tokenizer, text):
    """Check that French tokenizer exceptions keep each hyphenated word whole.

    Currently skipped; see the skip reason on the decorator.
    """
    doc = fr_tokenizer(text)
    assert len(doc) == 1
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize(
    "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
)
def test_issue859(en_tokenizer, text):
    """Round-trip check: `doc.text` must reproduce the input with no extra space."""
    assert en_tokenizer(text).text == text
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
    """Check that `token.idx` points at the token's first character in the input.

    Also checks, for this newline-containing text, that every token's text has
    the same length as its text with trailing whitespace.
    """
    for token in en_tokenizer(text):
        surface = token.text
        assert len(surface) == len(token.text_with_ws)
        assert text[token.idx] == surface[0]
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize("text", ["want/need"])
def test_issue891(en_tokenizer, text):
    """Check that a `/` infix is split off as its own, middle token."""
    doc = en_tokenizer(text)
    assert len(doc) == 3
    middle = doc[1]
    assert middle.text == "/"
|
|
|
|
|
|
|
|
|
|
|
2018-11-27 00:09:36 +00:00
|
|
|
|
@pytest.mark.parametrize(
    "text,tag,lemma",
    [
        ("anus", "NN", "anus"),
        ("princess", "NN", "princess"),
        ("inner", "JJ", "inner"),
    ],
)
def test_issue912(en_vocab, text, tag, lemma):
    """Check that words already in base form lemmatize to themselves.

    A one-word doc is built, the fine-grained tag is set on its only token,
    and the resulting lemma is compared with the expected one.
    """
    token = Doc(en_vocab, words=[text])[0]
    token.tag_ = tag
    assert token.lemma_ == lemma
|
|
|
|
|
|
|
|
|
|
|
2019-02-01 07:05:22 +00:00
|
|
|
|
@pytest.mark.slow
def test_issue957(en_tokenizer):
    """Check that tokenization does not hang on long number/punctuation chains.

    For each punctuation character, the numbers 0..99 are joined by that
    character and the resulting string is tokenized.

    If this test hangs, check (new) regular expressions for conflicting greedy
    operators.
    """
    # Skip test if pytest-timeout is not installed
    pytest.importorskip("pytest_timeout")
    numbers = [str(i) for i in range(100)]
    for punct in (".", ",", "'", '"', ":", "?", "!", ";", "-"):
        # Yields "0<p>1<p>2...<p>99", i.e. "0" followed by punct + str(i) for 1..99.
        doc = en_tokenizer(punct.join(numbers))
        assert doc
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
|
2020-07-20 12:49:54 +00:00
|
|
|
|
def test_issue999():
|
2018-07-24 21:38:44 +00:00
|
|
|
|
"""Test that adding entities and resuming training works passably OK.
|
|
|
|
|
There are two issues here:
|
2019-10-14 10:28:53 +00:00
|
|
|
|
1) We have to re-add labels. This isn't very nice.
|
2018-07-24 21:38:44 +00:00
|
|
|
|
2) There's no way to set the learning rate for the weight update, so we
|
|
|
|
|
end up out-of-scale, causing it to learn too fast.
|
|
|
|
|
"""
|
|
|
|
|
TRAIN_DATA = [
|
|
|
|
|
["hey", []],
|
|
|
|
|
["howdy", []],
|
|
|
|
|
["hey there", []],
|
|
|
|
|
["hello", []],
|
|
|
|
|
["hi", []],
|
|
|
|
|
["i'm looking for a place to eat", []],
|
2020-07-20 12:49:54 +00:00
|
|
|
|
["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
|
|
|
|
|
["show me chinese restaurants", [(8, 15, "CUISINE")]],
|
|
|
|
|
["show me chines restaurants", [(8, 14, "CUISINE")]],
|
2018-07-24 21:38:44 +00:00
|
|
|
|
]
|
2020-07-20 12:49:54 +00:00
|
|
|
|
nlp = English()
|
2020-07-22 11:42:59 +00:00
|
|
|
|
ner = nlp.add_pipe("ner")
|
2018-07-24 21:38:44 +00:00
|
|
|
|
for _, offsets in TRAIN_DATA:
|
|
|
|
|
for start, end, label in offsets:
|
|
|
|
|
ner.add_label(label)
|
|
|
|
|
nlp.begin_training()
|
2020-07-20 12:49:54 +00:00
|
|
|
|
for itn in range(20):
|
2018-07-24 21:38:44 +00:00
|
|
|
|
random.shuffle(TRAIN_DATA)
|
|
|
|
|
for raw_text, entity_offsets in TRAIN_DATA:
|
2020-07-22 11:42:59 +00:00
|
|
|
|
example = Example.from_dict(
|
|
|
|
|
nlp.make_doc(raw_text), {"entities": entity_offsets}
|
|
|
|
|
)
|
2020-07-20 12:49:54 +00:00
|
|
|
|
nlp.update([example])
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
with make_tempdir() as model_dir:
|
|
|
|
|
nlp.to_disk(model_dir)
|
2020-07-20 12:49:54 +00:00
|
|
|
|
nlp2 = util.load_model_from_path(model_dir)
|
2018-07-24 21:38:44 +00:00
|
|
|
|
|
|
|
|
|
for raw_text, entity_offsets in TRAIN_DATA:
|
|
|
|
|
doc = nlp2(raw_text)
|
|
|
|
|
ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
|
|
|
|
|
for start, end, label in entity_offsets:
|
|
|
|
|
if (start, end) in ents:
|
|
|
|
|
assert ents[(start, end)] == label
|
|
|
|
|
break
|
2020-07-20 12:49:54 +00:00
|
|
|
|
else:
|
|
|
|
|
if entity_offsets:
|
|
|
|
|
raise Exception(ents)
|