From 64e142f4606250bf03fa65a899da900a6c712ebd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 16 Jan 2017 14:23:08 +0100 Subject: [PATCH 01/30] Update about.py --- spacy/about.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 03b57a1df..77c33d5f2 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,12 +3,12 @@ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py -__title__ = 'spacy' +__title__ = 'spaCy' __version__ = '1.6.0' -__summary__ = 'Industrial-strength NLP' +__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' -__email__ = 'matt@spacy.io' +__email__ = 'matt@explosion.ai' __license__ = 'MIT' __models__ = { 'en': 'en>=1.1.0,<1.2.0', From df0aeff379f13f893db5abc9f315023f75b89ab7 Mon Sep 17 00:00:00 2001 From: jktong Date: Mon, 16 Jan 2017 09:34:59 -0500 Subject: [PATCH 02/30] Correct typo "chldren" in doc.jade --- website/docs/api/doc.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index fab1dd86b..72fe34f8c 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -51,14 +51,14 @@ p A container for accessing linguistic annotations. +cell dict +cell | A dictionary that allows customisation of properties of - | #[code Token] chldren. + | #[code Token] children. +row +cell #[code user_span_hooks] +cell dict +cell | A dictionary that allows customisation of properties of - | #[code Span] chldren. + | #[code Span] children. +h(2, "init") Doc.__init__ +tag method From 9fa6f9fb403dd1d02526c0ee2ba1e1c1954d6e15 Mon Sep 17 00:00:00 2001 From: Jason Kessler Date: Mon, 16 Jan 2017 13:31:35 -0600 Subject: [PATCH 03/30] Origin of spacy.matcher attributes Make it clear that Matcher attributes live in spacy.matcher.attrs. --- website/docs/usage/rule-based-matching.jade | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index fde9ee4d7..7650e4a03 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -18,7 +18,9 @@ p Here's a minimal example. We first add a pattern that specifies three tokens: p | Once we've added the pattern, we can use the #[code matcher] as a - | callable, to receive a list of #[code (ent_id, start, end)] tuples: + | callable, to receive a list of #[code (ent_id, start, end)] tuples. + | Note that #[code LOWER] and #[code IS_PUNCT] are data attributes + | of #[code Matcher.attrs]. +code. from spacy.matcher import Matcher From 8a615e8961a4c196352422d3cb0ea9c17715f4a1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 16 Jan 2017 20:43:52 +0100 Subject: [PATCH 04/30] Simplify and update pull request template --- .github/PULL_REQUEST_TEMPLATE.md | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a55f98646..a20b52a34 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,27 +1,19 @@ ## Description - + + -## Motivation and Context - - - -## How Has This Been Tested? 
- - - ## Types of changes -- [ ] Bug fix (non-breaking change fixing an issue) -- [ ] New feature (non-breaking change adding functionality to spaCy) -- [ ] Breaking change (fix or feature causing change to spaCy's existing functionality) -- [ ] Documentation (Addition to documentation of spaCy) +- [ ] **Bug fix** (non-breaking change fixing an issue) +- [ ] **New feature** (non-breaking change adding functionality to spaCy) +- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) +- [ ] **Documentation** (Addition to documentation of spaCy) ## Checklist: -- [ ] My code follows spaCy's code style. - [ ] My change requires a change to spaCy's documentation. - [ ] I have updated the documentation accordingly. - [ ] I have added tests to cover my changes. From b50c499c04acfc9c58768d5a0b86a2820676291e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 16 Jan 2017 20:44:31 +0100 Subject: [PATCH 05/30] Fix consistency --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a20b52a34..e97a7ea16 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,7 +10,7 @@ - [ ] **Bug fix** (non-breaking change fixing an issue) - [ ] **New feature** (non-breaking change adding functionality to spaCy) - [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) -- [ ] **Documentation** (Addition to documentation of spaCy) +- [ ] **Documentation** (addition to documentation of spaCy) ## Checklist: From fb482ff049532d9b0e66eca51e32d64c2e1a424b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 16 Jan 2017 21:30:23 +0100 Subject: [PATCH 06/30] Fix typo --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2138bc5dd..9120c885f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -76,7 +76,7 @@ Next, create a test file named `test_issue[ISSUE NUMBER].py` in the [`spacy/test ## Adding tests -spaCy uses [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/spacy/tests/tokenizer`](spacy/tests/tokenizer). To be interpreted and run, all test files and test functions need to be prefixed with `test_`. +spaCy uses the [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/spacy/tests/tokenizer`](spacy/tests/tokenizer). To be interpreted and run, all test files and test functions need to be prefixed with `test_`. When adding tests, make sure to use descriptive names, keep the code short and concise and only test for one behaviour at a time. Try to `parametrize` test cases wherever possible, use our pre-defined fixtures for spaCy components and avoid unnecessary imports. 
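[Editorial illustration of the CONTRIBUTING.md conventions quoted above, not part of the patch set: a regression-style test following those rules would look roughly like the sketch below. The file name and test cases are hypothetical; `en_tokenizer` is one of the pre-defined fixtures provided in `spacy/tests/conftest.py`.]

```python
# coding: utf-8
# Minimal sketch only -- hypothetical file, e.g. spacy/tests/tokenizer/test_example.py.
# Illustrates the conventions above: test_ prefix, descriptive name, one behaviour
# per test, parametrized cases, and a shared tokenizer fixture instead of imports.
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize('text,length', [("Hello, world!", 4),
                                         ("This is a sentence.", 5)])
def test_tokenizer_splits_trailing_punct(en_tokenizer, text, length):
    """Test a single behaviour: punctuation is split into its own tokens."""
    tokens = en_tokenizer(text)
    assert len(tokens) == length
```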
From d704cfa60dc42cf8588cd51e2c4ea608519c806c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 16 Jan 2017 21:30:33 +0100 Subject: [PATCH 07/30] Fix typo --- spacy/tests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 7c73c79f9..489335153 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -2,7 +2,7 @@ # spaCy tests -spaCy uses [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). +spaCy uses the [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/tests/tokenizer`](tokenizer). All test modules (i.e. directories) also need to be listed in spaCy's [`setup.py`](../setup.py). To be interpreted and run, all test files and test functions need to be prefixed with `test_`. From 7e36568d5b8aeaf2c77e4643a793fdc13cb9ba51 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Jan 2017 00:51:09 +0100 Subject: [PATCH 08/30] Fix title to accommodate sputnik --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 77c33d5f2..d51dea286 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py -__title__ = 'spaCy' +__title__ = 'spacy' __version__ = '1.6.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' From ee456193078eb240601f78975331282ce99c90fc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Jan 2017 17:55:59 +0100 Subject: [PATCH 09/30] Fix formatting --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index 04a66f772..e086f9149 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -56,7 +56,7 @@ "V_CSS": "1.14", "V_JS": "1.0", - "DEFAULT_SYNTAX" : "python", + "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", "MAILCHIMP": { "user": "spacy.us12", From dbe8dafb52c538e34fb5eb5838ce1dcb2a8f1eb5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 17 Jan 2017 17:56:34 +0100 Subject: [PATCH 10/30] Fix logo width and height to avoid link overlap in Safari (resolves #748) --- website/_harp.json | 2 +- website/_includes/_mixins-base.jade | 2 +- website/assets/css/_base/_objects.sass | 2 +- website/assets/css/_variables.sass | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index e086f9149..f69191c88 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -54,7 +54,7 @@ } }, - "V_CSS": "1.14", + "V_CSS": "1.15", "V_JS": "1.0", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index bc8b85557..ea4d2964d 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -113,7 +113,7 @@ mixin gitter(button, label) //- Logo mixin logo() - +svg("graphics", "spacy", 500).o-logo&attributes(attributes) + +svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes) //- Landing diff 
--git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index 7aaaef787..1be4b17d5 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -83,7 +83,7 @@ //- Logo .o-logo - @include size($logo-width, auto) + @include size($logo-width, $logo-height) fill: currentColor vertical-align: middle margin: 0 0.5rem diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index 6844517d7..9029161e2 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -11,6 +11,7 @@ $aside-width: 30vw $aside-padding: 25px $logo-width: 85px +$logo-height: 27px $grid: ( quarter: 4, third: 3, half: 2, two-thirds: 1.5, three-quarters: 1.33 ) $breakpoints: ( sm: 768px, md: 992px, lg: 1200px ) From 7ec710af0ea0f7ef84ca1fce644a8e8fc6176709 Mon Sep 17 00:00:00 2001 From: Kevin Gao Date: Tue, 17 Jan 2017 10:35:55 -0800 Subject: [PATCH 11/30] Fix Custom Tokenizer docs - Fix mismatched quotations - Make it more clear where ORTH, LEMMA, and POS symbols come from - Make strings consistent - Fix lemma_ assertion s/-PRON-/me/ --- website/docs/usage/customizing-tokenizer.jade | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index c1a03a14a..d43fb438f 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -26,6 +26,9 @@ p | #[+api("tokenizer") #[code Tokenizer]] instance: +code. + import spacy + from spacy.symbols import ORTH, LEMMA, POS + nlp = spacy.load('en') assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that'] nlp.tokenizer.add_special_case(u'gimme', @@ -37,7 +40,7 @@ p { ORTH: u'me'}]) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] - assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] + assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] p | The special case doesn't have to match an entire whitespace-delimited @@ -52,9 +55,9 @@ p | The special case rules have precedence over the punctuation splitting: +code. - nlp.tokenizer.add_special_case(u"...gimme...?", + nlp.tokenizer.add_special_case(u'...gimme...?', [{ - ORTH: u'...gimme...?", LEMMA: "give", TAG: "VB"}]) + ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) assert len(nlp(u'...gimme...?')) == 1 p From 7806ebafd2de683aed70457987ba99b25138035a Mon Sep 17 00:00:00 2001 From: Hidekazu Oiwa Date: Tue, 17 Jan 2017 20:37:14 -0800 Subject: [PATCH 12/30] Fix the span doc typo Fix the typo in the span API doc. It explains the `end` of the span as the `start_char` description. --- website/docs/api/span.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index a07ee25d9..770ee3e9b 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -25,7 +25,7 @@ p A slice from a #[code Doc] object. +row +cell #[code start_char] +cell int - +cell The character offset for the end of the span. + +cell The character offset for the start of the span. 
+row +cell #[code end_char] From be2608527753002cbe7a2b81145140d296ce52dd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 19 Jan 2017 22:03:52 +1100 Subject: [PATCH 13/30] Fix missing import Closes #755 --- spacy/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/download.py b/spacy/download.py index af15e1867..694638149 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -1,6 +1,7 @@ from __future__ import print_function import sys +import shutil import sputnik from sputnik.package_list import (PackageNotFoundException, From 09ecc39b4e278c80cb9333fcd0fe1c61d6b83731 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 20 Jan 2017 15:11:31 +0100 Subject: [PATCH 14/30] Fix multi-line string of NUM_WORDS (resolves #759) --- spacy/orth.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 0f30c1136..1e7d87a7d 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -108,11 +108,12 @@ cpdef bint like_url(unicode string): # TODO: This should live in the language.orth -NUM_WORDS = set('zero one two three four five six seven eight nine ten' - 'eleven twelve thirteen fourteen fifteen sixteen seventeen' - 'eighteen nineteen twenty thirty forty fifty sixty seventy' - 'eighty ninety hundred thousand million billion trillion' - 'quadrillion gajillion bazillion'.split()) +NUM_WORDS = set(''' +zero one two three four five six seven eight nine ten eleven twelve thirteen +fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty +sixty seventy eighty ninety hundred thousand million billion trillion +quadrillion gajillion bazillion +'''.split()) cpdef bint like_number(unicode string): string = string.replace(',', '') string = string.replace('.', '') From 5f6f48e734ca150b2fe94783cd7b5e75419713f3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 20 Jan 2017 15:11:43 +0100 Subject: [PATCH 15/30] Add regression test for #759 --- spacy/tests/regression/test_issue759.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 spacy/tests/regression/test_issue759.py diff --git a/spacy/tests/regression/test_issue759.py b/spacy/tests/regression/test_issue759.py new file mode 100644 index 000000000..b7cf69f1a --- /dev/null +++ b/spacy/tests/regression/test_issue759.py @@ -0,0 +1,12 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True), + ("teneleven", False)]) +def test_issue759(en_tokenizer, text, is_num): + """Test that numbers are recognised correctly.""" + tokens = en_tokenizer(text) + assert tokens[0].like_num == is_num From dce8f5515ee4232af8161abcb9b06ebc5526d9cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Mon, 23 Jan 2017 18:28:01 +0100 Subject: [PATCH 16/30] Allow zero-width 'infix' token --- spacy/tokenizer.pyx | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e83c4a75..8f2f111e7 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -289,21 +289,18 @@ cdef class Tokenizer: infix_end = match.end() if infix_start == start: continue - if infix_start == infix_end: - msg = ("Tokenizer found a zero-width 'infix' token.\n" - "If you're using a built-in tokenizer, please\n" - "report this bug. 
If you're using a tokenizer\n" - "you developed, check your TOKENIZER_INFIXES\n" - "tuple.\n" - "String being matched: {string}\n" - "Language: {lang}") - raise ValueError(msg.format(string=string, lang=self.vocab.lang)) span = string[start:infix_start] tokens.push_back(self.vocab.get(tokens.mem, span), False) - - infix_span = string[infix_start:infix_end] - tokens.push_back(self.vocab.get(tokens.mem, infix_span), False) + + if infix_start != infix_end: + # If infix_start != infix_end, it means the infix + # token is non-empty. Empty infix tokens are useful + # for tokenization in some languages (see + # https://github.com/explosion/spaCy/issues/768) + infix_span = string[infix_start:infix_end] + tokens.push_back(self.vocab.get(tokens.mem, infix_span), False) + start = infix_end span = string[start:] tokens.push_back(self.vocab.get(tokens.mem, span), False) From 0967eb07bea28d84bac696de2c5ea6630424d92a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 23 Jan 2017 21:25:46 +0100 Subject: [PATCH 17/30] Add regression test for #768 --- spacy/tests/regression/test_issue768.py | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 spacy/tests/regression/test_issue768.py diff --git a/spacy/tests/regression/test_issue768.py b/spacy/tests/regression/test_issue768.py new file mode 100644 index 000000000..d8c8be80b --- /dev/null +++ b/spacy/tests/regression/test_issue768.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...language import Language +from ...attrs import LANG +from ...fr.language_data import TOKENIZER_EXCEPTIONS, STOP_WORDS +from ...language_data.punctuation import TOKENIZER_INFIXES, ALPHA + +import pytest + + +@pytest.fixture +def fr_tokenizer_w_infix(): + SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA) + + # create new Language subclass to add to default infixes + class French(Language): + lang = 'fr' + + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'fr' + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + infixes = TOKENIZER_INFIXES + [SPLIT_INFIX] + + return French.Defaults.create_tokenizer() + + +@pytest.mark.parametrize('text,expected_tokens', [("l'avion", ["l'", "avion"]), + ("j'ai", ["j'", "ai"])]) +def test_issue768(fr_tokenizer_w_infix, text, expected_tokens): + """Allow zero-width 'infix' token during the tokenization process.""" + tokens = fr_tokenizer_w_infix(text) + assert len(tokens) == 2 + assert [t.text for t in tokens] == expected_tokens From 55c9c62abc8afb4761d84f85eb76cc1612d8671f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 23 Jan 2017 21:27:49 +0100 Subject: [PATCH 18/30] Use relative import --- spacy/hu/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/hu/__init__.py b/spacy/hu/__init__.py index 652ea379c..a923c68e8 100644 --- a/spacy/hu/__init__.py +++ b/spacy/hu/__init__.py @@ -1,7 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from spacy.hu.tokenizer_exceptions import TOKEN_MATCH +from .tokenizer_exceptions import TOKEN_MATCH from .language_data import * from ..attrs import LANG from ..language import Language From 199ae106909a3d753f8b612327afeeb274afe87e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 23 Jan 2017 21:36:53 +0100 Subject: [PATCH 19/30] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 
9c34ed174..abe70e767 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -24,6 +24,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx) * Oleg Zd, [@olegzd](https://github.com/olegzd) * Pokey Rule, [@pokey](https://github.com/pokey) +* Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov [@savkov](https://github.com/savkov) From 902f136f18a807a0b84f7832df868856a20c3d76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 24 Jan 2017 09:47:13 +0100 Subject: [PATCH 20/30] Add support for elision in French --- spacy/fr/__init__.py | 2 ++ spacy/fr/punctuation.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 spacy/fr/punctuation.py diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py index 81584b926..33ac0e53a 100644 --- a/spacy/fr/__init__.py +++ b/spacy/fr/__init__.py @@ -7,6 +7,7 @@ from ..language import Language from ..attrs import LANG from .language_data import * +from .punctuation import TOKENIZER_INFIXES class French(Language): @@ -18,3 +19,4 @@ class French(Language): tokenizer_exceptions = TOKENIZER_EXCEPTIONS stop_words = STOP_WORDS + infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/fr/punctuation.py b/spacy/fr/punctuation.py new file mode 100644 index 000000000..ee4e5a861 --- /dev/null +++ b/spacy/fr/punctuation.py @@ -0,0 +1,16 @@ +# encoding: utf8 + +from __future__ import unicode_literals + +from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES + + +_ELISION = " ' ’ " +ELISION = _ELISION.strip().replace(' ', '').replace('\n', '') + +TOKENIZER_INFIXES += [ + r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION), +] + + +__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] From cf8474401bea3d407535416923c65b433d28fd40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 24 Jan 2017 09:47:26 +0100 Subject: [PATCH 21/30] Remove unused import statement --- spacy/fr/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py index 33ac0e53a..c119997b5 100644 --- a/spacy/fr/__init__.py +++ b/spacy/fr/__init__.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG From 1faaf698ca3fc33eb4bf8fc9e8ae87d4ec582486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 24 Jan 2017 09:51:29 +0100 Subject: [PATCH 22/30] Add infixes and abbreviation exceptions (fr) --- spacy/fr/language_data.py | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py index bbbeb1535..6eff8364f 100644 --- a/spacy/fr/language_data.py +++ b/spacy/fr/language_data.py @@ -4,6 +4,9 @@ from __future__ import unicode_literals from .. 
import language_data as base from ..language_data import strings_to_exc, update_exc +from .punctuation import ELISION + +from ..symbols import * from .stop_words import STOP_WORDS @@ -13,5 +16,53 @@ STOP_WORDS = set(STOP_WORDS) TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) +ABBREVIATIONS = { + "janv.": [ + {LEMMA: "janvier", ORTH: "janv."} + ], + "févr.": [ + {LEMMA: "février", ORTH: "févr."} + ], + "avr.": [ + {LEMMA: "avril", ORTH: "avr."} + ], + "juill.": [ + {LEMMA: "juillet", ORTH: "juill."} + ], + "sept.": [ + {LEMMA: "septembre", ORTH: "sept."} + ], + "oct.": [ + {LEMMA: "octobre", ORTH: "oct."} + ], + "nov.": [ + {LEMMA: "novembre", ORTH: "nov."} + ], + "déc.": [ + {LEMMA: "décembre", ORTH: "déc."} + ], +} + + +INFIXES_EXCEPTIONS_BASE = ["aujourd'hui", + "prud'homme", "prud'hommes", + "prud'homal", "prud'homaux", "prud'homale", + "prud'homales", + "prud'hommal", "prud'hommaux", "prud'hommale", + "prud'hommales", + "prud'homie", "prud'homies", + "prud'hommesque", "prud'hommesques", + "prud'hommesquement"] + +INFIXES_EXCEPTIONS = [] +for elision_char in ELISION: + INFIXES_EXCEPTIONS += [infix.replace("'", elision_char) + for infix in INFIXES_EXCEPTIONS_BASE] + +INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS] + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS)) +update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS) + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] From 1be9c0e724401dc2ff1b3f39534bb3c43a72544d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 24 Jan 2017 10:55:02 +0100 Subject: [PATCH 23/30] Add fr tokenization unit tests --- spacy/tests/conftest.py | 5 +++++ spacy/tests/fr/__init__.py | 1 + spacy/tests/fr/test_exceptions.py | 30 ++++++++++++++++++++++++++++++ spacy/tests/fr/test_text.py | 19 +++++++++++++++++++ 4 files changed, 55 insertions(+) create mode 100644 spacy/tests/fr/__init__.py create mode 100644 spacy/tests/fr/test_exceptions.py create mode 100644 spacy/tests/fr/test_text.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 72e93cf59..de7ecae9b 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -52,6 +52,11 @@ def de_tokenizer(): return German.Defaults.create_tokenizer() +@pytest.fixture +def fr_tokenizer(): + return French.Defaults.create_tokenizer() + + @pytest.fixture def hu_tokenizer(): return Hungarian.Defaults.create_tokenizer() diff --git a/spacy/tests/fr/__init__.py b/spacy/tests/fr/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/spacy/tests/fr/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/spacy/tests/fr/test_exceptions.py b/spacy/tests/fr/test_exceptions.py new file mode 100644 index 000000000..c633a9381 --- /dev/null +++ b/spacy/tests/fr/test_exceptions.py @@ -0,0 +1,30 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["aujourd'hui", "Aujourd'hui", "prud'hommes", + "prud’hommal"]) +def test_tokenizer_infix_exceptions(fr_tokenizer, text): + tokens = fr_tokenizer(text) + assert len(tokens) == 1 + + +@pytest.mark.parametrize('text,lemma', [("janv.", "janvier"), + ("juill.", "juillet"), + ("sept.", "septembre")]) +def test_tokenizer_handles_abbr(fr_tokenizer, text, lemma): + tokens = fr_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].lemma_ == lemma + + +def test_tokenizer_handles_exc_in_text(fr_tokenizer): + text = "Je suis allé au mois de 
janv. aux prud’hommes." + tokens = fr_tokenizer(text) + assert len(tokens) == 10 + assert tokens[6].text == "janv." + assert tokens[6].lemma_ == "janvier" + assert tokens[8].text == "prud’hommes" diff --git a/spacy/tests/fr/test_text.py b/spacy/tests/fr/test_text.py new file mode 100644 index 000000000..350fdae70 --- /dev/null +++ b/spacy/tests/fr/test_text.py @@ -0,0 +1,19 @@ +# encoding: utf8 + + +from __future__ import unicode_literals + + +def test_tokenizer_handles_long_text(fr_tokenizer): + text = """L'histoire du TAL commence dans les années 1950, bien que l'on puisse \ +trouver des travaux antérieurs. En 1950, Alan Turing éditait un article \ +célèbre sous le titre « Computing machinery and intelligence » qui propose ce \ +qu'on appelle à présent le test de Turing comme critère d'intelligence. \ +Ce critère dépend de la capacité d'un programme informatique de personnifier \ +un humain dans une conversation écrite en temps réel, de façon suffisamment \ +convaincante que l'interlocuteur humain ne peut distinguer sûrement — sur la \ +base du seul contenu de la conversation — s'il interagit avec un programme \ +ou avec un autre vrai humain.""" + + tokens = fr_tokenizer(text) + assert len(tokens) == 113 From a3c92e1bf623afc033ccd13ec596834b639cbd38 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Jan 2017 10:48:09 +0100 Subject: [PATCH 24/30] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index becec99ee..aa46cdad1 100644 --- a/README.rst +++ b/README.rst @@ -54,7 +54,7 @@ released under the MIT license. | **Usage questions**   | `StackOverflow `_, `Reddit usergroup                     | | | `_, `Gitter chat `_ | +---------------------------+------------------------------------------------------------------------------------------------------------+ -| **General discussion** |  `Reddit usergroup `_, | +| **General discussion** | `Reddit usergroup `_, | | | `Gitter chat `_  | +---------------------------+------------------------------------------------------------------------------------------------------------+ | **Commercial support** | contact@explosion.ai                                                                                     | From 209c37bbcf291f9c971640852392a877d435ee96 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Jan 2017 13:15:02 +0100 Subject: [PATCH 25/30] Exclude "shell" and "Shell" from English tokenizer exceptions (resolves #775) --- spacy/en/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 36bb0d7f0..c562b38ee 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -7,7 +7,7 @@ from ..language_data import PRON_LEMMA EXC = {} -EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "were", "Were", "Well", "well", "Whore", "whore"] +EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell", "were", "Were", "Well", "well", "Whore", "whore"] # Pronouns From 19501f3340127c7c874e551632c36e19ba2176d3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 25 Jan 2017 13:16:52 +0100 Subject: [PATCH 26/30] Add regression test for #775 --- spacy/tests/regression/test_issue775.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 spacy/tests/regression/test_issue775.py diff --git a/spacy/tests/regression/test_issue775.py b/spacy/tests/regression/test_issue775.py new file mode 100644 index 
000000000..fe1c89240 --- /dev/null +++ b/spacy/tests/regression/test_issue775.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["Shell", "shell"]) +def test_issue775(en_tokenizer, text): + """Test that 'Shell' and 'shell' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text From bdafb514c51d3c6aee0ad3ab5ac757ee5da8418e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:47:32 +0100 Subject: [PATCH 27/30] Update version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index f69191c88..a273cac0a 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.5", + "SPACY_VERSION": "1.6", "LATEST_NEWS": { "url": "https://explosion.ai/blog/spacy-user-survey", "title": "The results of the spaCy user survey" From baa6be8180eb1897ae1ddbbe0d93ad1614e646b7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:47:45 +0100 Subject: [PATCH 28/30] Update latest news to last blog post --- website/_harp.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index a273cac0a..e315d658c 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -14,8 +14,8 @@ "SPACY_VERSION": "1.6", "LATEST_NEWS": { - "url": "https://explosion.ai/blog/spacy-user-survey", - "title": "The results of the spaCy user survey" + "url": "https://explosion.ai/blog/deep-learning-formula-nlp", + "title": "The new deep learning formula for state-of-the-art NLP models" }, "SOCIAL": { From da3aca4020826d54befa6fc20c296631089c6368 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:48:29 +0100 Subject: [PATCH 29/30] Fix formatting --- website/docs/usage/_data.json | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 932abc99e..8bf5bfc98 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -232,7 +232,7 @@ "NLP with spaCy in 10 lines of code": { "url": "https://github.com/cytora/pycon-nlp-in-10-lines", "author": "Andraz Hribernik et al. 
(Cytora)", - "tags": [ "jupyter" ] + "tags": ["jupyter"] }, "Intro to NLP with spaCy": { "url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/", @@ -241,7 +241,7 @@ "NLP with spaCy and IPython Notebook": { "url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/", "author": "Dustin Miller (SharePoint)", - "tags": [ "jupyter" ] + "tags": ["jupyter"] }, "Getting Started with spaCy": { "url": "http://textminingonline.com/getting-started-with-spacy", @@ -254,7 +254,7 @@ "NLP (almost) From Scratch - POS Network with spaCy": { "url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html", "author": "Sujit Pal", - "tags": [ "gensim", "keras" ] + "tags": ["gensim", "keras"] }, "NLP tasks with various libraries": { "url": "http://clarkgrubb.com/nlp", @@ -270,44 +270,43 @@ "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": { "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb", "author": "Patrick Harrison (S&P Global)", - "tags": [ "jupyter", "gensim" ] + "tags": ["jupyter", "gensim"] }, - "Deep Learning with custom pipelines and Keras": { "url": "https://explosion.ai/blog/spacy-deep-learning-keras", "author": "Matthew Honnibal", - "tags": [ "keras", "sentiment" ] + "tags": ["keras", "sentiment"] }, "A decomposable attention model for Natural Language Inference": { "url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment", "author": "Matthew Honnibal", - "tags": [ "keras", "similarity" ] + "tags": ["keras", "similarity"] }, "Using the German model": { "url": "https://explosion.ai/blog/german-model", "author": "Wolfgang Seeker", - "tags": [ "multi-lingual" ] + "tags": ["multi-lingual"] }, "Sense2vec with spaCy and Gensim": { "url": "https://explosion.ai/blog/sense2vec-with-spacy", "author": "Matthew Honnibal", - "tags": [ "big data", "gensim" ] + "tags": ["big data", "gensim"] }, "Building your bot's brain with Node.js and spaCy": { "url": "https://explosion.ai/blog/chatbot-node-js-spacy", "author": "Wah Loon Keng", - "tags": [ "bots", "node.js" ] + "tags": ["bots", "node.js"] }, "An intent classifier with spaCy": { "url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/", "author": "Musio", - "tags": [ "bots", "keras" ] + "tags": ["bots", "keras"] }, "Visual Question Answering with spaCy": { "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook", "author": "Aaditya Prakash", - "tags": [ "vqa", "keras" ] + "tags": ["vqa", "keras"] } }, @@ -315,22 +314,22 @@ "Information extraction": { "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py", "author": "Matthew Honnibal", - "tags": [ "snippet" ] + "tags": ["snippet"] }, "Neural bag of words": { "url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py", "author": "Matthew Honnibal", - "tags": [ "sentiment" ] + "tags": ["sentiment"] }, "Part-of-speech tagging": { "url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py", "author": "Matthew Honnibal", - "tags": [ "pos" ] + "tags": ["pos"] }, "Parallel parse": { "url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py", "author": "Matthew Honnibal", - "tags": [ "big data" ] + "tags": ["big data"] }, "Inventory count": { "url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count", @@ -339,8 +338,8 @@ "Multi-word matches": { "url": 
"https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py", "author": "Matthew Honnibal", - "tags": [ "matcher", "out of date" ] + "tags": ["matcher", "out of date"] } } } -} +} \ No newline at end of file From 651bf411e0db70c6a25c009f255922db2303a2f0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 26 Jan 2017 13:48:38 +0100 Subject: [PATCH 30/30] Add tutorial --- website/docs/usage/_data.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 8bf5bfc98..9681cb6ea 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -307,6 +307,11 @@ "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook", "author": "Aaditya Prakash", "tags": ["vqa", "keras"] + }, + "Extracting time suggestions from emails with spaCy": { + "url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2", + "author": "Chris Savvopoulos", + "tags": ["ner"] } },