diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a55f98646..e97a7ea16 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,27 +1,19 @@ ## Description - + + -## Motivation and Context - - - -## How Has This Been Tested? - - - ## Types of changes -- [ ] Bug fix (non-breaking change fixing an issue) -- [ ] New feature (non-breaking change adding functionality to spaCy) -- [ ] Breaking change (fix or feature causing change to spaCy's existing functionality) -- [ ] Documentation (Addition to documentation of spaCy) +- [ ] **Bug fix** (non-breaking change fixing an issue) +- [ ] **New feature** (non-breaking change adding functionality to spaCy) +- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) +- [ ] **Documentation** (addition to documentation of spaCy) ## Checklist: -- [ ] My code follows spaCy's code style. - [ ] My change requires a change to spaCy's documentation. - [ ] I have updated the documentation accordingly. - [ ] I have added tests to cover my changes. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2138bc5dd..9120c885f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -76,7 +76,7 @@ Next, create a test file named `test_issue[ISSUE NUMBER].py` in the [`spacy/test ## Adding tests -spaCy uses [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/spacy/tests/tokenizer`](spacy/tests/tokenizer). To be interpreted and run, all test files and test functions need to be prefixed with `test_`. +spaCy uses the [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/spacy/tests/tokenizer`](spacy/tests/tokenizer). To be interpreted and run, all test files and test functions need to be prefixed with `test_`. When adding tests, make sure to use descriptive names, keep the code short and concise and only test for one behaviour at a time. Try to `parametrize` test cases wherever possible, use our pre-defined fixtures for spaCy components and avoid unnecessary imports. diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9c34ed174..abe70e767 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -24,6 +24,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx) * Oleg Zd, [@olegzd](https://github.com/olegzd) * Pokey Rule, [@pokey](https://github.com/pokey) +* Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort) * Sam Bozek, [@sambozek](https://github.com/sambozek) * Sasho Savkov [@savkov](https://github.com/savkov) diff --git a/README.rst b/README.rst index becec99ee..aa46cdad1 100644 --- a/README.rst +++ b/README.rst @@ -54,7 +54,7 @@ released under the MIT license. | **Usage questions**   | `StackOverflow `_, `Reddit usergroup                     | | | `_, `Gitter chat `_ | +---------------------------+------------------------------------------------------------------------------------------------------------+ -| **General discussion** |  `Reddit usergroup `_, | +| **General discussion** | `Reddit usergroup `_, | | | `Gitter chat `_  | +---------------------------+------------------------------------------------------------------------------------------------------------+ | **Commercial support** | contact@explosion.ai                                                                                     | diff --git a/spacy/about.py b/spacy/about.py index 03b57a1df..d51dea286 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -5,10 +5,10 @@ __title__ = 'spacy' __version__ = '1.6.0' -__summary__ = 'Industrial-strength NLP' +__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' -__email__ = 'matt@spacy.io' +__email__ = 'matt@explosion.ai' __license__ = 'MIT' __models__ = { 'en': 'en>=1.1.0,<1.2.0', diff --git a/spacy/download.py b/spacy/download.py index af15e1867..694638149 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -1,6 +1,7 @@ from __future__ import print_function import sys +import shutil import sputnik from sputnik.package_list import (PackageNotFoundException, diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 36bb0d7f0..c562b38ee 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -7,7 +7,7 @@ from ..language_data import PRON_LEMMA EXC = {} -EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "were", "Were", "Well", "well", "Whore", "whore"] +EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell", "were", "Were", "Well", "well", "Whore", "whore"] # Pronouns diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py index 81584b926..c119997b5 100644 --- a/spacy/fr/__init__.py +++ b/spacy/fr/__init__.py @@ -1,12 +1,11 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from os import path - from ..language import Language from ..attrs import LANG from .language_data import * +from .punctuation import TOKENIZER_INFIXES class French(Language): @@ -18,3 +17,4 @@ class French(Language): tokenizer_exceptions = TOKENIZER_EXCEPTIONS stop_words = STOP_WORDS + infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py index bbbeb1535..6eff8364f 100644 --- a/spacy/fr/language_data.py +++ b/spacy/fr/language_data.py @@ -4,6 +4,9 @@ from __future__ import unicode_literals from .. import language_data as base from ..language_data import strings_to_exc, update_exc +from .punctuation import ELISION + +from ..symbols import * from .stop_words import STOP_WORDS @@ -13,5 +16,53 @@ STOP_WORDS = set(STOP_WORDS) TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) +ABBREVIATIONS = { + "janv.": [ + {LEMMA: "janvier", ORTH: "janv."} + ], + "févr.": [ + {LEMMA: "février", ORTH: "févr."} + ], + "avr.": [ + {LEMMA: "avril", ORTH: "avr."} + ], + "juill.": [ + {LEMMA: "juillet", ORTH: "juill."} + ], + "sept.": [ + {LEMMA: "septembre", ORTH: "sept."} + ], + "oct.": [ + {LEMMA: "octobre", ORTH: "oct."} + ], + "nov.": [ + {LEMMA: "novembre", ORTH: "nov."} + ], + "déc.": [ + {LEMMA: "décembre", ORTH: "déc."} + ], +} + + +INFIXES_EXCEPTIONS_BASE = ["aujourd'hui", + "prud'homme", "prud'hommes", + "prud'homal", "prud'homaux", "prud'homale", + "prud'homales", + "prud'hommal", "prud'hommaux", "prud'hommale", + "prud'hommales", + "prud'homie", "prud'homies", + "prud'hommesque", "prud'hommesques", + "prud'hommesquement"] + +INFIXES_EXCEPTIONS = [] +for elision_char in ELISION: + INFIXES_EXCEPTIONS += [infix.replace("'", elision_char) + for infix in INFIXES_EXCEPTIONS_BASE] + +INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS] + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS)) +update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS) + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/fr/punctuation.py b/spacy/fr/punctuation.py new file mode 100644 index 000000000..ee4e5a861 --- /dev/null +++ b/spacy/fr/punctuation.py @@ -0,0 +1,16 @@ +# encoding: utf8 + +from __future__ import unicode_literals + +from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES + + +_ELISION = " ' ’ " +ELISION = _ELISION.strip().replace(' ', '').replace('\n', '') + +TOKENIZER_INFIXES += [ + r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION), +] + + +__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/__init__.py b/spacy/hu/__init__.py index 652ea379c..a923c68e8 100644 --- a/spacy/hu/__init__.py +++ b/spacy/hu/__init__.py @@ -1,7 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals, print_function -from spacy.hu.tokenizer_exceptions import TOKEN_MATCH +from .tokenizer_exceptions import TOKEN_MATCH from .language_data import * from ..attrs import LANG from ..language import Language diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 0f30c1136..1e7d87a7d 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -108,11 +108,12 @@ cpdef bint like_url(unicode string): # TODO: This should live in the language.orth -NUM_WORDS = set('zero one two three four five six seven eight nine ten' - 'eleven twelve thirteen fourteen fifteen sixteen seventeen' - 'eighteen nineteen twenty thirty forty fifty sixty seventy' - 'eighty ninety hundred thousand million billion trillion' - 'quadrillion gajillion bazillion'.split()) +NUM_WORDS = set(''' +zero one two three four five six seven eight nine ten eleven twelve thirteen +fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty +sixty seventy eighty ninety hundred thousand million billion trillion +quadrillion gajillion bazillion +'''.split()) cpdef bint like_number(unicode string): string = string.replace(',', '') string = string.replace('.', '') diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 7c73c79f9..489335153 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -2,7 +2,7 @@ # spaCy tests -spaCy uses [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). +spaCy uses the [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/tests/tokenizer`](tokenizer). All test modules (i.e. directories) also need to be listed in spaCy's [`setup.py`](../setup.py). To be interpreted and run, all test files and test functions need to be prefixed with `test_`. diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 72e93cf59..de7ecae9b 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -52,6 +52,11 @@ def de_tokenizer(): return German.Defaults.create_tokenizer() +@pytest.fixture +def fr_tokenizer(): + return French.Defaults.create_tokenizer() + + @pytest.fixture def hu_tokenizer(): return Hungarian.Defaults.create_tokenizer() diff --git a/spacy/tests/fr/__init__.py b/spacy/tests/fr/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/spacy/tests/fr/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/spacy/tests/fr/test_exceptions.py b/spacy/tests/fr/test_exceptions.py new file mode 100644 index 000000000..c633a9381 --- /dev/null +++ b/spacy/tests/fr/test_exceptions.py @@ -0,0 +1,30 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["aujourd'hui", "Aujourd'hui", "prud'hommes", + "prud’hommal"]) +def test_tokenizer_infix_exceptions(fr_tokenizer, text): + tokens = fr_tokenizer(text) + assert len(tokens) == 1 + + +@pytest.mark.parametrize('text,lemma', [("janv.", "janvier"), + ("juill.", "juillet"), + ("sept.", "septembre")]) +def test_tokenizer_handles_abbr(fr_tokenizer, text, lemma): + tokens = fr_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].lemma_ == lemma + + +def test_tokenizer_handles_exc_in_text(fr_tokenizer): + text = "Je suis allé au mois de janv. aux prud’hommes." + tokens = fr_tokenizer(text) + assert len(tokens) == 10 + assert tokens[6].text == "janv." + assert tokens[6].lemma_ == "janvier" + assert tokens[8].text == "prud’hommes" diff --git a/spacy/tests/fr/test_text.py b/spacy/tests/fr/test_text.py new file mode 100644 index 000000000..350fdae70 --- /dev/null +++ b/spacy/tests/fr/test_text.py @@ -0,0 +1,19 @@ +# encoding: utf8 + + +from __future__ import unicode_literals + + +def test_tokenizer_handles_long_text(fr_tokenizer): + text = """L'histoire du TAL commence dans les années 1950, bien que l'on puisse \ +trouver des travaux antérieurs. En 1950, Alan Turing éditait un article \ +célèbre sous le titre « Computing machinery and intelligence » qui propose ce \ +qu'on appelle à présent le test de Turing comme critère d'intelligence. \ +Ce critère dépend de la capacité d'un programme informatique de personnifier \ +un humain dans une conversation écrite en temps réel, de façon suffisamment \ +convaincante que l'interlocuteur humain ne peut distinguer sûrement — sur la \ +base du seul contenu de la conversation — s'il interagit avec un programme \ +ou avec un autre vrai humain.""" + + tokens = fr_tokenizer(text) + assert len(tokens) == 113 diff --git a/spacy/tests/regression/test_issue759.py b/spacy/tests/regression/test_issue759.py new file mode 100644 index 000000000..b7cf69f1a --- /dev/null +++ b/spacy/tests/regression/test_issue759.py @@ -0,0 +1,12 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True), + ("teneleven", False)]) +def test_issue759(en_tokenizer, text, is_num): + """Test that numbers are recognised correctly.""" + tokens = en_tokenizer(text) + assert tokens[0].like_num == is_num diff --git a/spacy/tests/regression/test_issue768.py b/spacy/tests/regression/test_issue768.py new file mode 100644 index 000000000..d8c8be80b --- /dev/null +++ b/spacy/tests/regression/test_issue768.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...language import Language +from ...attrs import LANG +from ...fr.language_data import TOKENIZER_EXCEPTIONS, STOP_WORDS +from ...language_data.punctuation import TOKENIZER_INFIXES, ALPHA + +import pytest + + +@pytest.fixture +def fr_tokenizer_w_infix(): + SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA) + + # create new Language subclass to add to default infixes + class French(Language): + lang = 'fr' + + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'fr' + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + infixes = TOKENIZER_INFIXES + [SPLIT_INFIX] + + return French.Defaults.create_tokenizer() + + +@pytest.mark.parametrize('text,expected_tokens', [("l'avion", ["l'", "avion"]), + ("j'ai", ["j'", "ai"])]) +def test_issue768(fr_tokenizer_w_infix, text, expected_tokens): + """Allow zero-width 'infix' token during the tokenization process.""" + tokens = fr_tokenizer_w_infix(text) + assert len(tokens) == 2 + assert [t.text for t in tokens] == expected_tokens diff --git a/spacy/tests/regression/test_issue775.py b/spacy/tests/regression/test_issue775.py new file mode 100644 index 000000000..fe1c89240 --- /dev/null +++ b/spacy/tests/regression/test_issue775.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize('text', ["Shell", "shell"]) +def test_issue775(en_tokenizer, text): + """Test that 'Shell' and 'shell' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e83c4a75..8f2f111e7 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -289,21 +289,18 @@ cdef class Tokenizer: infix_end = match.end() if infix_start == start: continue - if infix_start == infix_end: - msg = ("Tokenizer found a zero-width 'infix' token.\n" - "If you're using a built-in tokenizer, please\n" - "report this bug. If you're using a tokenizer\n" - "you developed, check your TOKENIZER_INFIXES\n" - "tuple.\n" - "String being matched: {string}\n" - "Language: {lang}") - raise ValueError(msg.format(string=string, lang=self.vocab.lang)) span = string[start:infix_start] tokens.push_back(self.vocab.get(tokens.mem, span), False) - - infix_span = string[infix_start:infix_end] - tokens.push_back(self.vocab.get(tokens.mem, infix_span), False) + + if infix_start != infix_end: + # If infix_start != infix_end, it means the infix + # token is non-empty. Empty infix tokens are useful + # for tokenization in some languages (see + # https://github.com/explosion/spaCy/issues/768) + infix_span = string[infix_start:infix_end] + tokens.push_back(self.vocab.get(tokens.mem, infix_span), False) + start = infix_end span = string[start:] tokens.push_back(self.vocab.get(tokens.mem, span), False) diff --git a/website/_harp.json b/website/_harp.json index 04a66f772..e315d658c 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,10 +12,10 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.5", + "SPACY_VERSION": "1.6", "LATEST_NEWS": { - "url": "https://explosion.ai/blog/spacy-user-survey", - "title": "The results of the spaCy user survey" + "url": "https://explosion.ai/blog/deep-learning-formula-nlp", + "title": "The new deep learning formula for state-of-the-art NLP models" }, "SOCIAL": { @@ -54,9 +54,9 @@ } }, - "V_CSS": "1.14", + "V_CSS": "1.15", "V_JS": "1.0", - "DEFAULT_SYNTAX" : "python", + "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", "MAILCHIMP": { "user": "spacy.us12", diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index bc8b85557..ea4d2964d 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -113,7 +113,7 @@ mixin gitter(button, label) //- Logo mixin logo() - +svg("graphics", "spacy", 500).o-logo&attributes(attributes) + +svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes) //- Landing diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index 7aaaef787..1be4b17d5 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -83,7 +83,7 @@ //- Logo .o-logo - @include size($logo-width, auto) + @include size($logo-width, $logo-height) fill: currentColor vertical-align: middle margin: 0 0.5rem diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index 6844517d7..9029161e2 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -11,6 +11,7 @@ $aside-width: 30vw $aside-padding: 25px $logo-width: 85px +$logo-height: 27px $grid: ( quarter: 4, third: 3, half: 2, two-thirds: 1.5, three-quarters: 1.33 ) $breakpoints: ( sm: 768px, md: 992px, lg: 1200px ) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index fab1dd86b..72fe34f8c 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -51,14 +51,14 @@ p A container for accessing linguistic annotations. +cell dict +cell | A dictionary that allows customisation of properties of - | #[code Token] chldren. + | #[code Token] children. +row +cell #[code user_span_hooks] +cell dict +cell | A dictionary that allows customisation of properties of - | #[code Span] chldren. + | #[code Span] children. +h(2, "init") Doc.__init__ +tag method diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index a07ee25d9..770ee3e9b 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -25,7 +25,7 @@ p A slice from a #[code Doc] object. +row +cell #[code start_char] +cell int - +cell The character offset for the end of the span. + +cell The character offset for the start of the span. +row +cell #[code end_char] diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 932abc99e..9681cb6ea 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -232,7 +232,7 @@ "NLP with spaCy in 10 lines of code": { "url": "https://github.com/cytora/pycon-nlp-in-10-lines", "author": "Andraz Hribernik et al. (Cytora)", - "tags": [ "jupyter" ] + "tags": ["jupyter"] }, "Intro to NLP with spaCy": { "url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/", @@ -241,7 +241,7 @@ "NLP with spaCy and IPython Notebook": { "url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/", "author": "Dustin Miller (SharePoint)", - "tags": [ "jupyter" ] + "tags": ["jupyter"] }, "Getting Started with spaCy": { "url": "http://textminingonline.com/getting-started-with-spacy", @@ -254,7 +254,7 @@ "NLP (almost) From Scratch - POS Network with spaCy": { "url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html", "author": "Sujit Pal", - "tags": [ "gensim", "keras" ] + "tags": ["gensim", "keras"] }, "NLP tasks with various libraries": { "url": "http://clarkgrubb.com/nlp", @@ -270,44 +270,48 @@ "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": { "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb", "author": "Patrick Harrison (S&P Global)", - "tags": [ "jupyter", "gensim" ] + "tags": ["jupyter", "gensim"] }, - "Deep Learning with custom pipelines and Keras": { "url": "https://explosion.ai/blog/spacy-deep-learning-keras", "author": "Matthew Honnibal", - "tags": [ "keras", "sentiment" ] + "tags": ["keras", "sentiment"] }, "A decomposable attention model for Natural Language Inference": { "url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment", "author": "Matthew Honnibal", - "tags": [ "keras", "similarity" ] + "tags": ["keras", "similarity"] }, "Using the German model": { "url": "https://explosion.ai/blog/german-model", "author": "Wolfgang Seeker", - "tags": [ "multi-lingual" ] + "tags": ["multi-lingual"] }, "Sense2vec with spaCy and Gensim": { "url": "https://explosion.ai/blog/sense2vec-with-spacy", "author": "Matthew Honnibal", - "tags": [ "big data", "gensim" ] + "tags": ["big data", "gensim"] }, "Building your bot's brain with Node.js and spaCy": { "url": "https://explosion.ai/blog/chatbot-node-js-spacy", "author": "Wah Loon Keng", - "tags": [ "bots", "node.js" ] + "tags": ["bots", "node.js"] }, "An intent classifier with spaCy": { "url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/", "author": "Musio", - "tags": [ "bots", "keras" ] + "tags": ["bots", "keras"] }, "Visual Question Answering with spaCy": { "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook", "author": "Aaditya Prakash", - "tags": [ "vqa", "keras" ] + "tags": ["vqa", "keras"] + }, + "Extracting time suggestions from emails with spaCy": { + "url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2", + "author": "Chris Savvopoulos", + "tags": ["ner"] } }, @@ -315,22 +319,22 @@ "Information extraction": { "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py", "author": "Matthew Honnibal", - "tags": [ "snippet" ] + "tags": ["snippet"] }, "Neural bag of words": { "url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py", "author": "Matthew Honnibal", - "tags": [ "sentiment" ] + "tags": ["sentiment"] }, "Part-of-speech tagging": { "url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py", "author": "Matthew Honnibal", - "tags": [ "pos" ] + "tags": ["pos"] }, "Parallel parse": { "url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py", "author": "Matthew Honnibal", - "tags": [ "big data" ] + "tags": ["big data"] }, "Inventory count": { "url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count", @@ -339,8 +343,8 @@ "Multi-word matches": { "url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py", "author": "Matthew Honnibal", - "tags": [ "matcher", "out of date" ] + "tags": ["matcher", "out of date"] } } } -} +} \ No newline at end of file diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index c1a03a14a..d43fb438f 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -26,6 +26,9 @@ p | #[+api("tokenizer") #[code Tokenizer]] instance: +code. + import spacy + from spacy.symbols import ORTH, LEMMA, POS + nlp = spacy.load('en') assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that'] nlp.tokenizer.add_special_case(u'gimme', @@ -37,7 +40,7 @@ p { ORTH: u'me'}]) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] - assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] + assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] p | The special case doesn't have to match an entire whitespace-delimited @@ -52,9 +55,9 @@ p | The special case rules have precedence over the punctuation splitting: +code. - nlp.tokenizer.add_special_case(u"...gimme...?", + nlp.tokenizer.add_special_case(u'...gimme...?', [{ - ORTH: u'...gimme...?", LEMMA: "give", TAG: "VB"}]) + ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) assert len(nlp(u'...gimme...?')) == 1 p diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index fde9ee4d7..7650e4a03 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -18,7 +18,9 @@ p Here's a minimal example. We first add a pattern that specifies three tokens: p | Once we've added the pattern, we can use the #[code matcher] as a - | callable, to receive a list of #[code (ent_id, start, end)] tuples: + | callable, to receive a list of #[code (ent_id, start, end)] tuples. + | Note that #[code LOWER] and #[code IS_PUNCT] are data attributes + | of #[code Matcher.attrs]. +code. from spacy.matcher import Matcher