mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' of ssh://github.com/explosion/spaCy
Commit: afd622fe04
@@ -1,27 +1,19 @@
 <!--- Provide a general summary of your changes in the Title -->
 
 ## Description
-<!--- Describe your changes -->
+<!--- Use this section to describe your changes and how they're affecting the code. -->
+<!-- If your changes required testing, include information about the testing environment and the tests you ran. -->
 
-## Motivation and Context
-<!--- Why is this change required? What problem does it solve? -->
-<!--- If fixing an open issue, please link to the issue here. -->
-
-## How Has This Been Tested?
-<!--- Please describe in detail your tests. Did you add new tests? -->
-<!--- Include details of your testing environment, and the tests you ran too -->
-<!--- How were other areas of the code affected? -->
-
 ## Types of changes
 <!--- What types of changes does your code introduce? Put an `x` in all applicable boxes.: -->
-- [ ] Bug fix (non-breaking change fixing an issue)
-- [ ] New feature (non-breaking change adding functionality to spaCy)
-- [ ] Breaking change (fix or feature causing change to spaCy's existing functionality)
-- [ ] Documentation (Addition to documentation of spaCy)
+- [ ] **Bug fix** (non-breaking change fixing an issue)
+- [ ] **New feature** (non-breaking change adding functionality to spaCy)
+- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality)
+- [ ] **Documentation** (addition to documentation of spaCy)
 
 ## Checklist:
 <!--- Go over all the following points, and put an `x` in all applicable boxes.: -->
 - [ ] My code follows spaCy's code style.
 - [ ] My change requires a change to spaCy's documentation.
 - [ ] I have updated the documentation accordingly.
 - [ ] I have added tests to cover my changes.
@@ -76,7 +76,7 @@ Next, create a test file named `test_issue[ISSUE NUMBER].py` in the [`spacy/test
 ## Adding tests
 
-spaCy uses [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/spacy/tests/tokenizer`](spacy/tests/tokenizer). To be interpreted and run, all test files and test functions need to be prefixed with `test_`.
+spaCy uses the [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html). Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/spacy/tests/tokenizer`](spacy/tests/tokenizer). To be interpreted and run, all test files and test functions need to be prefixed with `test_`.
 
 When adding tests, make sure to use descriptive names, keep the code short and concise and only test for one behaviour at a time. Try to `parametrize` test cases wherever possible, use our pre-defined fixtures for spaCy components and avoid unnecessary imports.
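For reference, a minimal sketch of the style this advice is aiming at: a descriptive name, one behaviour per test, `parametrize`d inputs and a pre-defined fixture (here `en_tokenizer`, which the shared `spacy/tests/conftest.py` provides):

    import pytest


    @pytest.mark.parametrize('text', ["can't", "won't"])
    def test_en_tokenizer_splits_contractions(en_tokenizer, text):
        # One behaviour only: contractions split into exactly two tokens.
        tokens = en_tokenizer(text)
        assert len(tokens) == 2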
@@ -24,6 +24,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
 * Oleg Zd, [@olegzd](https://github.com/olegzd)
 * Pokey Rule, [@pokey](https://github.com/pokey)
+* Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202)
 * Rob van Nieuwpoort, [@RvanNieuwpoort](https://github.com/RvanNieuwpoort)
 * Sam Bozek, [@sambozek](https://github.com/sambozek)
 * Sasho Savkov [@savkov](https://github.com/savkov)
@@ -54,7 +54,7 @@ released under the MIT license.
 | **Usage questions**      | `StackOverflow <http://stackoverflow.com/questions/tagged/spacy>`_, `Reddit usergroup                        |
 |                          | <https://www.reddit.com/r/spacynlp>`_, `Gitter chat <https://gitter.im/explosion/spaCy>`_                    |
 +---------------------------+------------------------------------------------------------------------------------------------------------+
-| **General discussion**   | `Reddit usergroup <https://www.reddit.com/r/spacynlp>`_,                                                     |
+| **General discussion**   | `Reddit usergroup <https://www.reddit.com/r/spacynlp>`_,                                                     |
 |                          | `Gitter chat <https://gitter.im/explosion/spaCy>`_                                                           |
 +---------------------------+------------------------------------------------------------------------------------------------------------+
 | **Commercial support**   | contact@explosion.ai                                                                                         |
@@ -5,10 +5,10 @@
 __title__ = 'spacy'
 __version__ = '1.6.0'
-__summary__ = 'Industrial-strength NLP'
+__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
-__email__ = 'matt@spacy.io'
+__email__ = 'matt@explosion.ai'
 __license__ = 'MIT'
 __models__ = {
     'en': 'en>=1.1.0,<1.2.0',
@@ -1,6 +1,7 @@
 from __future__ import print_function
 
 import sys
+import shutil
 
 import sputnik
 from sputnik.package_list import (PackageNotFoundException,
@@ -7,7 +7,7 @@ from ..language_data import PRON_LEMMA
 EXC = {}
 
-EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "were", "Were", "Well", "well", "Whore", "whore"]
+EXCLUDE_EXC = ["Ill", "ill", "Its", "its", "Hell", "hell", "Shell", "shell", "were", "Were", "Well", "well", "Whore", "whore"]
 
 
 # Pronouns
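The two new entries ("Shell"/"shell") are easiest to understand through their effect on the tokenizer; a quick check that mirrors the regression test `test_issue775` added later in this commit (assuming the 1.6-era English defaults are importable):

    from spacy.en import English

    tokenizer = English.Defaults.create_tokenizer()
    # Excluded words stay single tokens instead of being expanded like a contraction,
    assert len(tokenizer(u"Shell")) == 1
    assert len(tokenizer(u"shell")) == 1
    # whereas the real contraction still splits into two tokens.
    assert len(tokenizer(u"She'll")) == 2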
@@ -1,12 +1,11 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
 from os import path
 
 from ..language import Language
 from ..attrs import LANG
 
 from .language_data import *
+from .punctuation import TOKENIZER_INFIXES
 
 
 class French(Language):
@@ -18,3 +17,4 @@ class French(Language):
 
         tokenizer_exceptions = TOKENIZER_EXCEPTIONS
         stop_words = STOP_WORDS
+        infixes = tuple(TOKENIZER_INFIXES)
@@ -4,6 +4,9 @@ from __future__ import unicode_literals
 from .. import language_data as base
 from ..language_data import strings_to_exc, update_exc
 
+from .punctuation import ELISION
+
+from ..symbols import *
 from .stop_words import STOP_WORDS
 
 
@@ -13,5 +16,53 @@ STOP_WORDS = set(STOP_WORDS)
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
+ABBREVIATIONS = {
+    "janv.": [
+        {LEMMA: "janvier", ORTH: "janv."}
+    ],
+    "févr.": [
+        {LEMMA: "février", ORTH: "févr."}
+    ],
+    "avr.": [
+        {LEMMA: "avril", ORTH: "avr."}
+    ],
+    "juill.": [
+        {LEMMA: "juillet", ORTH: "juill."}
+    ],
+    "sept.": [
+        {LEMMA: "septembre", ORTH: "sept."}
+    ],
+    "oct.": [
+        {LEMMA: "octobre", ORTH: "oct."}
+    ],
+    "nov.": [
+        {LEMMA: "novembre", ORTH: "nov."}
+    ],
+    "déc.": [
+        {LEMMA: "décembre", ORTH: "déc."}
+    ],
+}
+
+
+INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
+                           "prud'homme", "prud'hommes",
+                           "prud'homal", "prud'homaux", "prud'homale",
+                           "prud'homales",
+                           "prud'hommal", "prud'hommaux", "prud'hommale",
+                           "prud'hommales",
+                           "prud'homie", "prud'homies",
+                           "prud'hommesque", "prud'hommesques",
+                           "prud'hommesquement"]
+
+INFIXES_EXCEPTIONS = []
+for elision_char in ELISION:
+    INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
+                           for infix in INFIXES_EXCEPTIONS_BASE]
+
+INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
+update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
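A quick way to sanity-check the new abbreviation and elision entries from a Python shell, mirroring the `fr_tokenizer` fixture and tests added further down in this commit (assumes the 1.6-era code base):

    from spacy.fr import French

    tokenizer = French.Defaults.create_tokenizer()
    assert len(tokenizer(u"aujourd'hui")) == 1   # infix exception keeps the elided form together
    janv = tokenizer(u"janv.")
    assert janv[0].lemma_ == u"janvier"          # lemma comes from the ABBREVIATIONS entry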
@@ -0,0 +1,16 @@
+# encoding: utf8
+
+from __future__ import unicode_literals
+
+from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES
+
+
+_ELISION = " ' ’ "
+ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')
+
+TOKENIZER_INFIXES += [
+    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
+]
+
+
+__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
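To see what the added infix pattern matches, here is the same regex exercised with the standard `re` module; `ALPHA` is simplified to ASCII letters for the illustration (spaCy's real `ALPHA` class covers far more characters):

    import re

    ALPHA = "a-zA-Z"                     # simplified stand-in for spaCy's ALPHA
    ELISION = "'’"
    infix = r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION)

    # The pattern is zero-width: it matches the empty position right after an
    # elided article, which lets the tokenizer split "l'avion" into "l'" + "avion".
    match = re.search(infix, "l'avion")
    assert match is not None and match.start() == match.end() == 2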
@@ -1,7 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from spacy.hu.tokenizer_exceptions import TOKEN_MATCH
+from .tokenizer_exceptions import TOKEN_MATCH
 from .language_data import *
 from ..attrs import LANG
 from ..language import Language
@@ -108,11 +108,12 @@ cpdef bint like_url(unicode string):
 
 
 # TODO: This should live in the language.orth
-NUM_WORDS = set('zero one two three four five six seven eight nine ten'
-                'eleven twelve thirteen fourteen fifteen sixteen seventeen'
-                'eighteen nineteen twenty thirty forty fifty sixty seventy'
-                'eighty ninety hundred thousand million billion trillion'
-                'quadrillion gajillion bazillion'.split())
+NUM_WORDS = set('''
+zero one two three four five six seven eight nine ten eleven twelve thirteen
+fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty
+sixty seventy eighty ninety hundred thousand million billion trillion
+quadrillion gajillion bazillion
+'''.split())
 cpdef bint like_number(unicode string):
     string = string.replace(',', '')
     string = string.replace('.', '')
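The reason for the rewrite is easy to reproduce with plain string literals: adjacent literals concatenate with no separator, so the old line breaks silently glued words together (which is exactly what the regression test `test_issue759` below checks for):

    old = ('zero one two three four five six seven eight nine ten'
           'eleven twelve thirteen')
    assert 'teneleven' in old.split()    # "ten" and "eleven" were lost at the line join

    new = '''
    zero one two three four five six seven eight nine ten eleven twelve thirteen
    '''.split()
    assert 'ten' in new and 'eleven' in new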
@@ -2,7 +2,7 @@
 
 # spaCy tests
 
-spaCy uses [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html).
+spaCy uses the [pytest](http://doc.pytest.org/) framework for testing. For more info on this, see the [pytest documentation](http://docs.pytest.org/en/latest/contents.html).
 
 Tests for spaCy modules and classes live in their own directories of the same name. For example, tests for the `Tokenizer` can be found in [`/tests/tokenizer`](tokenizer). All test modules (i.e. directories) also need to be listed in spaCy's [`setup.py`](../setup.py). To be interpreted and run, all test files and test functions need to be prefixed with `test_`.
@@ -52,6 +52,11 @@ def de_tokenizer():
     return German.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def fr_tokenizer():
+    return French.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def hu_tokenizer():
     return Hungarian.Defaults.create_tokenizer()
@@ -0,0 +1 @@
+# coding: utf-8
@@ -0,0 +1,30 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["aujourd'hui", "Aujourd'hui", "prud'hommes",
+                                  "prud’hommal"])
+def test_tokenizer_infix_exceptions(fr_tokenizer, text):
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize('text,lemma', [("janv.", "janvier"),
+                                        ("juill.", "juillet"),
+                                        ("sept.", "septembre")])
+def test_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].lemma_ == lemma
+
+
+def test_tokenizer_handles_exc_in_text(fr_tokenizer):
+    text = "Je suis allé au mois de janv. aux prud’hommes."
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 10
+    assert tokens[6].text == "janv."
+    assert tokens[6].lemma_ == "janvier"
+    assert tokens[8].text == "prud’hommes"
@@ -0,0 +1,19 @@
+# encoding: utf8
+
+
+from __future__ import unicode_literals
+
+
+def test_tokenizer_handles_long_text(fr_tokenizer):
+    text = """L'histoire du TAL commence dans les années 1950, bien que l'on puisse \
+trouver des travaux antérieurs. En 1950, Alan Turing éditait un article \
+célèbre sous le titre « Computing machinery and intelligence » qui propose ce \
+qu'on appelle à présent le test de Turing comme critère d'intelligence. \
+Ce critère dépend de la capacité d'un programme informatique de personnifier \
+un humain dans une conversation écrite en temps réel, de façon suffisamment \
+convaincante que l'interlocuteur humain ne peut distinguer sûrement — sur la \
+base du seul contenu de la conversation — s'il interagit avec un programme \
+ou avec un autre vrai humain."""
+
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 113
@@ -0,0 +1,12 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True),
+                                         ("teneleven", False)])
+def test_issue759(en_tokenizer, text, is_num):
+    """Test that numbers are recognised correctly."""
+    tokens = en_tokenizer(text)
+    assert tokens[0].like_num == is_num
@@ -0,0 +1,36 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...language import Language
+from ...attrs import LANG
+from ...fr.language_data import TOKENIZER_EXCEPTIONS, STOP_WORDS
+from ...language_data.punctuation import TOKENIZER_INFIXES, ALPHA
+
+import pytest
+
+
+@pytest.fixture
+def fr_tokenizer_w_infix():
+    SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA)
+
+    # create new Language subclass to add to default infixes
+    class French(Language):
+        lang = 'fr'
+
+        class Defaults(Language.Defaults):
+            lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+            lex_attr_getters[LANG] = lambda text: 'fr'
+            tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+            stop_words = STOP_WORDS
+            infixes = TOKENIZER_INFIXES + [SPLIT_INFIX]
+
+    return French.Defaults.create_tokenizer()
+
+
+@pytest.mark.parametrize('text,expected_tokens', [("l'avion", ["l'", "avion"]),
+                                                  ("j'ai", ["j'", "ai"])])
+def test_issue768(fr_tokenizer_w_infix, text, expected_tokens):
+    """Allow zero-width 'infix' token during the tokenization process."""
+    tokens = fr_tokenizer_w_infix(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["Shell", "shell"])
+def test_issue775(en_tokenizer, text):
+    """Test that 'Shell' and 'shell' are excluded from the contractions
+    generated by the English tokenizer exceptions."""
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].text == text
@@ -289,21 +289,18 @@ cdef class Tokenizer:
                 infix_end = match.end()
                 if infix_start == start:
                     continue
-                if infix_start == infix_end:
-                    msg = ("Tokenizer found a zero-width 'infix' token.\n"
-                           "If you're using a built-in tokenizer, please\n"
-                           "report this bug. If you're using a tokenizer\n"
-                           "you developed, check your TOKENIZER_INFIXES\n"
-                           "tuple.\n"
-                           "String being matched: {string}\n"
-                           "Language: {lang}")
-                    raise ValueError(msg.format(string=string, lang=self.vocab.lang))
 
                 span = string[start:infix_start]
                 tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
-                infix_span = string[infix_start:infix_end]
-                tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+                if infix_start != infix_end:
+                    # If infix_start != infix_end, it means the infix
+                    # token is non-empty. Empty infix tokens are useful
+                    # for tokenization in some languages (see
+                    # https://github.com/explosion/spaCy/issues/768)
+                    infix_span = string[infix_start:infix_end]
+                    tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
 
                 start = infix_end
             span = string[start:]
             tokens.push_back(self.vocab.get(tokens.mem, span), False)
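The behavioural change is easier to see outside Cython. A pure-Python sketch of the same splitting loop (the regex mirrors the French elision infix added in this commit; spaCy's real `ALPHA` class is much broader than `a-zA-Z`):

    import re

    def split_on_infixes(string, infix_pattern):
        # A zero-width match (start == end) produces no infix token of its own,
        # but the text before it is still flushed, so the elision rule can turn
        # "l'avion" into "l'" + "avion" instead of raising an error.
        tokens, start = [], 0
        for match in re.finditer(infix_pattern, string):
            if match.start() > start:
                tokens.append(string[start:match.start()])
            if match.start() != match.end():            # non-empty infix, e.g. a hyphen
                tokens.append(string[match.start():match.end()])
            start = match.end()
        if start < len(string):
            tokens.append(string[start:])
        return tokens

    elision = r"(?<=[a-zA-Z]['’])(?=[a-zA-Z])"
    assert split_on_infixes("l'avion", elision) == ["l'", "avion"]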
@@ -12,10 +12,10 @@
     "COMPANY_URL": "https://explosion.ai",
     "DEMOS_URL": "https://demos.explosion.ai",
 
-    "SPACY_VERSION": "1.5",
+    "SPACY_VERSION": "1.6",
     "LATEST_NEWS": {
-        "url": "https://explosion.ai/blog/spacy-user-survey",
-        "title": "The results of the spaCy user survey"
+        "url": "https://explosion.ai/blog/deep-learning-formula-nlp",
+        "title": "The new deep learning formula for state-of-the-art NLP models"
     },
 
     "SOCIAL": {
@@ -54,9 +54,9 @@
         }
     },
 
-    "V_CSS": "1.14",
+    "V_CSS": "1.15",
     "V_JS": "1.0",
-    "DEFAULT_SYNTAX" : "python",
+    "DEFAULT_SYNTAX": "python",
     "ANALYTICS": "UA-58931649-1",
     "MAILCHIMP": {
         "user": "spacy.us12",
@@ -113,7 +113,7 @@ mixin gitter(button, label)
 //- Logo
 
 mixin logo()
-    +svg("graphics", "spacy", 500).o-logo&attributes(attributes)
+    +svg("graphics", "spacy", 675, 215).o-logo&attributes(attributes)
 
 
 //- Landing
@@ -83,7 +83,7 @@
 //- Logo
 
 .o-logo
-    @include size($logo-width, auto)
+    @include size($logo-width, $logo-height)
     fill: currentColor
     vertical-align: middle
     margin: 0 0.5rem
@@ -11,6 +11,7 @@ $aside-width: 30vw
 $aside-padding: 25px
 
 $logo-width: 85px
+$logo-height: 27px
 
 $grid: ( quarter: 4, third: 3, half: 2, two-thirds: 1.5, three-quarters: 1.33 )
 $breakpoints: ( sm: 768px, md: 992px, lg: 1200px )
@@ -51,14 +51,14 @@ p A container for accessing linguistic annotations.
         +cell dict
         +cell
             | A dictionary that allows customisation of properties of
-            | #[code Token] chldren.
+            | #[code Token] children.
 
     +row
         +cell #[code user_span_hooks]
         +cell dict
         +cell
             | A dictionary that allows customisation of properties of
-            | #[code Span] chldren.
+            | #[code Span] children.
 
 +h(2, "init") Doc.__init__
     +tag method
@@ -25,7 +25,7 @@ p A slice from a #[code Doc] object.
     +row
         +cell #[code start_char]
         +cell int
-        +cell The character offset for the end of the span.
+        +cell The character offset for the start of the span.
 
     +row
         +cell #[code end_char]
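A small illustration of the corrected attribute (assumes an English model is installed and loadable as `'en'`):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Hello beautiful world')
    span = doc[1:3]                          # "beautiful world"
    assert span.start_char == 6              # offset of the span's first character
    assert span.end_char == len(doc.text)    # offset one past the span's last character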
@@ -232,7 +232,7 @@
     "NLP with spaCy in 10 lines of code": {
         "url": "https://github.com/cytora/pycon-nlp-in-10-lines",
         "author": "Andraz Hribernik et al. (Cytora)",
-        "tags": [ "jupyter" ]
+        "tags": ["jupyter"]
     },
     "Intro to NLP with spaCy": {
         "url": "https://nicschrading.com/project/Intro-to-NLP-with-spaCy/",
@@ -241,7 +241,7 @@
     "NLP with spaCy and IPython Notebook": {
         "url": "http://blog.sharepointexperience.com/2016/01/nlp-and-sharepoint-part-1/",
         "author": "Dustin Miller (SharePoint)",
-        "tags": [ "jupyter" ]
+        "tags": ["jupyter"]
     },
     "Getting Started with spaCy": {
         "url": "http://textminingonline.com/getting-started-with-spacy",
@@ -254,7 +254,7 @@
     "NLP (almost) From Scratch - POS Network with spaCy": {
         "url": "http://sujitpal.blogspot.de/2016/07/nlp-almost-from-scratch-implementing.html",
         "author": "Sujit Pal",
-        "tags": [ "gensim", "keras" ]
+        "tags": ["gensim", "keras"]
     },
     "NLP tasks with various libraries": {
         "url": "http://clarkgrubb.com/nlp",
@@ -270,44 +270,48 @@
     "Modern NLP in Python – What you can learn about food by analyzing a million Yelp reviews": {
         "url": "http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb",
         "author": "Patrick Harrison (S&P Global)",
-        "tags": [ "jupyter", "gensim" ]
+        "tags": ["jupyter", "gensim"]
     },
 
     "Deep Learning with custom pipelines and Keras": {
         "url": "https://explosion.ai/blog/spacy-deep-learning-keras",
         "author": "Matthew Honnibal",
-        "tags": [ "keras", "sentiment" ]
+        "tags": ["keras", "sentiment"]
     },
     "A decomposable attention model for Natural Language Inference": {
         "url": "https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment",
         "author": "Matthew Honnibal",
-        "tags": [ "keras", "similarity" ]
+        "tags": ["keras", "similarity"]
     },
 
     "Using the German model": {
         "url": "https://explosion.ai/blog/german-model",
         "author": "Wolfgang Seeker",
-        "tags": [ "multi-lingual" ]
+        "tags": ["multi-lingual"]
     },
     "Sense2vec with spaCy and Gensim": {
         "url": "https://explosion.ai/blog/sense2vec-with-spacy",
         "author": "Matthew Honnibal",
-        "tags": [ "big data", "gensim" ]
+        "tags": ["big data", "gensim"]
     },
     "Building your bot's brain with Node.js and spaCy": {
         "url": "https://explosion.ai/blog/chatbot-node-js-spacy",
         "author": "Wah Loon Keng",
-        "tags": [ "bots", "node.js" ]
+        "tags": ["bots", "node.js"]
     },
     "An intent classifier with spaCy": {
         "url": "http://blog.themusio.com/2016/07/18/musios-intent-classifier-2/",
         "author": "Musio",
-        "tags": [ "bots", "keras" ]
+        "tags": ["bots", "keras"]
     },
     "Visual Question Answering with spaCy": {
         "url": "http://iamaaditya.github.io/2016/04/visual_question_answering_demo_notebook",
         "author": "Aaditya Prakash",
-        "tags": [ "vqa", "keras" ]
+        "tags": ["vqa", "keras"]
     },
+    "Extracting time suggestions from emails with spaCy": {
+        "url": "https://medium.com/redsift-outbox/what-time-cc9ce0c2aed2",
+        "author": "Chris Savvopoulos",
+        "tags": ["ner"]
+    }
 },
 
@@ -315,22 +319,22 @@
     "Information extraction": {
         "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
         "author": "Matthew Honnibal",
-        "tags": [ "snippet" ]
+        "tags": ["snippet"]
     },
     "Neural bag of words": {
         "url": "https://github.com/explosion/spaCy/blob/master/examples/nn_text_class.py",
         "author": "Matthew Honnibal",
-        "tags": [ "sentiment" ]
+        "tags": ["sentiment"]
     },
     "Part-of-speech tagging": {
         "url": "https://github.com/explosion/spaCy/blob/master/examples/pos_tag.py",
         "author": "Matthew Honnibal",
-        "tags": [ "pos" ]
+        "tags": ["pos"]
     },
     "Parallel parse": {
         "url": "https://github.com/explosion/spaCy/blob/master/examples/parallel_parse.py",
         "author": "Matthew Honnibal",
-        "tags": [ "big data" ]
+        "tags": ["big data"]
     },
     "Inventory count": {
         "url": "https://github.com/explosion/spaCy/tree/master/examples/inventory_count",
@@ -339,8 +343,8 @@
     "Multi-word matches": {
         "url": "https://github.com/explosion/spaCy/blob/master/examples/multi_word_matches.py",
         "author": "Matthew Honnibal",
-        "tags": [ "matcher", "out of date" ]
+        "tags": ["matcher", "out of date"]
     }
 }
 }
 }
 }
@@ -26,6 +26,9 @@ p
     | #[+api("tokenizer") #[code Tokenizer]] instance:
 
 +code.
     import spacy
     from spacy.symbols import ORTH, LEMMA, POS
 
     nlp = spacy.load('en')
+    assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
     nlp.tokenizer.add_special_case(u'gimme',
@@ -37,7 +40,7 @@ p
         {
             ORTH: u'me'}])
     assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
-    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
+    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
 
 p
     | The special case doesn't have to match an entire whitespace-delimited
@@ -52,9 +55,9 @@ p
     | The special case rules have precedence over the punctuation splitting:
 
 +code.
-    nlp.tokenizer.add_special_case(u"...gimme...?",
+    nlp.tokenizer.add_special_case(u'...gimme...?',
         [{
-            ORTH: u'...gimme...?", LEMMA: "give", TAG: "VB"}])
+            ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
     assert len(nlp(u'...gimme...?')) == 1
 
 p
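For convenience, the example this page builds up, pieced together into one runnable sketch (spaCy 1.x-era API as documented here; the attributes chosen for `u'gim'` are an illustration consistent with the asserts above, and an English model is assumed to be installed):

    import spacy
    from spacy.symbols import ORTH, LEMMA

    nlp = spacy.load('en')
    assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']

    # Register a special case: "gimme" is split into two tokens with their own lemmas.
    nlp.tokenizer.add_special_case(u'gimme',
        [{ORTH: u'gim', LEMMA: u'give'},
         {ORTH: u'me'}])
    assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']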
@@ -18,7 +18,9 @@ p Here's a minimal example. We first add a pattern that specifies three tokens:
 
 p
     | Once we've added the pattern, we can use the #[code matcher] as a
-    | callable, to receive a list of #[code (ent_id, start, end)] tuples:
+    | callable, to receive a list of #[code (ent_id, start, end)] tuples.
+    | Note that #[code LOWER] and #[code IS_PUNCT] are data attributes
+    | of #[code Matcher.attrs].
 
 +code.
     from spacy.matcher import Matcher
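A hedged sketch of that minimal example, using the 1.x-era `Matcher` API this page describes (`add_pattern` and the attribute IDs are assumptions based on that API; the match tuple is unpacked defensively since only `ent_id`, `start` and `end` are relied on):

    import spacy
    from spacy.matcher import Matcher
    from spacy.attrs import LOWER, IS_PUNCT

    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)
    # A pattern of three token specs: "hello", any punctuation token, "world".
    matcher.add_pattern("HelloWorld",
                        [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])

    doc = nlp(u'Hello, world!')
    for match in matcher(doc):
        ent_id, start, end = match[0], match[-2], match[-1]
        print(doc[start:end].text)           # "Hello, world"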