mirror of https://github.com/explosion/spaCy.git
Merge branch 'pr/1024' into develop
commit 0ee2a22b67
@@ -4,19 +4,14 @@ from __future__ import unicode_literals

from . import util
from .util import prints
from .deprecated import resolve_model_name
import importlib
from .cli.info import info
from .glossary import explain

from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja

_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French,
              it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish,
              fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese)

for _lang in _languages:
    util.set_lang_class(_lang.lang, _lang)

_languages_name = set(["en", "de", "es", "pt", "fr",
                       "it", "hu", "zh", "nl", "sv",
                       "fi", "bn", "he", "nb", "ja"])


def load(name, **overrides):

@@ -35,7 +30,8 @@ def load(name, **overrides):
    model_name = ''
    meta = util.parse_package_meta(data_path, model_name, require=False)
    lang = meta['lang'] if meta and 'lang' in meta else name
    cls = util.get_lang_class(lang)
    module = importlib.import_module("."+lang, "spacy")
    cls = module.EXPORT
    overrides['meta'] = meta
    overrides['path'] = model_path
    return cls(**overrides)
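With the eager per-language imports removed, load() now resolves the language package at runtime and instantiates whatever class the module exposes as EXPORT. A minimal standalone sketch of that lookup (the helper name resolve_lang_class is invented for illustration):

    import importlib

    def resolve_lang_class(lang):
        # Import spacy.<lang> on demand and return the class it publishes
        # as EXPORT (e.g. French, German), mirroring the change to load().
        module = importlib.import_module("." + lang, "spacy")
        return module.EXPORT

    # resolve_lang_class("fr") would return the French class defined further
    # down in this diff, so resolve_lang_class("fr")() builds a French pipeline.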
@@ -22,3 +22,5 @@ class Bengali(Language):
        prefixes = tuple(TOKENIZER_PREFIXES)
        suffixes = tuple(TOKENIZER_SUFFIXES)
        infixes = tuple(TOKENIZER_INFIXES)

EXPORT = Bengali
@@ -7,6 +7,8 @@ from ..language import Language
from ..attrs import LANG

from .language_data import *
from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP


class German(Language):

@@ -20,3 +22,10 @@ class German(Language):
        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        tag_map = TAG_MAP
        stop_words = STOP_WORDS

        @classmethod
        def create_lemmatizer(cls, nlp=None):
            return Lemmatizer(LOOK_UP)


EXPORT = German
File diff suppressed because it is too large
@@ -32,3 +32,6 @@ class English(Language):
        # Special-case hack for loading the GloVe vectors, to support <1.0
        overrides = fix_glove_vectors_loading(overrides)
        Language.__init__(self, **overrides)


EXPORT = English
File diff suppressed because it is too large
@@ -7,7 +7,8 @@ from ..language import Language
from ..attrs import LANG

from .language_data import *

from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP

class Spanish(Language):
    lang = 'es'

@@ -19,3 +20,7 @@ class Spanish(Language):
        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        tag_map = TAG_MAP
        stop_words = STOP_WORDS



EXPORT = Spanish
File diff suppressed because it is too large
@@ -15,3 +15,6 @@ class Finnish(Language):

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS


EXPORT = Finnish
@@ -6,7 +6,8 @@ from ..attrs import LANG

from .language_data import *
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP

class FrenchDefaults(BaseDefaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)

@@ -22,8 +23,15 @@ class FrenchDefaults(BaseDefaults):
        cls.tokenizer_exceptions = get_tokenizer_exceptions()
        return super(FrenchDefaults, cls).create_tokenizer(nlp)

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOK_UP)


class French(Language):
    lang = 'fr'

    Defaults = FrenchDefaults


EXPORT = French
File diff suppressed because it is too large
@@ -117,26 +117,30 @@ def get_tokenizer_exceptions():
    for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
                             ("semble", "sembler"), ("indique", "indiquer"),
                             ("moque", "moquer"), ("passe", "passer")):
        for pronoun in ("elle", "il", "on"):
            token = "{}-t-{}".format(verb, pronoun)
        for orth in [verb,verb.title()]:
            for pronoun in ("elle", "il", "on"):
                token = "{}-t-{}".format(orth, pronoun)
                VERBS[token] = [
                    {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
                    {LEMMA: "t", ORTH: "-t"},
                    {LEMMA: pronoun, ORTH: "-" + pronoun}
                ]

    for verb, verb_lemma in [("est","être")]:
        for orth in [verb,verb.title()]:
            token = "{}-ce".format(orth)
            VERBS[token] = [
                {LEMMA: verb_lemma, ORTH: verb},
                {LEMMA: "t", ORTH: "-t"},
                {LEMMA: pronoun, ORTH: "-" + pronoun}
                {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
                {LEMMA: 'ce', ORTH: '-ce'}
            ]

    VERBS['est-ce'] = [
        {LEMMA: 'être', ORTH: "est"},
        {LEMMA: 'ce', ORTH: '-ce'}
    ]

    for pre, pre_lemma in (("qu'", "que"), ("Qu'", "Que"), ("n'", "ne"),
                           ("N'", "Ne")):
        VERBS['{}est-ce'.format(pre)] = [
            {LEMMA: pre_lemma, ORTH: pre},
            {LEMMA: 'être', ORTH: "est"},
            {LEMMA: 'ce', ORTH: '-ce'}
        ]
    for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
        for orth in [pre,pre.title()]:
            VERBS['{}est-ce'.format(orth)] = [
                {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
                {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
                {LEMMA: 'ce', ORTH: '-ce'}
            ]

    HYPHEN = ['-', '‐']
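As a reading aid for the rewritten loops above: each verb now gets one exception per capitalisation variant and pronoun. Tracing the ("a", "avoir") pair by hand, the generated entries look like the following (illustrative only, not part of the diff; LEMMA, ORTH and TAG are assumed to come from spaCy's symbol table as in the surrounding file):

    from spacy.symbols import LEMMA, ORTH, TAG

    VERBS = {}
    VERBS["a-t-il"] = [
        {LEMMA: "avoir", ORTH: "a", TAG: "VERB"},
        {LEMMA: "t", ORTH: "-t"},
        {LEMMA: "il", ORTH: "-il"}
    ]
    # The title-cased orth produces the analogous "A-t-il" entry with ORTH: "A".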
@@ -16,3 +16,6 @@ class Hebrew(Language):

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS


EXPORT = Hebrew
@@ -5,7 +5,8 @@ from .tokenizer_exceptions import TOKEN_MATCH
from .language_data import *
from ..attrs import LANG
from ..language import Language

from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP

class Hungarian(Language):
    lang = 'hu'

@@ -24,3 +25,10 @@ class Hungarian(Language):
        stop_words = set(STOP_WORDS)

        token_match = TOKEN_MATCH

        @classmethod
        def create_lemmatizer(cls, nlp=None):
            return Lemmatizer(LOOK_UP)


EXPORT = Hungarian
File diff suppressed because it is too large
@@ -5,7 +5,8 @@ from ..language import Language
from ..attrs import LANG

from .language_data import *

from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP

class Italian(Language):
    lang = 'it'

@@ -16,3 +17,11 @@ class Italian(Language):

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS

        @classmethod
        def create_lemmatizer(cls, nlp=None):
            return Lemmatizer(LOOK_UP)



EXPORT = Italian
File diff suppressed because it is too large
@@ -21,3 +21,5 @@ class Japanese(Language):
                              "https://github.com/mocobeta/janome")
        words = [x.surface for x in Tokenizer().tokenize(text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

EXPORT = Japanese
@@ -22,5 +22,6 @@ TAG_MAP = {
    "CCONJ": {POS: CCONJ}, # U20
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB},
    "PART": {POS: PART}
    "PART": {POS: PART},
    "SP": {POS: SPACE}
}
@@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals

from .lemmatizer import Lemmatizer


class Lemmatizer(Lemmatizer):
    @classmethod
    def load(cls, path, lookup):
        return cls(lookup or {})

    def __init__(self, lookup):
        self.lookup = lookup

    def __call__(self, string, univ_pos, morphology=None):
        try:
            return set([self.lookup[string]])
        except:
            return set([string])
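The new lookup lemmatizer is a plain dictionary lookup with a pass-through fallback. A short usage sketch, assuming the module lands at spacy/lemmatizerlookup.py as the imports elsewhere in this diff suggest (the table below is a toy example):

    from spacy.lemmatizerlookup import Lemmatizer

    lookup = {"fais": "faire", "abaissements": "abaissement"}  # toy lookup table
    lemmatizer = Lemmatizer(lookup)

    lemmatizer("fais", "VERB")      # -> set(['faire'])
    lemmatizer("inconnu", "NOUN")   # -> set(['inconnu'])  unknown forms pass through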
@@ -23,3 +23,6 @@ class Norwegian(Language):
        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        #tag_map = TAG_MAP
        stop_words = STOP_WORDS


EXPORT = Norwegian
@@ -15,3 +15,6 @@ class Dutch(Language):

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS


EXPORT = Dutch
@@ -5,7 +5,8 @@ from ..language import Language
from ..attrs import LANG

from .language_data import *

from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP

class Portuguese(Language):
    lang = 'pt'

@@ -16,3 +17,10 @@ class Portuguese(Language):

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS

        @classmethod
        def create_lemmatizer(cls, nlp=None):
            return Lemmatizer(LOOK_UP)


EXPORT = Portuguese
File diff suppressed because it is too large
@@ -4,7 +4,8 @@ from __future__ import unicode_literals, print_function
from ..language import Language
from ..attrs import LANG
from .language_data import *

from ..lemmatizerlookup import Lemmatizer
from .lemmatization import LOOK_UP

class Swedish(Language):
    lang = 'sv'

@@ -15,3 +16,10 @@ class Swedish(Language):

        tokenizer_exceptions = TOKENIZER_EXCEPTIONS
        stop_words = STOP_WORDS

        @classmethod
        def create_lemmatizer(cls, nlp=None):
            return Lemmatizer(LOOK_UP)


EXPORT = Swedish
File diff suppressed because it is too large
@@ -129,6 +129,10 @@ def EN():
def DE():
    return German()


@pytest.fixture(scope="session")
def FR():
    return French()


def pytest_addoption(parser):
    parser.addoption("--models", action="store_true",
@@ -37,4 +37,29 @@ def test_tokenizer_handles_exc_in_text_2(fr_tokenizer):
    tokens = fr_tokenizer(text)
    assert len(tokens) == 11
    assert tokens[1].text == "après-midi"
    assert tokens[9].text == "italo-mexicain"


def test_tokenizer_handles_title(fr_tokenizer):
    text = "N'est-ce pas génial?"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[0].text == "N'"
    assert tokens[0].lemma_ == "ne"
    assert tokens[1].text == "est"
    assert tokens[1].lemma_ == "être"
    assert tokens[2].text == "-ce"
    assert tokens[2].lemma_ == "ce"


def test_tokenizer_handles_title_2(fr_tokenizer):
    text = "Est-ce pas génial?"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[0].text == "Est"
    assert tokens[0].lemma_ == "être"


def test_tokenizer_handles_title_3(fr_tokenizer):
    text = "Qu'est-ce que tu fais?"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 7
    assert tokens[0].text == "Qu'"
    assert tokens[0].lemma_ == "que"
@@ -0,0 +1,37 @@
# coding: utf-8

from __future__ import unicode_literals

import pytest


@pytest.mark.models
def test_lemmatizer_verb(FR):
    text = "Qu'est-ce que tu fais?"
    tokens = FR(text)
    assert tokens[0].lemma_ == "que"
    assert tokens[1].lemma_ == "être"
    assert tokens[5].lemma_ == "faire"


@pytest.mark.models
@pytest.mark.xfail(reason="sont tagged as AUX")
def test_lemmatizer_noun_verb_2(FR):
    text = "Les abaissements de température sont gênants."
    tokens = FR(text)
    assert tokens[4].lemma_ == "être"


@pytest.mark.models
@pytest.mark.xfail(reason="Costaricienne is tagged PROPN instead of NOUN, and spaCy doesn't lemmatize PROPN")
def test_lemmatizer_noun(FR):
    text = "il y a des Costaricienne."
    tokens = FR(text)
    assert tokens[4].lemma_ == "Costaricain"


@pytest.mark.models
def test_lemmatizer_noun_2(FR):
    text = "Les abaissements de température sont gênants."
    tokens = FR(text)
    assert tokens[1].lemma_ == "abaissement"
    assert tokens[5].lemma_ == "gênant"
@@ -9,3 +9,6 @@ class Chinese(Language):
        import jieba
        words = list(jieba.cut(text, cut_all=True))
        return Doc(self.vocab, words=words, spaces=[False]*len(words))


EXPORT = Chinese
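The Japanese and Chinese entries above share one pattern: an external segmenter (Janome or jieba) supplies the word list, and the Doc is built with spaces=[False]*len(words) because these scripts do not separate words with whitespace. A minimal sketch of that construction with the segmenter output hard-coded (illustrative values only):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    words = ["我们", "是", "朋友"]  # stand-in for jieba/Janome output
    doc = Doc(Vocab(), words=words, spaces=[False] * len(words))
    # Tokens keep their surface forms; no whitespace is inserted between them.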