mirror of https://github.com/explosion/spaCy.git
Fixed hyphen handling in the Hungarian tokenizer.
This commit is contained in:
parent f77c0284d6
commit 63037e79af
@@ -15,7 +15,7 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
 
-TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
+TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
 TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES
@@ -6,6 +6,13 @@ from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPH
 
+CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿"
+
+TOKENIZER_PREFIXES = (
+    [r'\+'] +
+    LIST_PUNCT +
+    LIST_ELLIPSES +
+    LIST_QUOTES
+)
+
 TOKENIZER_SUFFIXES = (
     LIST_PUNCT +
     LIST_ELLIPSES +
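Note: spaCy compiles a prefix list like the one added above into a single regular expression anchored at the start of the string, and peels whatever it matches off the front of each chunk until nothing matches any more. Below is a minimal runnable sketch of that mechanism; the three lists are trimmed stand-ins for the real LIST_PUNCT, LIST_ELLIPSES and LIST_QUOTES imported from language_data.punctuation, not their actual contents.

    import re

    # Trimmed stand-ins for the imported lists; the real ones are longer.
    LIST_PUNCT = [r'\.', r',', r'\(', r'\)', r'\?', r'!']
    LIST_ELLIPSES = [r'\.\.\.', r'…']
    LIST_QUOTES = [r'"', r'“', r'”']

    TOKENIZER_PREFIXES = (
        [r'\+'] +
        LIST_PUNCT +
        LIST_ELLIPSES +
        LIST_QUOTES
    )

    # One alternation, each branch anchored at the start of the string.
    prefix_re = re.compile('|'.join('^' + p for p in TOKENIZER_PREFIXES))

    print(prefix_re.search('+36').group())      # '+', why r'\+' was added
    print(prefix_re.search('"idézet').group())  # '"'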
@@ -32,4 +39,4 @@ TOKENIZER_INFIXES = (
     ]
 )
 
-__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
+__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
@@ -539,13 +539,13 @@ OTHER_EXC = """
 ORD_NUM_OR_DATE = "([A-Z0-9]+[./-])*(\d+\.?)"
 _NUM = "[+\-]?\d+([,.]\d+)*"
 _OPS = "[=<>+\-\*/^()÷%²]"
-_SUFFIES = "-[{a}]+".format(a=ALPHA_LOWER)
+_SUFFIXES = "-[{a}]+".format(a=ALPHA_LOWER)
 NUMERIC_EXP = "({n})(({o})({n}))*[%]?".format(n=_NUM, o=_OPS)
 TIME_EXP = "\d+(:\d+)*(\.\d+)?"
 
 NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
     ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
-    c=CURRENCY, s=_SUFFIES
+    c=CURRENCY, s=_SUFFIXES
 )
 
 TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=_URL_PATTERN, n=NUMS)).match
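Note: besides the typo fix (_SUFFIES becomes _SUFFIXES), this block is what keeps forms like 75%+1-100%-ig together: strings matched by TOKEN_MATCH are kept as single tokens instead of being split further. A self-contained rebuild of the NUMS pattern for illustration; ALPHA_LOWER and CURRENCY here are assumed stand-ins, since their real definitions sit elsewhere in the module.

    import re

    # Assumed stand-ins: ALPHA_LOWER is a character-class body, CURRENCY
    # an alternation of currency-amount patterns; both are defined
    # elsewhere in the real file.
    ALPHA_LOWER = 'a-z'
    CURRENCY = r'\d+\$'

    ORD_NUM_OR_DATE = r"([A-Z0-9]+[./-])*(\d+\.?)"
    _NUM = r"[+\-]?\d+([,.]\d+)*"
    _OPS = r"[=<>+\-\*/^()÷%²]"
    _SUFFIXES = "-[{a}]+".format(a=ALPHA_LOWER)
    NUMERIC_EXP = "({n})(({o})({n}))*[%]?".format(n=_NUM, o=_OPS)
    TIME_EXP = r"\d+(:\d+)*(\.\d+)?"

    NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
        ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
        c=CURRENCY, s=_SUFFIXES
    )

    match = re.compile("^({n})$".format(n=NUMS)).match
    for text in ['75%+1-100%-ig', '4:35', '1.', '5$-ban', 'Lakik-e']:
        print(text, bool(match(text)))
    # True, True, True, True, False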
@@ -58,7 +58,7 @@ LIST_HYPHENS = list(_HYPHENS.strip().split())
 
 
 ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
-ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
+ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '')
 ALPHA = ALPHA_LOWER + ALPHA_UPPER
 
 
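Note: the bug fixed here is that the old line replaced ' ' twice and never removed '\n', so newlines from the multi-line source string survived into ALPHA_UPPER and leaked into any character class built from it, such as the -[{a}]+ suffix pattern above. A quick illustration with a made-up excerpt:

    # Illustrative excerpt; the real _ALPHA_UPPER is a long multi-line string.
    _ALPHA_UPPER = """A Á B C
    D E É F"""

    broken = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
    fixed = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '')
    print(repr(broken))  # 'AÁBC\nDEÉF', stray newline kept
    print(repr(fixed))   # 'AÁBCDEÉF'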
@@ -29,7 +29,7 @@ HYPHEN_TESTS = [
     ('Dinnye-domb-.', ['Dinnye-domb-', '.']),
     ('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']),
     ('Lakik-e', ['Lakik', '-e']),
-    ('A--B', ['A', '--' 'B']),
+    ('A--B', ['A', '--', 'B']),
     ('Lakik-e?', ['Lakik', '-e', '?']),
     ('Lakik-e.', ['Lakik', '-e', '.']),
     ('Lakik-e...', ['Lakik', '-e', '...']),
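Note: the expected-output fix above corrects Python's implicit string-literal concatenation. With the comma missing, '--' 'B' is compiled into the single string '--B', so the old list silently had two elements instead of three and the test could never pass:

    old_expected = ['A', '--' 'B']   # missing comma: '--' 'B' == '--B'
    new_expected = ['A', '--', 'B']
    print(old_expected)   # ['A', '--B']
    print(new_expected)   # ['A', '--', 'B']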
@@ -42,6 +42,7 @@ HYPHEN_TESTS = [
     ('A 7-es.', ['A', '7-es', '.']),
     ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
     ('A %-sal.', ['A', '%-sal', '.']),
+    ('A $-sal.', ['A', '$-sal', '.']),
     ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
 ]
 
@@ -206,7 +207,7 @@ NUMBER_TESTS = [
     ('A 5$-ban', ['A', '5$-ban']),
     ('A 5$.', ['A', '5', '$', '.']),
     ('A 5$', ['A', '5', '$']),
-    ('A $5', ['A', '$', '5']),
+    ('A $5', ['A', '$5']),
     ('A 5km/h', ['A', '5', 'km/h']),
     ('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']),
     ('A 5km/h.', ['A', '5', 'km/h', '.']),
@@ -247,7 +248,8 @@ WIKI_TESTS = [
     ('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']),
 ]
 
-TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS # + HYPHEN_TESTS # + WIKI_TESTS
+TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS # + WIKI_TESTS
+
 
 @pytest.mark.parametrize('text,expected_tokens', TESTCASES)
 def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):