mirror of https://github.com/explosion/spaCy.git
Adding partial hyphen and quote handling support.
This commit is contained in:
parent
2051726fd3
commit
0cf2144d24
|
@ -53,7 +53,8 @@ _
|
|||
\.\.
|
||||
\.\.\.
|
||||
\.\.\.\.
|
||||
(?<=[a-züóőúéáűíAÜÓŐÚÉÁŰÍ)\]"'´«‘’%\)²“”])\.
|
||||
(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”-])\.
|
||||
(?<=[a-züóőúéáűí)])-e
|
||||
\-\-
|
||||
´
|
||||
(?<=[0-9])km²
|
||||
|
@ -98,14 +99,17 @@ _
|
|||
(?<=[0-9])kb
|
||||
'''.strip().split('\n')
|
||||
|
||||
TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
|
||||
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
|
||||
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
|
||||
TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ]) (?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) '''
|
||||
r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]) (?<=[0-9])-(?=[0-9]) '''
|
||||
r'''(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])''').split()
|
||||
|
||||
ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
|
||||
_load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
|
||||
|
||||
TOKENIZER_EXCEPTIONS = {
|
||||
"\w*\(\w+\)\w*": [{"F": "???"}],
|
||||
"-e": [{"F": "-e"}],
|
||||
|
||||
"vs.": [{"F": "vs."}],
|
||||
|
||||
"''": [{"F": "''"}],
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
# TOKEN hyphen
|
||||
|
||||
-nak, -nek es ehhez hasonlok
|
||||
IN : Egy -nak, -jaiért, -magyar, bel- van.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>-nak</w><c>,</c><ws> </ws><w>-jaiért</w><c>,</c><ws> </ws><w>-magyar</w><c>,</c><ws> </ws><w>bel-</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : Egy -nak.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>-nak</w><c>.</c></s>
|
||||
IN : Egy bel-.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>bel-</w><c>.</c></s>
|
||||
IN : Dinnye-domb-.
|
||||
OUT: <s><w>Dinnye-domb-</w><c>.</c></s>
|
||||
|
||||
kulonvalt '-e'
|
||||
IN : Ezen -e elcsatangolt.
|
||||
OUT: <s><w>Ezen</w><ws> </ws><w>-e</w><ws> </ws><w>elcsatangolt</w><c>.</c></s>
|
||||
|
||||
-e levagasa, zarojel nelkul
|
||||
IN : Lakik-e
|
||||
OUT: <s><w>Lakik</w><w>-e</w></s>
|
||||
IN : Lakik-e?
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>?</c></s>
|
||||
IN : Lakik-e.
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>.</c></s>
|
||||
IN : Lakik-e...
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>...</c></s>
|
||||
IN : Lakik-e... van.
|
||||
OUT: <s><w>Lakik</w><w>-e</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
|
||||
IN : Lakik-e van?
|
||||
OUT: <s><w>Lakik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
|
||||
# TODO: adapt spacy to handle such brackets
|
||||
zarojeles mondatkozi valtozatok
|
||||
#IN : (La)kik-e van?
|
||||
#OUT: <s><w>(La)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
#IN : L(a)kik-e van?
|
||||
#OUT: <s><w>L(a)kik</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
#IN : Lak(ik)-e van?
|
||||
#OUT: <s><w>Lak(ik)</w><w>-e</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
|
||||
# TODO: adapt spacy to handle such brackets
|
||||
zarojeles mondatvegi valtozatok
|
||||
#IN : (La)kik-e.
|
||||
#OUT: <s><w>(La)kik</w><w>-e</w><c>.</c></s>
|
||||
#IN : L(a)kik-e.
|
||||
#OUT: <s><w>L(a)kik</w><w>-e</w><c>.</c></s>
|
||||
#IN : Lak(ik)-e.
|
||||
#OUT: <s><w>Lak(ik)</w><w>-e</w><c>.</c></s>
|
||||
|
||||
kontroll
|
||||
IN : Lakik-elem van?
|
||||
OUT: <s><w>Lakik-elem</w><ws> </ws><w>van</w><c>?</c></s>
|
||||
IN : Van lakik-elem.
|
||||
OUT: <s><w>Van</w><ws> </ws><w>lakik-elem</w><c>.</c></s>
|
||||
IN : A 7-es busz?
|
||||
OUT: <s><w>A</w><ws> </ws><w>7-es</w><ws> </ws><w>busz</w><c>?</c></s>
|
||||
IN : A 7-es?
|
||||
OUT: <s><w>A</w><ws> </ws><w>7-es</w><c>?</c></s>
|
||||
IN : A 7-es.
|
||||
OUT: <s><w>A</w><ws> </ws><w>7-es</w><c>.</c></s>
|
||||
|
||||
problemas eset, megengedjuk # TODO: works erroneously in HunToken, but OK in spacy
|
||||
IN : Ez (lakik)-e?
|
||||
OUT: <s><w>Ez</w><ws> </ws><c>(</c><w>lakik</w><c>)</c><w>-e</w><c>?</c></s>
|
||||
|
||||
TODO: macska-/kutyavilag
|
||||
IN : A macska-/kutyavilag van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>macska-</w><c>/</c><w>kutyavilag</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
||||
%-, §-
|
||||
IN : A §-sal.
|
||||
OUT: <s><w>A</w><ws> </ws><w>§-sal</w><c>.</c></s>
|
||||
IN : A %-sal.
|
||||
OUT: <s><w>A</w><ws> </ws><w>%-sal</w><c>.</c></s>
|
||||
|
||||
tobb kotojel
|
||||
IN : A CD-ROM-okrol.
|
||||
OUT: <s><w>A</w><ws> </ws><w>CD-ROM-okrol</w><c>.</c></s>
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
# TOKEN quote
|
||||
|
||||
mondatban
|
||||
IN : Az "Ime, hat"-ban irja.
|
||||
OUT: <s><w>Az</w><ws> </ws><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><w>-ban</w><ws> </ws><w>irja</w><c>.</c></s>
|
||||
|
||||
mondat elejen
|
||||
IN : "Ime, hat"-ban irja.
|
||||
OUT: <s><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><w>-ban</w><ws> </ws><w>irja</w><c>.</c></s>
|
||||
|
||||
mondat vegen
|
||||
IN : Az "Ime, hat".
|
||||
OUT: <s><w>Az</w><ws> </ws><c>"</c><w>Ime</w><c>,</c><ws> </ws><w>hat</w><c>"</c><c>.</c></s>
|
||||
|
||||
magaban
|
||||
IN : Egy 24"-os monitor.
|
||||
OUT: <s><w>Egy</w><ws> </ws><w>24</w><c>"</c><w>-os</w><ws> </ws><w>monitor</w><c>.</c></s>
|
||||
|
||||
aposztrof
|
||||
IN : A don't van.
|
||||
OUT: <s><w>A</w><ws> </ws><w>don't</w><ws> </ws><w>van</w><c>.</c></s>
|
||||
|
|
@ -55,6 +55,9 @@ class TokenizerTestCase(object):
|
|||
|
||||
|
||||
_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
|
||||
_HYPHEN_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_hyphen.txt"))
|
||||
_QUOTE_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_quote.txt"))
|
||||
ALL_TESTCASES = _DOTS_CASES + _HYPHEN_CASES
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
@ -67,8 +70,8 @@ def hu_tokenizer(HU):
|
|||
return HU.tokenizer
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
|
||||
@pytest.mark.parametrize(("test_case"), ALL_TESTCASES)
|
||||
def test_abbreviations(hu_tokenizer, test_case):
|
||||
tokens = hu_tokenizer(test_case.input)
|
||||
token_list = [token.orth_ for token in tokens if not token.is_space]
|
||||
assert test_case.expected_tokens == token_list, "{} was erronously tokenized as {}".format(test_case, token_list)
|
||||
assert test_case.expected_tokens == token_list#, "{} was erronously tokenized as {}".format(test_case, token_list)
|
||||
|
|
Loading…
Reference in New Issue