mirror of https://github.com/explosion/spaCy.git

Passing Hungarian abbrev tests.

This commit is contained in:
parent 0289b8ceaa
commit 2051726fd3
@@ -24,34 +24,7 @@ STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))

HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]

-TOKENIZER_PREFIXES = map(re.escape, r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-„
-“
-'
-``
-`
-#
-US$
-C$
-A$
-‘
-....
-...
-‚
-»
-_
-§
-'''.strip().split('\n'))
+TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... ...'''.split()

TOKENIZER_SUFFIXES = r'''
,
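The prefix entries above are plain strings rather than regular expressions: the removed definition escaped them eagerly with map(re.escape, ...), while the new one-liner keeps them as bare tokens and leaves escaping to whatever compiles the prefix list. A minimal sketch of the idea with plain re (not spaCy's actual tokenizer machinery; the prefix subset and variable names are illustrative only):

import re

# Illustrative only: escape each prefix and join into one anchored alternation,
# then use it to peel punctuation off the front of a token candidate.
prefixes = r''', " ( [ { * < $ £ “ ' `` ` #'''.split()
prefix_re = re.compile("^(?:" + "|".join(re.escape(p) for p in prefixes) + ")")

print(prefix_re.match('"Hello'))  # matches the leading double quote
print(prefix_re.match("Hello"))   # None - nothing to strip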
@@ -125,11 +98,11 @@ _

(?<=[0-9])kb
'''.strip().split('\n')

-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
                     r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()

-ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
+ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
                 _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}

TOKENIZER_EXCEPTIONS = {
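The two functional changes here are the infix pattern, now \.\.+ (two or more dots) so that forms like "Egy..ket" gain an internal split point, and the exception attribute key, renamed from "F" to "ORTH". A small illustrative sketch outside spaCy (the three abbreviations are just the ones exercised by the tests below, not the full list loaded from the data files):

import re

# The new infix pattern matches runs of two or more dots.
dots = re.compile(r"\.\.+")
print(dots.split("Egy..ket"))    # ['Egy', 'ket']
print(dots.findall("Egy..ket"))  # ['..']

# Each abbreviation becomes a one-token exception keyed by its surface form,
# with the token's orthography given under the ORTH attribute name.
ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in ["dr.", "pl.", "vs."]}
print(ABBREVIATIONS["pl."])      # [{'ORTH': 'pl.'}]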
@@ -1,27 +0,0 @@
-import pytest
-
-from spacy.hu import Hungarian
-
-
-@pytest.fixture(scope="session")
-def HU():
-    return Hungarian()
-
-
-@pytest.fixture(scope="module")
-def hu_tokenizer(HU):
-    return HU.tokenizer
-
-
-@pytest.mark.parametrize(("input_str", "expected_length"), [
-    ("A vs. egy", 3),
-    ("A dr. egy", 3),
-    ("A .hu egy tld.", 5),
-    ("A .hu.", 3),
-    ("Az egy.ketto pelda.", 4),
-    ("A pl. rovidites.", 4),
-    ("A S.M.A.R.T. szo.", 4)
-])
-def test_abbreviations(hu_tokenizer, input_str, expected_length):
-    tokens = hu_tokenizer(input_str)
-    assert len(tokens) == expected_length
@@ -0,0 +1,4 @@
+__author__ = 'gyorgyorosz'
+
+if __name__ == "__main__":
+    pass
@@ -0,0 +1,58 @@
+# TOKEN dots
+
+0. egyszeru szavak
+IN : N. kormányzósági
+IN : székhely.
+OUT: <s><w>N.</w><ws> </ws><w>kormányzósági</w><ws>
+OUT: </ws><w>székhely</w><c>.</c></s>
+
+
+1. szavak pontokkal
+
+1.1 mondatkozi verziok
+1.1.1 pottal kezdodo szavak
+IN : A .hu egy tld.
+OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>
+1.1.2 pont a szo belsejeben
+IN : Az egy.ketto pelda.
+OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><ws> </ws><w>pelda</w><c>.</c></s>
+1.1.3 pont a szo vegen
+IN : A pl. rovidites.
+OUT: <s><w>A</w><ws> </ws><w>pl.</w><ws> </ws><w>rovidites</w><c>.</c></s>
+1.1.4 pontozott szo
+IN : A S.M.A.R.T. szo.
+OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w><ws> </ws><w>szo</w><c>.</c></s>
+
+1.2 mondatvegi verziok
+1.2.1 pottal kezdodo szavak
+IN : A .hu.
+OUT: <s><w>A</w><ws> </ws><w>.hu</w><c>.</c></s>
+1.2.2 pont a szo belsejeben
+IN : Az egy.ketto.
+OUT: <s><w>Az</w><ws> </ws><w>egy.ketto</w><c>.</c></s>
+1.2.3 pont a szo vegen
+#TODO: cf. Huntoken
+IN : A pl.
+OUT: <s><w>A</w><ws> </ws><w>pl.</w></s>
+1.2.4 pontozott szo
+#TODO: cf. Huntoken
+IN : A S.M.A.R.T.
+OUT: <s><w>A</w><ws> </ws><w>S.M.A.R.T.</w></s>
+
+
+2. tobb pont
+
+2.1 ketto vagy tobb pont utan uj szo
+IN : Egy..ket.
+OUT: <s><w>Egy</w><c>..</c><w>ket</w><c>.</c></s>
+IN : Valami... van.
+OUT: <s><w>Valami</w><c>...</c><ws> </ws><w>van</w><c>.</c></s>
+IN : Valami ...van...
+OUT: <s><w>Valami</w><ws> </ws><c>...</c><w>van</w><c>...</c></s>
+IN : Valami...
+OUT: <s><w>Valami</w><c>...</c></s>
+IN : Valami ...
+OUT: <s><w>Valami</w><ws> </ws><c>...</c></s>
+IN : Valami ... más.
+OUT: <s><w>Valami</w><ws> </ws><c>...</c><ws> </ws><w>más</w><c>.</c></s>
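Each OUT: line above encodes the expected tokenization: <w> and <c> spans are expected tokens, <ws> spans are whitespace and are ignored. The harness in the next file extracts them with a small regex; for illustration, one line run through the same pattern:

import re

# Same pattern as TokenizerTestCase.WORD_PATTERN below; <ws> spans do not match
# because the tag group is restricted to a single 'w' or 'c'.
WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")
line = "OUT: <s><w>A</w><ws> </ws><w>.hu</w><ws> </ws><w>egy</w><ws> </ws><w>tld</w><c>.</c></s>"
print([m.group(2) for m in WORD_PATTERN.finditer(line)])
# ['A', '.hu', 'egy', 'tld', '.']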
@@ -0,0 +1,74 @@
+import os
+import re
+
+import pytest
+
+from spacy.hu import Hungarian
+
+_MODULE_PATH = os.path.dirname(__file__)
+
+
+class TokenizerTestCase(object):
+    INPUT_PREFIX = "IN :"
+    OUTPUT_PREFIX = "OUT:"
+    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")
+
+    def __init__(self, input_str, expected_words):
+        self.input = input_str
+        self.expected_tokens = expected_words
+
+    def __repr__(self):
+        return "TokenizerTestCase<input={}, words={}>".format(repr(self.input), self.expected_tokens)
+
+    def to_tuple(self):
+        return (self.input, self.expected_tokens)
+
+    @classmethod
+    def _parse_output_line(cls, line):
+        for match in cls.WORD_PATTERN.finditer(line):
+            yield match.group(2)
+
+    @classmethod
+    def read_from_file(cls, path):
+        with open(path) as f:
+            input_lines = []
+            output_words = []
+            last_type = None
+            for line in f:
+                if line.startswith(cls.INPUT_PREFIX):
+                    if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
+                    last_type = TokenizerTestCase.INPUT_PREFIX
+                elif line.startswith(cls.OUTPUT_PREFIX):
+                    output_words.extend(list(cls._parse_output_line(line.strip())))
+                    last_type = TokenizerTestCase.OUTPUT_PREFIX
+                else:
+                    # Comments separate test cases
+                    if input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    last_type = None
+
+
+_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
+
+
+@pytest.fixture(scope="session")
+def HU():
+    return Hungarian()
+
+
+@pytest.fixture(scope="module")
+def hu_tokenizer(HU):
+    return HU.tokenizer
+
+
+@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
+def test_abbreviations(hu_tokenizer, test_case):
+    tokens = hu_tokenizer(test_case.input)
+    token_list = [token.orth_ for token in tokens if not token.is_space]
+    assert test_case.expected_tokens == token_list, "{} was erroneously tokenized as {}".format(test_case, token_list)
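A quick, hypothetical way to use the helper above outside pytest, e.g. when debugging a single failing case (assumes the module above is importable so TokenizerTestCase and _MODULE_PATH are in scope):

# Hypothetical interactive use of the parser above (no tokenizer required):
cases = list(TokenizerTestCase.read_from_file(
    _MODULE_PATH + "/test_default_token_dots.txt"))
print(len(cases))           # number of IN/OUT cases parsed from the data file
print(cases[0].to_tuple())  # (input string, list of expected tokens)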