diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index 6ee193d41..138b3afc8 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -24,34 +24,7 @@ STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
-TOKENIZER_PREFIXES = map(re.escape, r'''
-,
-"
-(
-[
-{
-*
-<
->
-$
-£
-„
-“
-'
-``
-`
-#
-US$
-C$
-A$
-‘
-....
-...
-‚
-»
-_
-§
-'''.strip().split('\n'))
+TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... ...'''.split()
TOKENIZER_SUFFIXES = r'''
,
@@ -125,11 +98,11 @@ _
(?<=[0-9])kb
'''.strip().split('\n')
-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
+TOKENIZER_INFIXES = (r'''\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
-ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
+ABBREVIATIONS = {abbrev: [{"ORTH": abbrev}] for abbrev in
_load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
TOKENIZER_EXCEPTIONS = {
diff --git a/spacy/tests/hu/test_tokenizer.py b/spacy/tests/hu/test_tokenizer.py
deleted file mode 100644
index 4cbf1757d..000000000
--- a/spacy/tests/hu/test_tokenizer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import pytest
-
-from spacy.hu import Hungarian
-
-
-@pytest.fixture(scope="session")
-def HU():
-    return Hungarian()
-
-
-@pytest.fixture(scope="module")
-def hu_tokenizer(HU):
-    return HU.tokenizer
-
-
-@pytest.mark.parametrize(("input_str", "expected_length"), [
-    ("A vs. egy", 3),
-    ("A dr. egy", 3),
-    ("A .hu egy tld.", 5),
-    ("A .hu.", 3),
-    ("Az egy.ketto pelda.", 4),
-    ("A pl. rovidites.", 4),
-    ("A S.M.A.R.T. szo.", 4)
-])
-def test_abbreviations(hu_tokenizer, input_str, expected_length):
-    tokens = hu_tokenizer(input_str)
-    assert len(tokens) == expected_length
diff --git a/spacy/tests/hu/tokenizer/__init__.py b/spacy/tests/hu/tokenizer/__init__.py
new file mode 100644
index 000000000..818b62e48
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/__init__.py
@@ -0,0 +1,4 @@
+__author__ = 'gyorgyorosz'
+
+if __name__ == "__main__":
+    pass
diff --git a/spacy/tests/hu/tokenizer/test_default_token_dots.txt b/spacy/tests/hu/tokenizer/test_default_token_dots.txt
new file mode 100644
index 000000000..0e9ad3c65
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_default_token_dots.txt
@@ -0,0 +1,60 @@
+# TOKEN dots
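+# Format: "IN :" lines hold the tokenizer input (consecutive IN lines form one input);
+# "OUT:" lines list the expected tokens; any other line separates test cases.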
+
+0. simple words
+IN : N. kormányzósági
+IN : székhely.
+OUT: N. kormányzósági
+OUT: székhely.
+
+
+1. words with dots
+
+1.1 sentence-internal versions
+1.1.1 words starting with a dot
+IN : A .hu egy tld.
+OUT: A .hu egy tld.
+1.1.2 dot inside the word
+IN : Az egy.ketto pelda.
+OUT: Az egy.ketto pelda.
+1.1.3 dot at the end of the word
+IN : A pl. rovidites.
+OUT: A pl. rovidites.
+1.1.4 dotted word
+IN : A S.M.A.R.T. szo.
+OUT: A S.M.A.R.T. szo.
+
+1.2 sentence-final versions
+1.2.1 words starting with a dot
+IN : A .hu.
+OUT: A .hu.
+1.2.2 dot inside the word
+IN : Az egy.ketto.
+OUT: Az egy.ketto.
+1.2.3 dot at the end of the word
+#TODO: cf. Huntoken
+IN : A pl.
+OUT: A pl.
+1.2.4 dotted word
+#TODO: cf. Huntoken
+IN : A S.M.A.R.T.
+OUT: A S.M.A.R.T.
+
+
+2. multiple dots
+
+2.1 new word after two or more dots
+IN : Egy..ket.
+OUT: Egy..ket.
+IN : Valami... van.
+OUT: Valami... van.
+IN : Valami ...van...
+OUT: Valami ...van...
+IN : Valami...
+OUT: Valami...
+IN : Valami ...
+OUT: Valami ...
+IN : Valami ... más.
+OUT: Valami ... más.
+
diff --git a/spacy/tests/hu/tokenizer/test_tokenizer.py b/spacy/tests/hu/tokenizer/test_tokenizer.py
new file mode 100644
index 000000000..12dbe5b78
--- /dev/null
+++ b/spacy/tests/hu/tokenizer/test_tokenizer.py
@@ -0,0 +1,80 @@
+import os
+import re
+
+import pytest
+
+from spacy.hu import Hungarian
+
+_MODULE_PATH = os.path.dirname(__file__)
+
+
+class TokenizerTestCase(object):
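+    """One tokenizer test case: an input string and the token texts expected from it.
+    Expected tokens are read from <w>...</w> / <c>...</c> markup in OUT lines."""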
+    INPUT_PREFIX = "IN :"
+    OUTPUT_PREFIX = "OUT:"
+    WORD_PATTERN = re.compile(r"<([wc])>([^<>]+)</\1>")
+
+    def __init__(self, input_str, expected_words):
+        self.input = input_str
+        self.expected_tokens = expected_words
+
+    def __repr__(self):
+        return "TokenizerTestCase(input={}, expected_tokens={})".format(repr(self.input), self.expected_tokens)
+
+    def to_tuple(self):
+        return (self.input, self.expected_tokens)
+
+    @classmethod
+    def _parse_output_line(cls, line):
+        for match in cls.WORD_PATTERN.finditer(line):
+            yield match.group(2)
+
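+    # IN lines accumulate into one (possibly multi-line) input; a new IN block after
+    # OUT lines, or any comment/blank line, flushes the current case.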
+    @classmethod
+    def read_from_file(cls, path):
+        with open(path) as f:
+            input_lines = []
+            output_words = []
+            last_type = None
+            for line in f:
+                if line.startswith(cls.INPUT_PREFIX):
+                    if last_type == TokenizerTestCase.OUTPUT_PREFIX and input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    input_lines.append(line[len(cls.INPUT_PREFIX):].strip())
+                    last_type = TokenizerTestCase.INPUT_PREFIX
+                elif line.startswith(cls.OUTPUT_PREFIX):
+                    output_words.extend(list(cls._parse_output_line(line.strip())))
+                    last_type = TokenizerTestCase.OUTPUT_PREFIX
+                else:
+                    # Comments separate test cases
+                    if input_lines:
+                        yield TokenizerTestCase("\n".join(input_lines), output_words)
+                        input_lines = []
+                        output_words = []
+                    last_type = None
+
+
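+# Parse the dot-handling cases once at import time so pytest can parametrize over them.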
+_DOTS_CASES = list(TokenizerTestCase.read_from_file(_MODULE_PATH + "/test_default_token_dots.txt"))
+
+
+@pytest.fixture(scope="session")
+def HU():
+    return Hungarian()
+
+
+@pytest.fixture(scope="module")
+def hu_tokenizer(HU):
+    return HU.tokenizer
+
+
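+# Whitespace tokens are dropped before comparing against the expected token list.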
+@pytest.mark.parametrize(("test_case"), _DOTS_CASES)
+def test_abbreviations(hu_tokenizer, test_case):
+    tokens = hu_tokenizer(test_case.input)
+    token_list = [token.orth_ for token in tokens if not token.is_space]
+    assert test_case.expected_tokens == token_list, "{} was erroneously tokenized as {}".format(test_case, token_list)