diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index 0cb4ffd38..49652c5ac 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -4,22 +4,25 @@ from __future__ import unicode_literals
 import six
 
 from spacy.language_data import strings_to_exc, update_exc
-from .punctuations import *
+from .punctuation import *
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import ABBREVIATIONS
 from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base
 
-STOP_WORDS = set(STOP_WORDS)
-TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
-TOKENIZER_INFIXES = TOKENIZER_INFIXES
-# HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
+STOP_WORDS = set(STOP_WORDS)
+
 
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
 
+
+TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
+TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
+TOKENIZER_INFIXES = TOKENIZER_INFIXES
+
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
 
diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py
new file mode 100644
index 000000000..e28052fd3
--- /dev/null
+++ b/spacy/hu/punctuation.py
@@ -0,0 +1,25 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
+
+
+TOKENIZER_SUFFIXES = [
+    r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
+]
+
+TOKENIZER_INFIXES = [
+    r'(?<=[0-9])-(?=[0-9])',
+    r'(?<=[0-9])[+\-\*/^](?=[0-9])',
+    r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+    r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
+    r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
+]
+
+
+TOKENIZER_INFIXES += LIST_ELLIPSES
+
+
+__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/hu/punctuations.py b/spacy/hu/punctuations.py
deleted file mode 100644
index 3681a2fbe..000000000
--- a/spacy/hu/punctuations.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-TOKENIZER_PREFIXES = r'''
-+
-'''.strip().split('\n')
-
-TOKENIZER_SUFFIXES = r'''
-,
-\"
-\)
-\]
-\}
-\*
-\!
-\?
-\$
->
-:
-;
-'
-”
-“
-«
-_
-''
-’
-‘
-€
-\.\.
-\.\.\.
-\.\.\.\.
-(?<=[a-züóőúéáűí)\]"'´«‘’%\)²“”+-])\.
-(?<=[a-züóőúéáűí)])-e
-\-\-
-´
-(?<=[0-9])\+
-(?<=[a-z0-9üóőúéáűí][\)\]”"'%\)§/])\.
-(?<=[0-9])km²
-(?<=[0-9])m²
-(?<=[0-9])cm²
-(?<=[0-9])mm²
-(?<=[0-9])km³
-(?<=[0-9])m³
-(?<=[0-9])cm³
-(?<=[0-9])mm³
-(?<=[0-9])ha
-(?<=[0-9])km
-(?<=[0-9])m
-(?<=[0-9])cm
-(?<=[0-9])mm
-(?<=[0-9])µm
-(?<=[0-9])nm
-(?<=[0-9])yd
-(?<=[0-9])in
-(?<=[0-9])ft
-(?<=[0-9])kg
-(?<=[0-9])g
-(?<=[0-9])mg
-(?<=[0-9])µg
-(?<=[0-9])t
-(?<=[0-9])lb
-(?<=[0-9])oz
-(?<=[0-9])m/s
-(?<=[0-9])km/h
-(?<=[0-9])mph
-(?<=°[FCK])\.
-(?<=[0-9])hPa
-(?<=[0-9])Pa
-(?<=[0-9])mbar
-(?<=[0-9])mb
-(?<=[0-9])T
-(?<=[0-9])G
-(?<=[0-9])M
-(?<=[0-9])K
-(?<=[0-9])kb
-'''.strip().split('\n')
-
-TOKENIZER_INFIXES = r'''
-…
-\.\.+
-(?<=[a-züóőúéáűí])\.(?=[A-ZÜÓŐÚÉÁŰÍ])
-(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ0-9])"(?=[\-a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
-(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])--(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
-(?<=[0-9])[+\-\*/^](?=[0-9])
-(?<=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ]),(?=[a-zA-ZüóőúéáűíÜÓŐÚÉÁŰÍ])
-'''.strip().split('\n')
-
-__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]