From 5b00039955a5dc259ce9e63cfe8bebc588f17585 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Wed, 7 Dec 2016 23:07:43 +0100 Subject: [PATCH] First steps towards the Hungarian tokenizer code. --- spacy/__init__.py | 2 + spacy/hu/__init__.py | 24 +++ spacy/hu/data/stopwords.txt | 219 +++++++++++++++++++++++++ spacy/hu/language_data.py | 271 +++++++++++++++++++++++++++++++ spacy/tests/hu/__init__.py | 0 spacy/tests/hu/test_tokenizer.py | 21 +++ 6 files changed, 537 insertions(+) create mode 100644 spacy/hu/__init__.py create mode 100644 spacy/hu/data/stopwords.txt create mode 100644 spacy/hu/language_data.py create mode 100644 spacy/tests/hu/__init__.py create mode 100644 spacy/tests/hu/test_tokenizer.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 68ac4c07b..09f114bc7 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,5 +1,6 @@ import pathlib +from spacy import hu from .util import set_lang_class, get_lang_class from .about import __version__ @@ -24,6 +25,7 @@ set_lang_class(es.Spanish.lang, es.Spanish) set_lang_class(pt.Portuguese.lang, pt.Portuguese) set_lang_class(fr.French.lang, fr.French) set_lang_class(it.Italian.lang, it.Italian) +set_lang_class(hu.Hungarian.lang, hu.Hungarian) set_lang_class(zh.Chinese.lang, zh.Chinese) diff --git a/spacy/hu/__init__.py b/spacy/hu/__init__.py new file mode 100644 index 000000000..39a2b6c2b --- /dev/null +++ b/spacy/hu/__init__.py @@ -0,0 +1,24 @@ +from __future__ import unicode_literals, print_function + +from . import language_data +from ..attrs import LANG +from ..language import Language + + +class Hungarian(Language): + lang = 'hu' + + class Defaults(Language.Defaults): + tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'hu' + + prefixes = tuple(language_data.TOKENIZER_PREFIXES) + + suffixes = tuple(language_data.TOKENIZER_SUFFIXES) + + infixes = tuple(language_data.TOKENIZER_INFIXES) + + tag_map = dict(language_data.TAG_MAP) + + stop_words = set(language_data.STOP_WORDS) diff --git a/spacy/hu/data/stopwords.txt b/spacy/hu/data/stopwords.txt new file mode 100644 index 000000000..0e5e775fe --- /dev/null +++ b/spacy/hu/data/stopwords.txt @@ -0,0 +1,219 @@ +a +abban +ahhoz +ahogy +ahol +aki +akik +akkor +akár +alatt +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amikor +amit +amolyan +amíg +annak +arra +arról +az +azok +azon +azonban +azt +aztán +azután +azzal +azért +be +belül +benne +bár +cikk +cikkek +cikkeket +csak +de +e +ebben +eddig +egy +egyes +egyetlen +egyik +egyre +egyéb +egész +ehhez +ekkor +el +ellen +elo +eloször +elott +elso +elég +előtt +emilyen +ennek +erre +ez +ezek +ezen +ezt +ezzel +ezért +fel +felé +ha +hanem +hiszen +hogy +hogyan +hát +ide +igen +ill +ill. +illetve +ilyen +ilyenkor +inkább +is +ismét +ison +itt +jobban +jó +jól +kell +kellett +keressünk +keresztül +ki +kívül +között +közül +le +legalább +legyen +lehet +lehetett +lenne +lenni +lesz +lett +ma +maga +magát +majd +meg +mellett +mely +melyek +mert +mi +miatt +mikor +milyen +minden +mindenki +mindent +mindig +mint +mintha +mit +mivel +miért +mondta +most +már +más +másik +még +míg +nagy +nagyobb +nagyon +ne +nekem +neki +nem +nincs +néha +néhány +nélkül +o +oda +ok +oket +olyan +ott +pedig +persze +például +rá +s +saját +sem +semmi +sok +sokat +sokkal +stb. 
+szemben +szerint +szinte +számára +szét +talán +te +tehát +teljes +ti +tovább +továbbá +több +túl +ugyanis +utolsó +után +utána +vagy +vagyis +vagyok +valaki +valami +valamint +való +van +vannak +vele +vissza +viszont +volna +volt +voltak +voltam +voltunk +által +általában +át +én +éppen +és +így +ön +össze +úgy +új +újabb +újra +ő +őket diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py new file mode 100644 index 000000000..692200876 --- /dev/null +++ b/spacy/hu/language_data.py @@ -0,0 +1,271 @@ +# encoding: utf8 +from __future__ import unicode_literals + +import os +import re + + +def _load_txt_data(*file_paths): + for path in file_paths: + with open(path) as f: + for line in f.readlines(): + if not line.strip().startswith("#"): + yield line.strip() + + +_MODULE_PATH = os.path.dirname(__file__) +_ABBREVIATIONS_ORIG_PATH = _MODULE_PATH + "/data/tokenizer/abbreviations_orig-hu.txt" +_ABBREVIATIONS_NYTUD_PATH = _MODULE_PATH + "/data/tokenizer/abbreviations_nytud-hu.txt" +_STOPWORDS_PATH = _MODULE_PATH + "/data/stopwords.txt" + +STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH)) + +TOKENIZER_PREFIXES = map(re.escape, r''' +, +" +( +[ +{ +* +< +> +$ +£ +„ +“ +' +`` +` +# +US$ +C$ +A$ +‘ +.... +... +‚ +» +_ +§ +'''.strip().split('\n')) + +TOKENIZER_SUFFIXES = r''' +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +“ +« +_ +'' +’ +‘ +° +€ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-züóőúéáűíAÜÓŐÚÉÁŰÍ)\]"'´«‘’%\)²“”])\. +\-\- +´ +(?<=[0-9])km² +(?<=[0-9])m² +(?<=[0-9])cm² +(?<=[0-9])mm² +(?<=[0-9])km³ +(?<=[0-9])m³ +(?<=[0-9])cm³ +(?<=[0-9])mm³ +(?<=[0-9])ha +(?<=[0-9])km +(?<=[0-9])m +(?<=[0-9])cm +(?<=[0-9])mm +(?<=[0-9])µm +(?<=[0-9])nm +(?<=[0-9])yd +(?<=[0-9])in +(?<=[0-9])ft +(?<=[0-9])kg +(?<=[0-9])g +(?<=[0-9])mg +(?<=[0-9])µg +(?<=[0-9])t +(?<=[0-9])lb +(?<=[0-9])oz +(?<=[0-9])m/s +(?<=[0-9])km/h +(?<=[0-9])mph +(?<=[0-9])°C +(?<=[0-9])°K +(?<=[0-9])°F +(?<=[0-9])hPa +(?<=[0-9])Pa +(?<=[0-9])mbar +(?<=[0-9])mb +(?<=[0-9])T +(?<=[0-9])G +(?<=[0-9])M +(?<=[0-9])K +(?<=[0-9])kb +'''.strip().split('\n') + +TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) ''' + r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) ''' + r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split() + +ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in + _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)} + +TOKENIZER_EXCEPTIONS = { + "vs.": [{"F": "vs."}], + + "''": [{"F": "''"}], + "—": [{"F": "—", "L": "--", "pos": "$,"}], + + ":)": [{"F": ":)"}], + "<3": [{"F": "<3"}], + ";)": [{"F": ";)"}], + "(:": [{"F": "(:"}], + ":(": [{"F": ":("}], + "-_-": [{"F": "-_-"}], + "=)": [{"F": "=)"}], + ":/": [{"F": ":/"}], + ":>": [{"F": ":>"}], + ";-)": [{"F": ";-)"}], + ":Y": [{"F": ":Y"}], + ":P": [{"F": ":P"}], + ":-P": [{"F": ":-P"}], + ":3": [{"F": ":3"}], + "=3": [{"F": "=3"}], + "xD": [{"F": "xD"}], + "^_^": [{"F": "^_^"}], + "=]": [{"F": "=]"}], + "=D": [{"F": "=D"}], + "<333": [{"F": "<333"}], + ":))": [{"F": ":))"}], + ":0": [{"F": ":0"}], + "-__-": [{"F": "-__-"}], + "xDD": [{"F": "xDD"}], + "o_o": [{"F": "o_o"}], + "o_O": [{"F": "o_O"}], + "V_V": [{"F": "V_V"}], + "=[[": [{"F": "=[["}], + "<33": [{"F": "<33"}], + ";p": [{"F": ";p"}], + ";D": [{"F": ";D"}], + ";-p": [{"F": ";-p"}], + ";(": [{"F": ";("}], + ":p": [{"F": ":p"}], + ":]": [{"F": ":]"}], + ":O": [{"F": ":O"}], + ":-/": [{"F": ":-/"}], + ":-)": [{"F": ":-)"}], + ":(((": [{"F": ":((("}], + ":((": [{"F": ":(("}], + ":')": [{"F": ":')"}], + "(^_^)": [{"F": "(^_^)"}], + "(=": [{"F": "(="}], + "o.O": [{"F": 
"o.O"}], + "\")": [{"F": "\")"}], + + "a.": [{"F": "a."}], + "b.": [{"F": "b."}], + "c.": [{"F": "c."}], + "d.": [{"F": "d."}], + "e.": [{"F": "e."}], + "f.": [{"F": "f."}], + "g.": [{"F": "g."}], + "h.": [{"F": "h."}], + "i.": [{"F": "i."}], + "j.": [{"F": "j."}], + "k.": [{"F": "k."}], + "l.": [{"F": "l."}], + "m.": [{"F": "m."}], + "n.": [{"F": "n."}], + "o.": [{"F": "o."}], + "p.": [{"F": "p."}], + "q.": [{"F": "q."}], + "r.": [{"F": "r."}], + "s.": [{"F": "s."}], + "t.": [{"F": "t."}], + "u.": [{"F": "u."}], + "v.": [{"F": "v."}], + "w.": [{"F": "w."}], + "x.": [{"F": "x."}], + "y.": [{"F": "y."}], + "z.": [{"F": "z."}], +} + +TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) + +TAG_MAP = { + "$(": {"pos": "PUNCT", "PunctType": "Brck"}, + "$,": {"pos": "PUNCT", "PunctType": "Comm"}, + "$.": {"pos": "PUNCT", "PunctType": "Peri"}, + "ADJA": {"pos": "ADJ"}, + "ADJD": {"pos": "ADJ", "Variant": "Short"}, + "ADV": {"pos": "ADV"}, + "APPO": {"pos": "ADP", "AdpType": "Post"}, + "APPR": {"pos": "ADP", "AdpType": "Prep"}, + "APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, + "APZR": {"pos": "ADP", "AdpType": "Circ"}, + "ART": {"pos": "DET", "PronType": "Art"}, + "CARD": {"pos": "NUM", "NumType": "Card"}, + "FM": {"pos": "X", "Foreign": "Yes"}, + "ITJ": {"pos": "INTJ"}, + "KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, + "KON": {"pos": "CONJ"}, + "KOUI": {"pos": "SCONJ"}, + "KOUS": {"pos": "SCONJ"}, + "NE": {"pos": "PROPN"}, + "NNE": {"pos": "PROPN"}, + "NN": {"pos": "NOUN"}, + "PAV": {"pos": "ADV", "PronType": "Dem"}, + "PROAV": {"pos": "ADV", "PronType": "Dem"}, + "PDAT": {"pos": "DET", "PronType": "Dem"}, + "PDS": {"pos": "PRON", "PronType": "Dem"}, + "PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, + "PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, + "PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, + "PPER": {"pos": "PRON", "PronType": "Prs"}, + "PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, + "PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, + "PRELAT": {"pos": "DET", "PronType": "Rel"}, + "PRELS": {"pos": "PRON", "PronType": "Rel"}, + "PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, + "PTKA": {"pos": "PART"}, + "PTKANT": {"pos": "PART", "PartType": "Res"}, + "PTKNEG": {"pos": "PART", "Negative": "Neg"}, + "PTKVZ": {"pos": "PART", "PartType": "Vbp"}, + "PTKZU": {"pos": "PART", "PartType": "Inf"}, + "PWAT": {"pos": "DET", "PronType": "Int"}, + "PWAV": {"pos": "ADV", "PronType": "Int"}, + "PWS": {"pos": "PRON", "PronType": "Int"}, + "TRUNC": {"pos": "X", "Hyph": "Yes"}, + "VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, + "VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, + "VAINF": {"pos": "AUX", "VerbForm": "Inf"}, + "VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, + "VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, + "VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, + "VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, + "VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, + "VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, + "VVINF": {"pos": "VERB", "VerbForm": "Inf"}, + "VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, + "VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, + "XY": {"pos": "X"}, + "SP": {"pos": "SPACE"} +} diff --git a/spacy/tests/hu/__init__.py b/spacy/tests/hu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/hu/test_tokenizer.py 
b/spacy/tests/hu/test_tokenizer.py new file mode 100644 index 000000000..f1d0124c3 --- /dev/null +++ b/spacy/tests/hu/test_tokenizer.py @@ -0,0 +1,21 @@ +import pytest + +from spacy.hu import Hungarian + + +@pytest.fixture(scope="session") +def HU(): + return Hungarian() + + +@pytest.fixture(scope="module") +def hu_tokenizer(HU): + return HU.tokenizer + + +def test_abbreviations(hu_tokenizer): + tokens = hu_tokenizer("A vs. egy") + assert len(tokens) == 3 + + tokens = hu_tokenizer("A dr. egy") + assert len(tokens) == 3
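
A minimal usage sketch (not part of the patch), assuming the change above is applied to a local spaCy checkout and that the abbreviation files referenced by language_data.py (data/tokenizer/abbreviations_orig-hu.txt and abbreviations_nytud-hu.txt, which are not included in this diff) are present; the sample sentence and the expected token split are illustrative assumptions based on the rules added here:

    # Illustrative only: exercise the new Hungarian tokenizer the same way
    # the test above does, via the callable `tokenizer` attribute.
    from spacy.hu import Hungarian

    hu = Hungarian()
    tokens = hu.tokenizer("A vs. egy mondat.")

    # "vs." stays a single token via TOKENIZER_EXCEPTIONS, while the final
    # period is split off by the suffix rules, so we expect something like:
    # ['A', 'vs.', 'egy', 'mondat', '.']
    print([t.orth_ for t in tokens])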