First steps towards the Hungarian tokenizer code.

Gyorgy Orosz 2016-12-07 23:07:43 +01:00
parent 5ad5408242
commit 5b00039955
6 changed files with 537 additions and 0 deletions

spacy/__init__.py

@@ -1,5 +1,6 @@
import pathlib
from spacy import hu
from .util import set_lang_class, get_lang_class
from .about import __version__
@@ -24,6 +25,7 @@ set_lang_class(es.Spanish.lang, es.Spanish)
set_lang_class(pt.Portuguese.lang, pt.Portuguese)
set_lang_class(fr.French.lang, fr.French)
set_lang_class(it.Italian.lang, it.Italian)
set_lang_class(hu.Hungarian.lang, hu.Hungarian)
set_lang_class(zh.Chinese.lang, zh.Chinese)
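A sketch (not in this commit) of what the registration above enables: the Hungarian class can be fetched back through the registry helpers imported at the top of the file, assuming get_lang_class does a plain lookup by the same 'hu' code passed to set_lang_class.

    import spacy

    Hungarian = spacy.get_lang_class('hu')        # assumed plain registry lookup
    tokens = Hungarian().tokenizer(u'A vs. egy')  # mirrors the test added below
    assert len(tokens) == 3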

spacy/hu/__init__.py  (new file, 24 lines added)

@@ -0,0 +1,24 @@
from __future__ import unicode_literals, print_function

from . import language_data
from ..attrs import LANG
from ..language import Language


class Hungarian(Language):
    lang = 'hu'

    class Defaults(Language.Defaults):
        tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS)

        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
        lex_attr_getters[LANG] = lambda text: 'hu'

        prefixes = tuple(language_data.TOKENIZER_PREFIXES)
        suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
        infixes = tuple(language_data.TOKENIZER_INFIXES)

        tag_map = dict(language_data.TAG_MAP)
        stop_words = set(language_data.STOP_WORDS)
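A sketch (not in this commit) of how the Defaults above are used: they wire the tables from language_data into the rule-based tokenizer, so a bare Hungarian() instance can already tokenize, following the pattern of the test file at the end of this diff.

    from spacy.hu import Hungarian

    nlp = Hungarian()                  # builds Defaults, including the tokenizer
    tokens = nlp.tokenizer(u'A dr. egy')
    print([t.orth_ for t in tokens])   # expected ['A', 'dr.', 'egy'] per the tests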

spacy/hu/data/stopwords.txt  (new file, 219 lines added)

@@ -0,0 +1,219 @@
a
abban
ahhoz
ahogy
ahol
aki
akik
akkor
akár
alatt
amely
amelyek
amelyekben
amelyeket
amelyet
amelynek
ami
amikor
amit
amolyan
amíg
annak
arra
arról
az
azok
azon
azonban
azt
aztán
azután
azzal
azért
be
belül
benne
bár
cikk
cikkek
cikkeket
csak
de
e
ebben
eddig
egy
egyes
egyetlen
egyik
egyre
egyéb
egész
ehhez
ekkor
el
ellen
elo
eloször
elott
elso
elég
előtt
emilyen
ennek
erre
ez
ezek
ezen
ezt
ezzel
ezért
fel
felé
ha
hanem
hiszen
hogy
hogyan
hát
ide
igen
ill
ill.
illetve
ilyen
ilyenkor
inkább
is
ismét
ison
itt
jobban
jól
kell
kellett
keressünk
keresztül
ki
kívül
között
közül
le
legalább
legyen
lehet
lehetett
lenne
lenni
lesz
lett
ma
maga
magát
majd
meg
mellett
mely
melyek
mert
mi
miatt
mikor
milyen
minden
mindenki
mindent
mindig
mint
mintha
mit
mivel
miért
mondta
most
már
más
másik
még
míg
nagy
nagyobb
nagyon
ne
nekem
neki
nem
nincs
néha
néhány
nélkül
o
oda
ok
oket
olyan
ott
pedig
persze
például
s
saját
sem
semmi
sok
sokat
sokkal
stb.
szemben
szerint
szinte
számára
szét
talán
te
tehát
teljes
ti
tovább
továbbá
több
túl
ugyanis
utolsó
után
utána
vagy
vagyis
vagyok
valaki
valami
valamint
való
van
vannak
vele
vissza
viszont
volna
volt
voltak
voltam
voltunk
által
általában
át
én
éppen
és
így
ön
össze
úgy
új
újabb
újra
ő
őket

spacy/hu/language_data.py  (new file, 271 lines added)

@@ -0,0 +1,271 @@
# encoding: utf8
from __future__ import unicode_literals
import os
import re
def _load_txt_data(*file_paths):
    for path in file_paths:
        with open(path) as f:
            for line in f.readlines():
                if not line.strip().startswith("#"):
                    yield line.strip()
_MODULE_PATH = os.path.dirname(__file__)
_ABBREVIATIONS_ORIG_PATH = _MODULE_PATH + "/data/tokenizer/abbreviations_orig-hu.txt"
_ABBREVIATIONS_NYTUD_PATH = _MODULE_PATH + "/data/tokenizer/abbreviations_nytud-hu.txt"
_STOPWORDS_PATH = _MODULE_PATH + "/data/stopwords.txt"
STOP_WORDS = set(_load_txt_data(_STOPWORDS_PATH))
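A small illustrative check, not part of the commit: _load_txt_data streams the non-comment lines of one or more data files, stripped of surrounding whitespace, and STOP_WORDS materialises the list shipped in data/stopwords.txt. The temporary file below is hypothetical; only the behaviour is taken from the code above.

    import tempfile
    from spacy.hu.language_data import _load_txt_data, STOP_WORDS

    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
        tmp.write('# a comment, skipped\n a \nabban\n')

    assert list(_load_txt_data(tmp.name)) == ['a', 'abban']
    assert 'abban' in STOP_WORDS   # 'abban' appears in data/stopwords.txt above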
TOKENIZER_PREFIXES = map(re.escape, r'''
,
"
(
[
{
*
<
>
$
£
'
``
`
#
US$
C$
A$
....
...
»
_
§
'''.strip().split('\n'))
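A quick sketch, not part of the commit: the prefix table is the literal character list above passed through re.escape, so regex metacharacters are matched verbatim. Note that under Python 3 map() returns a lazy iterator, so the tuple(...) call in Defaults is what materialises it.

    import re

    print([re.escape(p) for p in ['(', '[', '$', '...']])
    # -> ['\\(', '\\[', '\\$', '\\.\\.\\.']  (exact escaping varies slightly by Python version)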
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-züóőúéáűíAÜÓŐÚÉÁŰÍ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
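An illustration, not part of the commit: most suffix entries are lookbehind fragments that peel measurement units and trailing punctuation off the preceding token. Combining the fragments into the tokenizer's single suffix regex is spaCy's job; this only shows how one raw fragment behaves.

    import re

    unit = re.compile(r'(?<=[0-9])km')    # one of the fragments listed above
    match = unit.search(u'25km')
    print(match.group(0), match.start())  # -> km 2, i.e. '25km' splits as '25' + 'km'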
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
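The infix patterns cover ellipses, a period glued between a lowercase and an uppercase letter, hyphens and double hyphens between letters, hyphens between digits, and commas between letters. An illustration, not part of the commit, using the letter-hyphen-letter fragment copied verbatim from the list above:

    import re

    infix = re.compile(r'(?<=[a-zA-Z])-(?=[a-zA-z])')  # verbatim from the list above
    print(bool(infix.search(u'egy-két')))              # True: the tokenizer can split around '-'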
ABBREVIATIONS = {abbrev: [{"F": abbrev}] for abbrev in
                 _load_txt_data(_ABBREVIATIONS_ORIG_PATH, _ABBREVIATIONS_NYTUD_PATH)}
TOKENIZER_EXCEPTIONS = {
"vs.": [{"F": "vs."}],
"''": [{"F": "''"}],
"": [{"F": "", "L": "--", "pos": "$,"}],
":)": [{"F": ":)"}],
"<3": [{"F": "<3"}],
";)": [{"F": ";)"}],
"(:": [{"F": "(:"}],
":(": [{"F": ":("}],
"-_-": [{"F": "-_-"}],
"=)": [{"F": "=)"}],
":/": [{"F": ":/"}],
":>": [{"F": ":>"}],
";-)": [{"F": ";-)"}],
":Y": [{"F": ":Y"}],
":P": [{"F": ":P"}],
":-P": [{"F": ":-P"}],
":3": [{"F": ":3"}],
"=3": [{"F": "=3"}],
"xD": [{"F": "xD"}],
"^_^": [{"F": "^_^"}],
"=]": [{"F": "=]"}],
"=D": [{"F": "=D"}],
"<333": [{"F": "<333"}],
":))": [{"F": ":))"}],
":0": [{"F": ":0"}],
"-__-": [{"F": "-__-"}],
"xDD": [{"F": "xDD"}],
"o_o": [{"F": "o_o"}],
"o_O": [{"F": "o_O"}],
"V_V": [{"F": "V_V"}],
"=[[": [{"F": "=[["}],
"<33": [{"F": "<33"}],
";p": [{"F": ";p"}],
";D": [{"F": ";D"}],
";-p": [{"F": ";-p"}],
";(": [{"F": ";("}],
":p": [{"F": ":p"}],
":]": [{"F": ":]"}],
":O": [{"F": ":O"}],
":-/": [{"F": ":-/"}],
":-)": [{"F": ":-)"}],
":(((": [{"F": ":((("}],
":((": [{"F": ":(("}],
":')": [{"F": ":')"}],
"(^_^)": [{"F": "(^_^)"}],
"(=": [{"F": "(="}],
"o.O": [{"F": "o.O"}],
"\")": [{"F": "\")"}],
"a.": [{"F": "a."}],
"b.": [{"F": "b."}],
"c.": [{"F": "c."}],
"d.": [{"F": "d."}],
"e.": [{"F": "e."}],
"f.": [{"F": "f."}],
"g.": [{"F": "g."}],
"h.": [{"F": "h."}],
"i.": [{"F": "i."}],
"j.": [{"F": "j."}],
"k.": [{"F": "k."}],
"l.": [{"F": "l."}],
"m.": [{"F": "m."}],
"n.": [{"F": "n."}],
"o.": [{"F": "o."}],
"p.": [{"F": "p."}],
"q.": [{"F": "q."}],
"r.": [{"F": "r."}],
"s.": [{"F": "s."}],
"t.": [{"F": "t."}],
"u.": [{"F": "u."}],
"v.": [{"F": "v."}],
"w.": [{"F": "w."}],
"x.": [{"F": "x."}],
"y.": [{"F": "y."}],
"z.": [{"F": "z."}],
}
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
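A sketch of the resulting lookups, not part of the commit: every abbreviation read from the two data files becomes a one-token exception whose "F" (form) is the abbreviation itself, merged into TOKENIZER_EXCEPTIONS alongside the hand-written emoticon and single-letter entries. The "dr." key assumes that abbreviation is listed in one of the data files, which the test below relies on.

    from spacy.hu.language_data import TOKENIZER_EXCEPTIONS

    assert TOKENIZER_EXCEPTIONS['vs.'] == [{"F": "vs."}]
    assert TOKENIZER_EXCEPTIONS['dr.'] == [{"F": "dr."}]  # assumes 'dr.' is in the abbreviation data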
TAG_MAP = {
"$(": {"pos": "PUNCT", "PunctType": "Brck"},
"$,": {"pos": "PUNCT", "PunctType": "Comm"},
"$.": {"pos": "PUNCT", "PunctType": "Peri"},
"ADJA": {"pos": "ADJ"},
"ADJD": {"pos": "ADJ", "Variant": "Short"},
"ADV": {"pos": "ADV"},
"APPO": {"pos": "ADP", "AdpType": "Post"},
"APPR": {"pos": "ADP", "AdpType": "Prep"},
"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"},
"APZR": {"pos": "ADP", "AdpType": "Circ"},
"ART": {"pos": "DET", "PronType": "Art"},
"CARD": {"pos": "NUM", "NumType": "Card"},
"FM": {"pos": "X", "Foreign": "Yes"},
"ITJ": {"pos": "INTJ"},
"KOKOM": {"pos": "CONJ", "ConjType": "Comp"},
"KON": {"pos": "CONJ"},
"KOUI": {"pos": "SCONJ"},
"KOUS": {"pos": "SCONJ"},
"NE": {"pos": "PROPN"},
"NNE": {"pos": "PROPN"},
"NN": {"pos": "NOUN"},
"PAV": {"pos": "ADV", "PronType": "Dem"},
"PROAV": {"pos": "ADV", "PronType": "Dem"},
"PDAT": {"pos": "DET", "PronType": "Dem"},
"PDS": {"pos": "PRON", "PronType": "Dem"},
"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"},
"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"},
"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"},
"PPER": {"pos": "PRON", "PronType": "Prs"},
"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"},
"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"},
"PRELAT": {"pos": "DET", "PronType": "Rel"},
"PRELS": {"pos": "PRON", "PronType": "Rel"},
"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"},
"PTKA": {"pos": "PART"},
"PTKANT": {"pos": "PART", "PartType": "Res"},
"PTKNEG": {"pos": "PART", "Negative": "Neg"},
"PTKVZ": {"pos": "PART", "PartType": "Vbp"},
"PTKZU": {"pos": "PART", "PartType": "Inf"},
"PWAT": {"pos": "DET", "PronType": "Int"},
"PWAV": {"pos": "ADV", "PronType": "Int"},
"PWS": {"pos": "PRON", "PronType": "Int"},
"TRUNC": {"pos": "X", "Hyph": "Yes"},
"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"},
"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"},
"VAINF": {"pos": "AUX", "VerbForm": "Inf"},
"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"},
"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"},
"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"},
"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"},
"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"},
"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"},
"VVINF": {"pos": "VERB", "VerbForm": "Inf"},
"VVIZU": {"pos": "VERB", "VerbForm": "Inf"},
"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"},
"XY": {"pos": "X"},
"SP": {"pos": "SPACE"}
}
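Two sample lookups, not part of the commit: the tag map pairs each fine-grained tag (the names look like the STTS-style set spaCy already ships for German) with a coarse POS and optional morphological features.

    from spacy.hu.language_data import TAG_MAP

    assert TAG_MAP['NN']['pos'] == 'NOUN'
    assert TAG_MAP['$,'] == {'pos': 'PUNCT', 'PunctType': 'Comm'}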


@@ -0,0 +1,21 @@
import pytest

from spacy.hu import Hungarian


@pytest.fixture(scope="session")
def HU():
    return Hungarian()


@pytest.fixture(scope="module")
def hu_tokenizer(HU):
    return HU.tokenizer


def test_abbreviations(hu_tokenizer):
    tokens = hu_tokenizer("A vs. egy")
    assert len(tokens) == 3

    tokens = hu_tokenizer("A dr. egy")
    assert len(tokens) == 3
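The fixtures build one Hungarian pipeline per test session and hand its tokenizer to the tests; both sentences are expected to come back as three tokens, i.e. "vs." and "dr." survive as single tokens. A sketch, not part of the commit, of invoking just these tests (the test file's path is not shown in this view, so the selector goes by test name):

    import pytest

    pytest.main(['-k', 'abbreviations'])  # runs test_abbreviations via pytest's programmatic entry point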