From 7bfe2d4abcbd3ec60935514ecf9f12e470562fab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 8 Dec 2016 20:41:41 +0100 Subject: [PATCH] Update Portuguese language data --- spacy/pt/__init__.py | 38 ++-- spacy/pt/language_data.py | 369 +++----------------------------------- 2 files changed, 48 insertions(+), 359 deletions(-) diff --git a/spacy/pt/__init__.py b/spacy/pt/__init__.py index a991ef7ae..33c7b3aa7 100644 --- a/spacy/pt/__init__.py +++ b/spacy/pt/__init__.py @@ -3,25 +3,37 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language -from ..attrs import LANG from . import language_data +from ..attrs import LANG +from ..util import update_exc + +from ..language_data import EMOTICONS +from .language_data import ORTH_ONLY +from .language_data import strings_to_exc + + +TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) +TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) +TOKENIZER_SUFFIXES = tuple(language_data.TOKENIZER_SUFFIXES) +TOKENIZER_INFIXES = tuple(language_data.TOKENIZER_INFIXES) +TAG_MAP = dict(language_data.TAG_MAP) +STOP_WORDS = set(language_data.STOP_WORDS) + + +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(EMOTICONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) class Portuguese(Language): lang = 'pt' - + class Defaults(Language.Defaults): - tokenizer_exceptions = dict(language_data.TOKENIZER_EXCEPTIONS) lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'pt' - - prefixes = tuple(language_data.TOKENIZER_PREFIXES) - - suffixes = tuple(language_data.TOKENIZER_SUFFIXES) - - infixes = tuple(language_data.TOKENIZER_INFIXES) - - tag_map = dict(language_data.TAG_MAP) - - stop_words = set(language_data.STOP_WORDS) + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + prefixes = TOKENIZER_PREFIXES + suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES + tag_map = TAG_MAP + stop_words = STOP_WORDS diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index 417cd9828..c4c45c027 100644 --- a/spacy/pt/language_data.py +++ b/spacy/pt/language_data.py @@ -1,356 +1,33 @@ # encoding: utf8 from __future__ import unicode_literals -import re + +from ..symbols import * +from ..language_data import TOKENIZER_PREFIXES +from ..language_data import TOKENIZER_SUFFIXES +from ..language_data import TOKENIZER_INFIXES -STOP_WORDS = set() +def strings_to_exc(orths): + return {orth: [{ORTH: orth}] for orth in orths} -TOKENIZER_PREFIXES = map(re.escape, r''' -, -" -( -[ -{ -* -< -> -$ -£ -„ -“ -' -`` -` -# -US$ -C$ -A$ -a- -‘ -.... -... -‚ -» -_ -§ -'''.strip().split('\n')) - - -TOKENIZER_SUFFIXES = r''' -, -\" -\) -\] -\} -\* -\! -\? -% -\$ -> -: -; -' -” -“ -« -_ -'' -'s -'S -’s -’S -’ -‘ -° -€ -\.\. -\.\.\. -\.\.\.\. -(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\. -\-\- -´ -(?<=[0-9])km² -(?<=[0-9])m² -(?<=[0-9])cm² -(?<=[0-9])mm² -(?<=[0-9])km³ -(?<=[0-9])m³ -(?<=[0-9])cm³ -(?<=[0-9])mm³ -(?<=[0-9])ha -(?<=[0-9])km -(?<=[0-9])m -(?<=[0-9])cm -(?<=[0-9])mm -(?<=[0-9])µm -(?<=[0-9])nm -(?<=[0-9])yd -(?<=[0-9])in -(?<=[0-9])ft -(?<=[0-9])kg -(?<=[0-9])g -(?<=[0-9])mg -(?<=[0-9])µg -(?<=[0-9])t -(?<=[0-9])lb -(?<=[0-9])oz -(?<=[0-9])m/s -(?<=[0-9])km/h -(?<=[0-9])mph -(?<=[0-9])°C -(?<=[0-9])°K -(?<=[0-9])°F -(?<=[0-9])hPa -(?<=[0-9])Pa -(?<=[0-9])mbar -(?<=[0-9])mb -(?<=[0-9])T -(?<=[0-9])G -(?<=[0-9])M -(?<=[0-9])K -(?<=[0-9])kb -'''.strip().split('\n') - - -TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) ''' - r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) ''' - r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split() - - - -TOKENIZER_EXCEPTIONS = { - "vs.": [{"F": "vs."}], - - "''": [{"F": "''"}], - "—": [{"F": "—", "L": "--", "pos": "$,"}], - - "a.m.": [{"F": "a.m."}], - "p.m.": [{"F": "p.m."}], - - "1a.m.": [{"F": "1"}, {"F": "a.m."}], - "2a.m.": [{"F": "2"}, {"F": "a.m."}], - "3a.m.": [{"F": "3"}, {"F": "a.m."}], - "4a.m.": [{"F": "4"}, {"F": "a.m."}], - "5a.m.": [{"F": "5"}, {"F": "a.m."}], - "6a.m.": [{"F": "6"}, {"F": "a.m."}], - "7a.m.": [{"F": "7"}, {"F": "a.m."}], - "8a.m.": [{"F": "8"}, {"F": "a.m."}], - "9a.m.": [{"F": "9"}, {"F": "a.m."}], - "10a.m.": [{"F": "10"}, {"F": "a.m."}], - "11a.m.": [{"F": "11"}, {"F": "a.m."}], - "12a.m.": [{"F": "12"}, {"F": "a.m."}], - "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], - "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], - "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], - "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], - "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], - "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], - "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], - "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], - "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], - "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], - "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], - "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], - - "p.m.": [{"F": "p.m."}], - "1p.m.": [{"F": "1"}, {"F": "p.m."}], - "2p.m.": [{"F": "2"}, {"F": "p.m."}], - "3p.m.": [{"F": "3"}, {"F": "p.m."}], - "4p.m.": [{"F": "4"}, {"F": "p.m."}], - "5p.m.": [{"F": "5"}, {"F": "p.m."}], - "6p.m.": [{"F": "6"}, {"F": "p.m."}], - "7p.m.": [{"F": "7"}, {"F": "p.m."}], - "8p.m.": [{"F": "8"}, {"F": "p.m."}], - "9p.m.": [{"F": "9"}, {"F": "p.m."}], - "10p.m.": [{"F": "10"}, {"F": "p.m."}], - "11p.m.": [{"F": "11"}, {"F": "p.m."}], - "12p.m.": [{"F": "12"}, {"F": "p.m."}], - "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], - "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], - "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], - "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], - "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], - "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], - "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], - "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], - "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], - "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], - "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], - "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], - - "Ala.": [{"F": "Ala."}], - "Ariz.": [{"F": "Ariz."}], - "Ark.": [{"F": "Ark."}], - "Calif.": [{"F": "Calif."}], - "Colo.": [{"F": "Colo."}], - "Conn.": [{"F": "Conn."}], - "Del.": [{"F": "Del."}], - "D.C.": [{"F": "D.C."}], - "Fla.": [{"F": "Fla."}], - "Ga.": [{"F": "Ga."}], - "Ill.": [{"F": "Ill."}], - "Ind.": [{"F": "Ind."}], - "Kans.": [{"F": "Kans."}], - "Kan.": [{"F": "Kan."}], - "Ky.": [{"F": "Ky."}], - "La.": [{"F": "La."}], - "Md.": [{"F": "Md."}], - "Mass.": [{"F": "Mass."}], - "Mich.": [{"F": "Mich."}], - "Minn.": [{"F": "Minn."}], - "Miss.": [{"F": "Miss."}], - "Mo.": [{"F": "Mo."}], - "Mont.": [{"F": "Mont."}], - "Nebr.": [{"F": "Nebr."}], - "Neb.": [{"F": "Neb."}], - "Nev.": [{"F": "Nev."}], - "N.H.": [{"F": "N.H."}], - "N.J.": [{"F": "N.J."}], - "N.M.": [{"F": "N.M."}], - "N.Y.": [{"F": "N.Y."}], - "N.C.": [{"F": "N.C."}], - "N.D.": [{"F": "N.D."}], - "Okla.": [{"F": "Okla."}], - "Ore.": [{"F": "Ore."}], - "Pa.": [{"F": "Pa."}], - "Tenn.": [{"F": "Tenn."}], - "Va.": [{"F": "Va."}], - "Wash.": [{"F": "Wash."}], - "Wis.": [{"F": "Wis."}], - - ":)": [{"F": ":)"}], - "<3": [{"F": "<3"}], - ";)": [{"F": ";)"}], - "(:": [{"F": "(:"}], - ":(": [{"F": ":("}], - "-_-": [{"F": "-_-"}], - "=)": [{"F": "=)"}], - ":/": [{"F": ":/"}], - ":>": [{"F": ":>"}], - ";-)": [{"F": ";-)"}], - ":Y": [{"F": ":Y"}], - ":P": [{"F": ":P"}], - ":-P": [{"F": ":-P"}], - ":3": [{"F": ":3"}], - "=3": [{"F": "=3"}], - "xD": [{"F": "xD"}], - "^_^": [{"F": "^_^"}], - "=]": [{"F": "=]"}], - "=D": [{"F": "=D"}], - "<333": [{"F": "<333"}], - ":))": [{"F": ":))"}], - ":0": [{"F": ":0"}], - "-__-": [{"F": "-__-"}], - "xDD": [{"F": "xDD"}], - "o_o": [{"F": "o_o"}], - "o_O": [{"F": "o_O"}], - "V_V": [{"F": "V_V"}], - "=[[": [{"F": "=[["}], - "<33": [{"F": "<33"}], - ";p": [{"F": ";p"}], - ";D": [{"F": ";D"}], - ";-p": [{"F": ";-p"}], - ";(": [{"F": ";("}], - ":p": [{"F": ":p"}], - ":]": [{"F": ":]"}], - ":O": [{"F": ":O"}], - ":-/": [{"F": ":-/"}], - ":-)": [{"F": ":-)"}], - ":(((": [{"F": ":((("}], - ":((": [{"F": ":(("}], - ":')": [{"F": ":')"}], - "(^_^)": [{"F": "(^_^)"}], - "(=": [{"F": "(="}], - "o.O": [{"F": "o.O"}], - "\")": [{"F": "\")"}], - - "a.": [{"F": "a."}], - "b.": [{"F": "b."}], - "c.": [{"F": "c."}], - "d.": [{"F": "d."}], - "e.": [{"F": "e."}], - "f.": [{"F": "f."}], - "g.": [{"F": "g."}], - "h.": [{"F": "h."}], - "i.": [{"F": "i."}], - "j.": [{"F": "j."}], - "k.": [{"F": "k."}], - "l.": [{"F": "l."}], - "m.": [{"F": "m."}], - "n.": [{"F": "n."}], - "o.": [{"F": "o."}], - "p.": [{"F": "p."}], - "q.": [{"F": "q."}], - "r.": [{"F": "r."}], - "s.": [{"F": "s."}], - "t.": [{"F": "t."}], - "u.": [{"F": "u."}], - "v.": [{"F": "v."}], - "w.": [{"F": "w."}], - "x.": [{"F": "x."}], - "y.": [{"F": "y."}], - "z.": [{"F": "z."}], -} +PRON_LEMMA = "-PRON-" TAG_MAP = { -"$(": {"pos": "PUNCT", "PunctType": "Brck"}, -"$,": {"pos": "PUNCT", "PunctType": "Comm"}, -"$.": {"pos": "PUNCT", "PunctType": "Peri"}, -"ADJA": {"pos": "ADJ"}, -"ADJD": {"pos": "ADJ", "Variant": "Short"}, -"ADV": {"pos": "ADV"}, -"APPO": {"pos": "ADP", "AdpType": "Post"}, -"APPR": {"pos": "ADP", "AdpType": "Prep"}, -"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, -"APZR": {"pos": "ADP", "AdpType": "Circ"}, -"ART": {"pos": "DET", "PronType": "Art"}, -"CARD": {"pos": "NUM", "NumType": "Card"}, -"FM": {"pos": "X", "Foreign": "Yes"}, -"ITJ": {"pos": "INTJ"}, -"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, -"KON": {"pos": "CONJ"}, -"KOUI": {"pos": "SCONJ"}, -"KOUS": {"pos": "SCONJ"}, -"NE": {"pos": "PROPN"}, -"NNE": {"pos": "PROPN"}, -"NN": {"pos": "NOUN"}, -"PAV": {"pos": "ADV", "PronType": "Dem"}, -"PROAV": {"pos": "ADV", "PronType": "Dem"}, -"PDAT": {"pos": "DET", "PronType": "Dem"}, -"PDS": {"pos": "PRON", "PronType": "Dem"}, -"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, -"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, -"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, -"PPER": {"pos": "PRON", "PronType": "Prs"}, -"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, -"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, -"PRELAT": {"pos": "DET", "PronType": "Rel"}, -"PRELS": {"pos": "PRON", "PronType": "Rel"}, -"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, -"PTKA": {"pos": "PART"}, -"PTKANT": {"pos": "PART", "PartType": "Res"}, -"PTKNEG": {"pos": "PART", "Negative": "Neg"}, -"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, -"PTKZU": {"pos": "PART", "PartType": "Inf"}, -"PWAT": {"pos": "DET", "PronType": "Int"}, -"PWAV": {"pos": "ADV", "PronType": "Int"}, -"PWS": {"pos": "PRON", "PronType": "Int"}, -"TRUNC": {"pos": "X", "Hyph": "Yes"}, -"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, -"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, -"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, -"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, -"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, -"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, -"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, -"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, -"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, -"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, -"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, -"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, -"XY": {"pos": "X"}, -"SP": {"pos": "SPACE"} + +} + +STOP_WORDS = set(""" + +""".split()) + + +TOKENIZER_EXCEPTIONS = { + +} + + +ORTH_ONLY = { + }