From 7558f1aeefe1a821314d857c73401357202ad342 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 22 Nov 2016 21:27:52 +0100
Subject: [PATCH 1/5] Ignore temporary files and allow Dropbox workaround

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index b8a4a2fec..da7dde60c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -93,6 +93,9 @@ coverage.xml

 # Mac OS X
 *.DS_Store

+# Temporary files / Dropbox hack
+*.~*
+
 # Komodo project files
 *.komodoproject
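A quick sanity check for the new ignore pattern, as a rough sketch: Python's fnmatch glob dialect is close enough to gitignore's for a simple pattern like `*.~*`, and the file names below are made up for illustration.

import fnmatch

# '*.~*' matches any file whose name contains '.~', which covers the
# temporary/conflict files that some editors and Dropbox leave behind.
assert fnmatch.fnmatch('notes.txt.~1~', '*.~*')
assert fnmatch.fnmatch('parser.pyx.~conflict-1', '*.~*')
assert not fnmatch.fnmatch('notes.txt', '*.~*')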
From de747e39e7e0aad3b27896488cd76fcc883d8e62 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Nov 2016 13:51:32 +0100
Subject: [PATCH 2/5] Reformat language data

---
 spacy/de/language_data.py |  10 +--
 spacy/en/language_data.py | 148 ++++++++++++++++++++++++++++----------
 2 files changed, 117 insertions(+), 41 deletions(-)

diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py
index 92ec3e9c7..858bda353 100644
--- a/spacy/de/language_data.py
+++ b/spacy/de/language_data.py
@@ -82,7 +82,7 @@ zurück zusammen zwanzig zwar zwei zweite zweiten zweiter zweites zwischen
 """.split())


-TOKENIZER_PREFIXES = map(re.escape, r'''
+TOKENIZER_PREFIXES = r'''
 ,
 "
 (
@@ -106,11 +106,11 @@ a-
 ‘
 ....
 ...
+…
 ‚
 »
-_
 §
-'''.strip().split('\n'))
+'''.strip().split('\n')


 TOKENIZER_SUFFIXES = r'''
@@ -141,6 +141,7 @@ _
 ‘
 °
 €
+…
 \.\.
 \.\.\.
 \.\.\.\.
@@ -191,7 +192,8 @@ _


 TOKENIZER_INFIXES = r'''
-\.\.\.
+…
+\.\.\.+
 (?<=[a-z])\.(?=[A-Z])
 (?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])
 (?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])
diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
index 52a413ec6..351870fed 100644
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@@ -1,39 +1,57 @@
 # encoding: utf8
 from __future__ import unicode_literals
+import re


 # improved list from Stone, Denis, Kwantes (2010)
 STOP_WORDS = set("""
-a about above across after afterwards again against all almost alone
-along already also although always am among amongst amoungst amount
-an and another any anyhow anyone anything anyway anywhere are around
-as at back be became because become becomes becoming been before
-beforehand behind being below beside besides between beyond bill
-both bottom but by call can cannot cant co computer con could couldnt
-cry de describe detail did didn do does doesn doing don done down due
-during each eg eight either eleven else elsewhere empty enough etc
-even ever every everyone everything everywhere except few fifteen
-fify fill find fire first five for former formerly forty found four
-from front full further get give go had has hasnt have he hence her
-here hereafter hereby herein hereupon hers herself him himself his
-how however hundred i ie if in inc indeed interest into is it its
-itself keep last latter latterly least less ltd just kg km made make
-many may me meanwhile might mill mine more moreover most mostly move
-much must my myself name namely neither never nevertheless next nine
-no nobody none noone nor not nothing now nowhere of off often on once
-one only onto or other others otherwise our ours ourselves out over
-own part per perhaps please put rather re quite rather really regarding
-same say see seem seemed seeming seems serious several she should
-show side since sincere six sixty so some somehow someone something
-sometime sometimes somewhere still such system take ten than that the
-their them themselves then thence there thereafter thereby therefore
-therein thereupon these they thick thin third this those though three
-through throughout thru thus to together too top toward towards twelve
-twenty two un under until up unless upon us used using various very
-very via was we well were what whatever when whence whenever where whereafter
-whereas whereby wherein whereupon wherever whether which while whither
-who whoever whole whom whose why will with within without would yet you
-your yours yourself yourselves
+
+a about above across after afterwards again against all almost alone along already also although always am among amongst amount an and another any anyhow anyone anything anyway anywhere are around as at
+
+back be became because become becomes becoming been before beforehand behind being below beside besides between beyond both bottom but by
+
+call can cannot ca could
+
+did do does doing done down due during
+
+each eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except
+
+few fifteen fifty first five for former formerly forty four from front full further
+
+get give go
+
+had has have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred
+
+i if in inc indeed into is it its itself
+
+keep
+
+last latter latterly least less
+
+just
+
+made make many may me meanwhile might mine more moreover most mostly move much must my myself
+
+name namely neither never nevertheless next nine no nobody none noone nor not nothing now nowhere
+
+of off often on once one only onto or other others otherwise our ours ourselves out over own
+
+part per perhaps please put
+
+quite
+
+rather re really regarding
+
+same say see seem seemed seeming seems serious several she should show side since six sixty so some somehow someone something sometime sometimes somewhere still such
+
+take ten than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they third this those though three through throughout thru thus to together too top toward towards twelve twenty two
+
+under until up unless upon us used using
+
+various very very via was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would
+
+yet you your yours yourself yourselves
+
 """.split())


@@ -98,17 +116,73 @@ TAG_MAP = {
     "HVS": {"pos": "verb"}
 }

-TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... ...'''.split()
+TOKENIZER_PREFIXES = r'''
+,
+"
+(
+[
+{
+*
+<
+$
+£
+“
+'
+``
+`
+#
+US$
+C$
+A$
+€
+a-
+‘
+....
+...
+…
+'''.strip().split('\n')


-TOKENIZER_SUFFIXES = (r''', \" \) \] \} \* \! \? % \$ > : ; ' ” '' 's 'S ’s ’S ’'''
-                      r'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]”"'%\)])\. '''
-                      r'''(?<=[0-9])km''').strip().split()
+TOKENIZER_SUFFIXES = r'''
+,
+\"
+\)
+\]
+\}
+\*
+\!
+\?
+%
+\$
+>
+:
+;
+'
+”
+''
+'s
+'S
+’s
+’S
+’
+…
+\.\.
+\.\.\.
+\.\.\.\.
+(?<=[a-z0-9)\]”"'%\)])\.
+(?<=[0-9])km
+'''.strip().split('\n')


-TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
-                     r'''(?<=[a-zA-Z])--(?=[a-zA-z]) (?<=[0-9])-(?=[0-9]) '''
-                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+TOKENIZER_INFIXES = r'''
+…
+\.\.\.+
+(?<=[a-z])\.(?=[A-Z])
+(?<=[a-zA-Z])-(?=[a-zA-Z])
+(?<=[a-zA-Z])--(?=[a-zA-Z])
+(?<=[0-9])-(?=[0-9])
+(?<=[A-Za-z]),(?=[A-Za-z])
+'''.strip().split('\n')


 TOKENIZER_EXCEPTIONS = {
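The newline-delimited tables above are meant to be compiled into single regular expressions. A minimal sketch of the idea behind the prefix case, assuming each entry is escaped and anchored at the start of the string; spacy.util.compile_prefix_regex (exercised by the test in the next patch) plays this role, though its actual implementation may differ in detail.

import re

# A few entries from the English prefix table above; one prefix per line.
PREFIXES = r'''
,
"
(
[
...
'''.strip().split('\n')

def compile_prefix_regex(entries):
    # Escape each entry so characters like '(' and '[' are matched literally,
    # then join them into one alternation anchored at the start of the string.
    expression = '|'.join('^' + re.escape(piece) for piece in entries)
    return re.compile(expression)

search = compile_prefix_regex(PREFIXES).search
assert search("(can't").group() == '('
assert search('...whoa').group() == '...'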
From 6247c005a27a813f6a2d4bf09a3836353f555241 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Nov 2016 13:51:59 +0100
Subject: [PATCH 3/5] Add test for tokenizer regular expressions

---
 spacy/tests/tokenizer/test_tokenizer.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 9b17a01c6..d4330e3ce 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -7,6 +7,10 @@ import pickle
 import cloudpickle
 import tempfile

+from ... import util
+from ...en.language_data import TOKENIZER_PREFIXES as EN_TOKENIZER_PREFIXES
+
+en_search_prefixes = util.compile_prefix_regex(EN_TOKENIZER_PREFIXES).search

 # @pytest.mark.xfail
 # def test_pickle(en_tokenizer):
@@ -16,6 +20,10 @@ import tempfile
 #     loaded = pickle.load(file_)
 #     assert loaded is not None

+def test_pre_punct_regex():
+    string = "(can't"
+    match = en_search_prefixes(string)
+    assert match.group() == "("

 def test_no_word(en_tokenizer):
     tokens = en_tokenizer(u'')
@@ -57,10 +65,9 @@ def test_contraction(en_tokenizer):
     assert len(tokens) == 5
     assert tokens[4].orth == en_tokenizer.vocab['!'].orth

-
 def test_contraction_punct(en_tokenizer):
-    tokens = en_tokenizer("(can't")
-    assert len(tokens) == 3
+    tokens = [w.text for w in en_tokenizer("(can't")]
+    assert tokens == ['(', 'ca', "n't"]
     tokens = en_tokenizer("`ain't")
     assert len(tokens) == 3
    tokens = en_tokenizer('''"isn't''')
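Because the prefix table is plain data, the new test can be broadened cheaply with pytest's parametrize. A hypothetical extension, with extra cases that are illustrative rather than part of the patch:

import pytest

from spacy import util
from spacy.en.language_data import TOKENIZER_PREFIXES

en_search_prefixes = util.compile_prefix_regex(TOKENIZER_PREFIXES).search

@pytest.mark.parametrize('text,prefix', [
    ("(can't", "("),
    ('"isn\'t', '"'),
    ('$100', '$'),
])
def test_pre_punct_regex_cases(text, prefix):
    # The compiled regex should peel off exactly this one leading prefix.
    assert en_search_prefixes(text).group() == prefix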
From 4dcfafde0260e01e13e0d8675f27ab4d26be57d8 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Nov 2016 14:57:37 +0100
Subject: [PATCH 4/5] Add line breaks

---
 spacy/en/language_data.py | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
index 351870fed..3015b3f97 100644
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@@ -6,21 +6,27 @@ import re
 # improved list from Stone, Denis, Kwantes (2010)
 STOP_WORDS = set("""

-a about above across after afterwards again against all almost alone along already also although always am among amongst amount an and another any anyhow anyone anything anyway anywhere are around as at
+a about above across after afterwards again against all almost alone along
+already also although always am among amongst amount an and another any anyhow
+anyone anything anyway anywhere are around as at

-back be became because become becomes becoming been before beforehand behind being below beside besides between beyond both bottom but by
+back be became because become becomes becoming been before beforehand behind
+being below beside besides between beyond both bottom but by

 call can cannot ca could

 did do does doing done down due during

-each eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except
+each eight either eleven else elsewhere empty enough etc even ever every
+everyone everything everywhere except

-few fifteen fifty first five for former formerly forty four from front full further
+few fifteen fifty first five for former formerly forty four from front full
+further

 get give go

-had has have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred
+had has have he hence her here hereafter hereby herein hereupon hers herself
+him himself his how however hundred

 i if in inc indeed into is it its itself

@@ -30,11 +36,14 @@ last latter latterly least less

 just

-made make many may me meanwhile might mine more moreover most mostly move much must my myself
+made make many may me meanwhile might mine more moreover most mostly move much
+must my myself

-name namely neither never nevertheless next nine no nobody none noone nor not nothing now nowhere
+name namely neither never nevertheless next nine no nobody none noone nor not
+nothing now nowhere

-of off often on once one only onto or other others otherwise our ours ourselves out over own
+of off often on once one only onto or other others otherwise our ours ourselves
+out over own

 part per perhaps please put

@@ -42,13 +51,20 @@ quite

 rather re really regarding

-same say see seem seemed seeming seems serious several she should show side since six sixty so some somehow someone something sometime sometimes somewhere still such
+same say see seem seemed seeming seems serious several she should show side
+since six sixty so some somehow someone something sometime sometimes somewhere
+still such

-take ten than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they third this those though three through throughout thru thus to together too top toward towards twelve twenty two
+take ten than that the their them themselves then thence there thereafter
+thereby therefore therein thereupon these they third this those though three
+through throughout thru thus to together too top toward towards twelve twenty
+two

 under until up unless upon us used using

-various very very via was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would
+various very very via was we well were what whatever when whence whenever where
+whereafter whereas whereby wherein whereupon wherever whether which while
+whither who whoever whole whom whose why will with within without would

 yet you your yours yourself yourselves
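The reflow above changes only the source layout: STOP_WORDS is still built with set(...split()), so the resulting set is identical. A minimal check, assuming a spaCy checkout of this era where spacy.en is importable:

from spacy.en.language_data import STOP_WORDS

# Membership is an O(1) set lookup; junk entries from the old list such as
# 'amoungst' are gone, while genuine function words remain.
assert 'whereafter' in STOP_WORDS
assert 'amoungst' not in STOP_WORDS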
From d21ad01840b927d5ba8c5cc4d53c2b2c45a3767b Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Nov 2016 19:13:00 +0100
Subject: [PATCH 5/5] Add emoticons

---
 spacy/language_data/__init__.py  |   0
 spacy/language_data/emoticons.py | 132 +++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 spacy/language_data/__init__.py
 create mode 100644 spacy/language_data/emoticons.py

diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/language_data/emoticons.py b/spacy/language_data/emoticons.py
new file mode 100644
index 000000000..d968f7c98
--- /dev/null
+++ b/spacy/language_data/emoticons.py
@@ -0,0 +1,132 @@
+EMOTICONS = set("""
+
+:)
+:-)
+:))
+:-))
+:)))
+:-)))
+(:
+(-:
+=)
+(=
+:]
+:-]
+[:
+[-:
+:o)
+(o:
+:}
+:-}
+8)
+8-)
+(-8
+
+;)
+;-)
+(;
+(-;
+
+:(
+:-(
+:((
+:-((
+:(((
+:-(((
+):
+)-:
+=(
+
+:')
+:'-)
+:'(
+:'-(
+
+:/
+:-/
+=/
+:|
+:-|
+
+:P
+:-P
+:p
+:-p
+
+:O
+:-O
+:o
+:-o
+:0
+:-0
+:()
+
+:*
+:-*
+:3
+:-3
+=3
+:>
+:->
+
+:X
+:-X
+:x
+:-x
+
+:D
+:-D
+;D
+;-D
+=D
+xD
+XD
+xDD
+XDD
+8D
+8-D
+
+^_^
+^__^
+^___^
+>.<
+>.>
+<.<
+._.
+;_;
+-_-
+v.v
+V.V
+v_v
+V_V
+o_o
+o_O
+O_o
+O_O
+0_o
+o_0
+0_0
+
+<3
+<33
+<333
+(>_<)
+(*_*)
+(¬_¬)
+
+ಠ_ಠ
+ಠ︵ಠ
+(ಠ_ಠ)
+¯\(ツ)/¯
+(╯°□°)╯︵┻━┻
+><(((*>
+
+""".split())
+
+
+__all__ = [ "EMOTICONS" ]
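Nothing consumes EMOTICONS yet; this patch only adds the data. A hypothetical sketch of how a language module could fold the set into its tokenizer exceptions so each emoticon survives tokenization as a single token (add_emoticons is an invented helper, and the 'F' surface-form key is assumed to follow the exception format used in spacy/en/language_data.py at this point in the codebase):

from spacy.language_data.emoticons import EMOTICONS

def add_emoticons(tokenizer_exceptions):
    # Register each emoticon as a single-token special case, leaving any
    # existing exception entries untouched.
    for orth in EMOTICONS:
        tokenizer_exceptions.setdefault(orth, [{'F': orth}])
    return tokenizer_exceptions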