Merge base tokenizer exceptions

This commit is contained in:
ines 2017-05-08 15:55:52 +02:00
parent 24606d364c
commit e7f95c37ee
3 changed files with 160 additions and 192 deletions

View File

@ -1,43 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
# Strings kept as single tokens: a few punctuation-like special cases and
# "C++", followed by one "<letter>." entry per letter (a-z, then ä, ö, ü).
_SPECIAL_CASES = ["'", "\\\")", "<space>", "''", "C++"]
_SINGLE_LETTERS = "abcdefghijklmnopqrstuvwxyzäöü"

ABBREVIATIONS = _SPECIAL_CASES + [letter + "." for letter in _SINGLE_LETTERS]

__all__ = ["ABBREVIATIONS"]

View File

@ -1,148 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
# Set of emoticon strings, one per line in the literal below. The literal is
# split on whitespace, so no entry may contain a space.
# NOTE: the backslash in the shrug-style entry is written as "\\" so the
# string holds exactly one backslash without relying on Python's handling of
# the unrecognized escape sequence "\(" (deprecated, warns on newer versions).
EMOTICONS = set("""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
")
:]
:-]
[:
[-:
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
=|
:|
:-|
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D
^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
(ಠ_ಠ)
¯\\()/¯
(°°
><(((*>
""".split())
__all__ = ["EMOTICONS"]

View File

@ -1,9 +1,13 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
# The use of this module turns out to be important, to avoid pathological # The use of this module turns out to be important, to avoid pathological
# back-tracking. See Issue #957 # back-tracking. See Issue #957
import regex import regex
from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor mods to this regex to account for use cases represented in test_urls # A few minor mods to this regex to account for use cases represented in test_urls
_URL_PATTERN = ( _URL_PATTERN = (
@ -51,4 +55,159 @@ _URL_PATTERN = (
TOKEN_MATCH = regex.compile(_URL_PATTERN, regex.UNICODE).match TOKEN_MATCH = regex.compile(_URL_PATTERN, regex.UNICODE).match
__all__ = ['TOKEN_MATCH']
# Maps an exact string (its ORTH) to a one-element list holding the
# attribute dict for that string.
BASE_EXCEPTIONS = {}

# Entries that carry attributes beyond ORTH: whitespace characters, the
# two-character literals backslash-t and backslash-n, the em dash (U+2014,
# lemma "--") and the no-break space (U+00A0, lemma " ").
for exc_data in [
    {ORTH: " ", POS: SPACE},
    {ORTH: "\t", POS: SPACE},
    {ORTH: "\\t", POS: SPACE},
    {ORTH: "\n", POS: SPACE},
    {ORTH: "\\n", POS: SPACE},
    {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
    {ORTH: "\u00a0", POS: SPACE, LEMMA: " "},
]:
    # Store a copy so the registered entry is independent of the literal.
    BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]

# Entries that only pin the surface form: punctuation-like strings, "C++"
# and single-letter abbreviations ("a." ... "z.", "ä.", "ö.", "ü.").
for orth in [
    "'", "\\\")", "<space>", "''", "C++", "a.", "b.", "c.", "d.", "e.", "f.",
    "g.", "h.", "i.", "j.", "k.", "l.", "m.", "n.", "o.", "p.", "q.", "r.",
    "s.", "t.", "u.", "v.", "w.", "x.", "y.", "z.", "ä.", "ö.", "ü.",
]:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
# Emoticon strings, one per line in the literal below; the literal is split
# on whitespace, so no entry may contain a space.
# NOTE: the backslash in the shrug-style entry is written as "\\" so the
# string holds exactly one backslash without relying on Python's handling of
# the unrecognized escape sequence "\(" (deprecated, warns on newer versions).
emoticons = set("""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
")
:]
:-]
[:
[-:
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
=|
:|
:-|
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D
^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
(ಠ_ಠ)
¯\\()/¯
(°°
><(((*>
""".split())

# Register every emoticon under its own surface form. Set iteration order
# does not matter here because each emoticon writes to a distinct key.
for orth in emoticons:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]