mirror of https://github.com/explosion/spaCy.git
Add symbols class to punctuation rules to handle emoji (see #1088)
This currently doesn't work for Hungarian because of conflicts with its
custom punctuation rules. It also doesn't take multi-character emoji like
👩🏽💻 into account.
This commit is contained in:
parent
dc07d72d80
commit
a8e58e04ef
|
@ -1,8 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
|
||||
|
||||
|
||||
_currency = r"\$|¢|£|€|¥|฿|৳"
|
||||
|
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
|
|||
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
|
||||
|
||||
|
||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
|
||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
|
||||
|
||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
|
||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(_currency),
|
||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
||||
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
|
||||
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||
|
|
|
@ -20,7 +20,6 @@ _upper = [_latin_upper]
|
|||
_lower = [_latin_lower]
|
||||
_uncased = [_bengali, _hebrew]
|
||||
|
||||
|
||||
ALPHA = merge_char_classes(_upper + _lower + _uncased)
|
||||
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
||||
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
||||
|
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
|||
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
|
||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
||||
_hyphens = '- – — -- ---'
|
||||
|
||||
_other_symbols = r'[\p{So}]'
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
CURRENCY = merge_chars(_currency)
|
||||
QUOTES = merge_chars(_quotes)
|
||||
PUNCT = merge_chars(_punct)
|
||||
HYPHENS = merge_chars(_hyphens)
|
||||
ICONS = _other_symbols
|
||||
|
||||
LIST_UNITS = split_chars(_units)
|
||||
LIST_CURRENCY = split_chars(_currency)
|
||||
|
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
|
|||
LIST_PUNCT = split_chars(_punct)
|
||||
LIST_HYPHENS = split_chars(_hyphens)
|
||||
LIST_ELLIPSES = [r'\.\.+', '…']
|
||||
LIST_ICONS = [_other_symbols]
|
||||
|
|
|
@ -2,15 +2,16 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
||||
from .char_classes import CURRENCY, UNITS
|
||||
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||
from .char_classes import QUOTES, CURRENCY, UNITS
|
||||
|
||||
|
||||
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
LIST_CURRENCY)
|
||||
LIST_CURRENCY + LIST_ICONS)
|
||||
|
||||
|
||||
_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||
["'s", "'S", "’s", "’S"] +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
||||
|
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
|
|||
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
|
||||
|
||||
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
||||
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions and emoticons are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
|
|||
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
|
||||
tokens = tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,length', [
    ('can you still dunk?🍕🍔😵LOL', 8),
    ('i💙you', 3),
    ('🤘🤘yay!', 4),
])
def test_tokenizer_handles_emoji(tokenizer, text, length):
    """Assert that text containing emoji yields the expected token count.

    The count is not checked when the first token's language code is in
    ``skipped_langs``.
    """
    skipped_langs = ["hu"]
    doc = tokenizer(text)
    # Bail out early for excluded languages instead of asserting.
    if doc[0].lang_ in skipped_langs:
        return
    assert len(doc) == length
|
||||
|
|
Loading…
Reference in New Issue