# spaCy/spacy/lang/norm_exceptions.py
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
# Language-independent ORTH -> NORM table. Each key is a surface form; its
# value is the canonical ASCII form it is normalised to, so that typographic
# variants of the same symbol share one representation.
# NOTE(review): the non-ASCII keys were restored after an encoding round-trip
# had reduced them to "" (making them all collide on a single empty-string
# key). Confirm each code point against the upstream spaCy table.
BASE_NORMS = {
    # Possessive clitic, with straight and typographic apostrophes.
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",  # U+2019 + s
    "’S": "'s",  # U+2019 + S
    # Apostrophes / single quotes.
    "’": "'",  # U+2019 RIGHT SINGLE QUOTATION MARK
    "‘": "'",  # U+2018 LEFT SINGLE QUOTATION MARK
    "´": "'",  # U+00B4 ACUTE ACCENT
    "`": "'",
    # Double quotes, including doubled single-quote spellings.
    "”": '"',  # U+201D RIGHT DOUBLE QUOTATION MARK
    "“": '"',  # U+201C LEFT DOUBLE QUOTATION MARK
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',  # U+201E DOUBLE LOW-9 QUOTATION MARK
    "»": '"',
    "«": '"',
    "‘‘": '"',  # U+2018 x2
    "’’": '"',  # U+2019 x2
    # Full-width (CJK) punctuation -> ASCII punctuation.
    "？": "?",  # U+FF1F
    "！": "!",  # U+FF01
    "，": ",",  # U+FF0C
    "；": ";",  # U+FF1B
    "：": ":",  # U+FF1A
    # Sentence-final marks from other scripts.
    "。": ".",  # U+3002 IDEOGRAPHIC FULL STOP
    "।": ".",  # U+0964 DEVANAGARI DANDA
    "…": "...",  # U+2026 HORIZONTAL ELLIPSIS
    # Dashes.
    "—": "-",  # U+2014 EM DASH
    "–": "-",  # U+2013 EN DASH
    "--": "-",
    "---": "-",
    "——": "-",  # U+2014 x2
    # Currency symbols: all collapse to "$" so that rare currencies are
    # represented like common ones.
    "€": "$",  # U+20AC EURO SIGN
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$",
    "₺": "$",  # U+20BA TURKISH LIRA SIGN
}