Add base norm exceptions

2017-06-03 20:27:05 +02:00 · 2017-06-03 20:27:05 +02:00 · e5d426406a
parent 4c2bbc3ccc
commit e5d426406a
1 changed files with 46 additions and 0 deletions
--- a/spacy/lang/norm_exceptions.py
+++ b/spacy/lang/norm_exceptions.py
@@ -0,0 +1,46 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# These exceptions are used to add NORM values based on a token's ORTH value.
+# Individual languages can also add their own exceptions and overwrite them -
+# for example, British vs. American spelling in English.
+
+# Norms are only set if no alternative is provided in the tokenizer exceptions.
+# Note that this does not change any other token attributes. Its main purpose
+# is to normalise the word representations so that equivalent tokens receive
+# similar representations. For example: $ and € are very different, but they're
+# both currency symbols. By normalising currency symbols to $, all symbols are
+# seen as similar, no matter how common they are in the training data.
+
+
+BASE_NORMS = {  # language-independent lookup: token ORTH string -> NORM string
+    "'s": "'s",  # identity entry: already in the normalised form
+    "'S": "'s",  # upper-case variant, lower-cased
+    "’s": "'s",  # U+2019 apostrophe replaced by the ASCII apostrophe
+    "’S": "'s",  # U+2019 apostrophe and upper case, both normalised
+    "’": "'",  # U+2019 right single quotation mark -> ASCII apostrophe
+    "‘": "'",  # U+2018 left single quotation mark
+    "´": "'",  # U+00B4 acute accent used as a quote
+    "`": "'",  # U+0060 grave accent (backtick) used as a quote
+    "”": '"',  # U+201D right double quotation mark -> ASCII double quote
+    "“": '"',  # U+201C left double quotation mark
+    "''": '"',  # two ASCII apostrophes standing in for a double quote
+    "``": '"',  # two backticks standing in for a double quote
+    "´´": '"',  # two acute accents standing in for a double quote
+    "„": '"',  # U+201E double low-9 quotation mark
+    "»": '"',  # U+00BB right-pointing double angle quotation mark
+    "«": '"',  # U+00AB left-pointing double angle quotation mark
+    "…": "...",  # U+2026 horizontal ellipsis -> three ASCII periods
+    "—": "-",  # U+2014 em dash -> ASCII hyphen-minus
+    "–": "-",  # U+2013 en dash
+    "--": "-",  # double hyphen
+    "---": "-",  # triple hyphen
+    "€": "$",  # U+20AC euro sign; every currency entry below also maps to "$"
+    "£": "$",  # U+00A3 pound sign
+    "¥": "$",  # U+00A5 yen sign
+    "฿": "$",  # U+0E3F Thai baht sign
+    "US$": "$",  # "$" with a letter prefix collapses to the bare "$"
+    "C$": "$",  # same: prefixed "$"
+    "A$": "$"  # same: prefixed "$"
+}