From e5d426406ad3661a2863c06339f896da451d9450 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sat, 3 Jun 2017 20:27:05 +0200
Subject: [PATCH] Add base norm exceptions

---
 spacy/lang/norm_exceptions.py | 46 +++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 spacy/lang/norm_exceptions.py

diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py
new file mode 100644
index 000000000..b02dda2c8
--- /dev/null
+++ b/spacy/lang/norm_exceptions.py
@@ -0,0 +1,46 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+# These exceptions are used to add NORM values based on a token's ORTH value.
+# Individual languages can also add their own exceptions and overwrite them -
+# for example, British vs. American spelling in English.
+
+# Norms are only set if no alternative is provided in the tokenizer exceptions.
+# Note that this does not change any other token attributes. Its main purpose
+# is to normalise the word representations so that equivalent tokens receive
+# similar representations. For example: $ and € are very different, but they're
+# both currency symbols. By normalising currency symbols to $, all symbols are
+# seen as similar, no matter how common they are in the training data.
+
+
+BASE_NORMS = {  # token text (ORTH) -> normalised form (NORM)
+    "'s": "'s",  # 's variants: identity entry, key already equals its norm
+    "'S": "'s",  # upper-case S
+    "’s": "'s",  # U+2019 curly apostrophe
+    "’S": "'s",  # U+2019 curly apostrophe, upper-case S
+    "’": "'",  # single-quote look-alikes -> ASCII apostrophe; U+2019
+    "‘": "'",  # U+2018 left single quotation mark
+    "´": "'",  # U+00B4 acute accent
+    "`": "'",  # U+0060 grave accent / backtick
+    "”": '"',  # double-quote look-alikes -> ASCII double quote; U+201D
+    "“": '"',  # U+201C left double quotation mark
+    "''": '"',  # two ASCII apostrophes
+    "``": '"',  # two backticks
+    "´´": '"',  # two acute accents
+    "„": '"',  # U+201E double low-9 quotation mark
+    "»": '"',  # U+00BB right-pointing guillemet
+    "«": '"',  # U+00AB left-pointing guillemet
+    "…": "...",  # U+2026 ellipsis character -> three ASCII periods
+    "—": "-",  # dashes -> single ASCII hyphen; U+2014 em dash
+    "–": "-",  # U+2013 en dash
+    "--": "-",  # two ASCII hyphens
+    "---": "-",  # three ASCII hyphens
+    "€": "$",  # currency symbols and prefixed dollar signs -> "$"; euro
+    "£": "$",  # pound sign
+    "¥": "$",  # yen sign
+    "฿": "$",  # U+0E3F baht sign
+    "US$": "$",
+    "C$": "$",
+    "A$": "$"
+}