From 1842a53e73405be3048e6dd26afcfc2e4d5da5ee Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 9 Oct 2015 19:09:36 +1100
Subject: [PATCH 1/3] * Lemmatize smart quotes as plain quotes

---
 lang_data/en/lemma_rules.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json
index 498240be1..0336b6b9f 100644
--- a/lang_data/en/lemma_rules.json
+++ b/lang_data/en/lemma_rules.json
@@ -30,8 +30,8 @@
     ],
 
     "punct": [
-        ["“", "``"],
-        ["”", "''"],
+        ["“", "\""],
+        ["”", "\""],
         ["–", "--"]
     ]
 }

From 1490feda292d8065c01bb2136be5c30bbf5b23eb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 9 Oct 2015 19:23:47 +1100
Subject: [PATCH 2/3] * Make generate_specials pretty-print the specials.json
 file

---
 lang_data/en/generate_specials.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py
index 1a8f1ae0b..6ad503aec 100644
--- a/lang_data/en/generate_specials.py
+++ b/lang_data/en/generate_specials.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import json
 
 contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"}
@@ -132,7 +133,6 @@ hardcoded_specials = {
                 "Mt.": [{"F": "Mt.", "L": "Mount"}],
 
                 "''": [{"F": "''"}],
-
                 "Corp.": [{"F": "Corp."}],
                 "Inc.": [{"F": "Inc."}],
                 "Co.": [{"F": "Co."}],
@@ -412,6 +412,6 @@ def generate_specials():
 
 if __name__ == "__main__":
     specials = generate_specials()
-    with open("specials.json", "w") as f:
-        json.dump(specials, f)
+    with open("specials.json", "w") as file_:
+        file_.write(json.dumps(specials, indent=2))
 

From 393a13d1af2a0c22a04643e61e7c4b95b653250b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Fri, 9 Oct 2015 19:24:33 +1100
Subject: [PATCH 3/3] * Add unicode em dash to specials.json, so that we can
 control what POS tag it gets. This way we can prevent sentence boundary
 detection errors, to address Issue #130.

---
 lang_data/en/generate_specials.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py
index 6ad503aec..e50cd77d4 100644
--- a/lang_data/en/generate_specials.py
+++ b/lang_data/en/generate_specials.py
@@ -133,6 +133,9 @@ hardcoded_specials = {
                 "Mt.": [{"F": "Mt.", "L": "Mount"}],
 
                 "''": [{"F": "''"}],
+
+                "—": [{"F": "—", "L": "--", "P": ":"}],
+
                 "Corp.": [{"F": "Corp."}],
                 "Inc.": [{"F": "Inc."}],
                 "Co.": [{"F": "Co."}],