From 1842a53e73405be3048e6dd26afcfc2e4d5da5ee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:09:36 +1100 Subject: [PATCH 1/3] * Lemmatize smart quotes as plain quotes --- lang_data/en/lemma_rules.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 498240be1..0336b6b9f 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -30,8 +30,8 @@ ], "punct": [ - ["“", "``"], - ["”", "''"], + ["“", "\""], + ["”", "\""], ["–", "--"] ] } From 1490feda292d8065c01bb2136be5c30bbf5b23eb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:23:47 +1100 Subject: [PATCH 2/3] * Make generate_specials pretty-print the specials.json file --- lang_data/en/generate_specials.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 1a8f1ae0b..6ad503aec 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import json contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"} @@ -132,7 +133,6 @@ hardcoded_specials = { "Mt.": [{"F": "Mt.", "L": "Mount"}], "''": [{"F": "''"}], - "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], @@ -412,6 +412,6 @@ def generate_specials(): if __name__ == "__main__": specials = generate_specials() - with open("specials.json", "w") as f: - json.dump(specials, f) + with open("specials.json", "w") as file_: + file_.write(json.dumps(specials, indent=2)) From 393a13d1af2a0c22a04643e61e7c4b95b653250b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:24:33 +1100 Subject: [PATCH 3/3] * Add unicode em dash to specials.json, so that we can control what POS tag it gets. This way we can prevent sentence boundary detection errors, to address Issue #130. 
--- lang_data/en/generate_specials.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 6ad503aec..e50cd77d4 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -133,6 +133,9 @@ hardcoded_specials = { "Mt.": [{"F": "Mt.", "L": "Mount"}], "''": [{"F": "''"}], + + "—": [{"F": "—", "L": "--", "P": ":"}], + "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}],