From 45bc78461c9512c2ab68b4b0785d157686a1a763 Mon Sep 17 00:00:00 2001 From: shuvanon Date: Wed, 8 Mar 2017 17:27:12 +0600 Subject: [PATCH 1/4] update tokenizer --- spacy/bn/tokenizer_exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/bn/tokenizer_exceptions.py b/spacy/bn/tokenizer_exceptions.py index 7722c9dcc..169608d0b 100644 --- a/spacy/bn/tokenizer_exceptions.py +++ b/spacy/bn/tokenizer_exceptions.py @@ -31,6 +31,10 @@ ABBREVIATIONS = { {ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, {ORTH: "কি.মি.", LEMMA: "কিলোমিটার"}, ], + "সে.মি": [ + {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, + {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, + ], } TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) From 85438aee1b54431a11d290e95270d34e1372a1b4 Mon Sep 17 00:00:00 2001 From: shuvanon Date: Wed, 8 Mar 2017 17:29:39 +0600 Subject: [PATCH 2/4] update tokenizer --- spacy/bn/tokenizer_exceptions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/bn/tokenizer_exceptions.py b/spacy/bn/tokenizer_exceptions.py index 169608d0b..ac8a95b3f 100644 --- a/spacy/bn/tokenizer_exceptions.py +++ b/spacy/bn/tokenizer_exceptions.py @@ -35,6 +35,9 @@ ABBREVIATIONS = { {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, ], + "মি.লি.": [ + {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}, + ] } TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) From c251703428cbebea460e6787710dd3d1412caccc Mon Sep 17 00:00:00 2001 From: Shuvanon Razik Date: Fri, 10 Mar 2017 10:45:01 +0600 Subject: [PATCH 3/4] Update abbreviations --- spacy/bn/tokenizer_exceptions.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/bn/tokenizer_exceptions.py b/spacy/bn/tokenizer_exceptions.py index ac8a95b3f..b47c89add 100644 --- a/spacy/bn/tokenizer_exceptions.py +++ b/spacy/bn/tokenizer_exceptions.py @@ -27,14 +27,18 @@ ABBREVIATIONS = { "সে.": [ {ORTH: "সে.", LEMMA: "সেলসিয়াস"}, ], - "কি.মি": [ - {ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, + "কি.মি.": [ {ORTH: "কি.মি.", 
LEMMA: "কিলোমিটার"}, ], - "সে.মি": [ - {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, + "কি.মি": [ + {ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, + ] + "সে.মি.": [ {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, ], + "সে.মি": [ + {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, + ], "মি.লি.": [ {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}, ] From 1c408903213c268423f7824d2823e493c57f2f0b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 10 Mar 2017 09:34:54 +0100 Subject: [PATCH 4/4] Add missing comma Should fix Travis build error --- spacy/bn/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/bn/tokenizer_exceptions.py b/spacy/bn/tokenizer_exceptions.py index b47c89add..a47b89280 100644 --- a/spacy/bn/tokenizer_exceptions.py +++ b/spacy/bn/tokenizer_exceptions.py @@ -32,7 +32,7 @@ ABBREVIATIONS = { ], "কি.মি": [ {ORTH: "কি.মি", LEMMA: "কিলোমিটার"}, - ] + ], "সে.মি.": [ {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, ],