From df0b68f60eda43865d4b7271c55670784b214ade Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 4 Sep 2020 13:19:16 +0200
Subject: [PATCH] Remove unicode declarations and update language data

---
 spacy/lang/cs/examples.py                | 10 +++-----
 spacy/lang/cs/lex_attrs.py               |  5 +---
 spacy/lang/he/lex_attrs.py               |  6 ++---
 spacy/lang/ne/stop_words.py              |  4 ---
 spacy/lang/sa/__init__.py                | 10 +-------
 spacy/lang/sa/examples.py                |  4 ---
 spacy/lang/sa/lex_attrs.py               | 32 +++++++++++-------------
 spacy/lang/sa/stop_words.py              |  3 ---
 spacy/tests/lang/cs/test_text.py         |  3 ---
 spacy/tests/lang/ne/test_text.py         |  3 ---
 spacy/tests/lang/sa/test_text.py         |  3 ---
 spacy/tests/regression/test_issue5838.py | 14 +++++------
 spacy/tests/regression/test_issue5918.py |  3 ---
 13 files changed, 27 insertions(+), 73 deletions(-)

diff --git a/spacy/lang/cs/examples.py b/spacy/lang/cs/examples.py
index fe8a9f6d1..a30b5ac14 100644
--- a/spacy/lang/cs/examples.py
+++ b/spacy/lang/cs/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.cs.examples import sentences
@@ -10,9 +6,9 @@ Example sentences to test spaCy and its language models.


 sentences = [
-    "Máma mele maso.",
+    "Máma mele maso.",
     "Příliš žluťoučký kůň úpěl ďábelské ódy.",
-    "ArcGIS je geografický informační systém určený pro práci s prostorovými daty." ,
+    "ArcGIS je geografický informační systém určený pro práci s prostorovými daty.",
     "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat.",
     "Dnes je krásné počasí.",
     "Nestihl autobus, protože pozdě vstal z postele.",
@@ -39,4 +35,4 @@ sentences = [
     "Jaké PSČ má Praha 1?",
     "PSČ Prahy 1 je 110 00.",
     "Za 20 minut jede vlak.",
-    ]
+]
diff --git a/spacy/lang/cs/lex_attrs.py b/spacy/lang/cs/lex_attrs.py
index 368cab6c8..530d1d5eb 100644
--- a/spacy/lang/cs/lex_attrs.py
+++ b/spacy/lang/cs/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

 _num_words = [
@@ -43,7 +40,7 @@ _num_words = [
     "kvadrilion",
     "kvadriliarda",
     "kvintilion",
-    ]
+]


 def like_num(text):
diff --git a/spacy/lang/he/lex_attrs.py b/spacy/lang/he/lex_attrs.py
index 9eab93ae4..2953e7592 100644
--- a/spacy/lang/he/lex_attrs.py
+++ b/spacy/lang/he/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

 _num_words = [
@@ -73,6 +70,7 @@
     "עשירי",
 ]

+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -84,7 +82,7 @@
         num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
-
+
     if text in _num_words:
         return True

diff --git a/spacy/lang/ne/stop_words.py b/spacy/lang/ne/stop_words.py
index f008697d0..8470297b9 100644
--- a/spacy/lang/ne/stop_words.py
+++ b/spacy/lang/ne/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 # Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt

 STOP_WORDS = set(
diff --git a/spacy/lang/sa/__init__.py b/spacy/lang/sa/__init__.py
index 8a4533341..345137817 100644
--- a/spacy/lang/sa/__init__.py
+++ b/spacy/lang/sa/__init__.py
@@ -1,18 +1,10 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
-
 from ...language import Language
-from ...attrs import LANG


 class SanskritDefaults(Language.Defaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: "sa"
-
+    lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS


diff --git a/spacy/lang/sa/examples.py b/spacy/lang/sa/examples.py
index 9d4fa1e49..60243c04c 100644
--- a/spacy/lang/sa/examples.py
+++ b/spacy/lang/sa/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

diff --git a/spacy/lang/sa/lex_attrs.py b/spacy/lang/sa/lex_attrs.py
index c33be2ce4..f2b51650b 100644
--- a/spacy/lang/sa/lex_attrs.py
+++ b/spacy/lang/sa/lex_attrs.py
@@ -1,9 +1,5 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

-
 # reference 1: https://en.wikibooks.org/wiki/Sanskrit/Numbers

 _num_words = [
@@ -106,26 +102,26 @@ _num_words = [
     "सप्तनवतिः",
     "अष्टनवतिः",
     "एकोनशतम्",
-    "शतम्"
+    "शतम्",
 ]


 def like_num(text):
-    """
+    """
     Check if text resembles a number
     """
-    if text.startswith(("+", "-", "±", "~")):
-        text = text[1:]
-    text = text.replace(",", "").replace(".", "")
-    if text.isdigit():
-        return True
-    if text.count("/") == 1:
-        num, denom = text.split("/")
-        if num.isdigit() and denom.isdigit():
-            return True
-    if text in _num_words:
-        return True
-    return False
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False


 LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/sa/stop_words.py b/spacy/lang/sa/stop_words.py
index aa51ceae0..30302a14d 100644
--- a/spacy/lang/sa/stop_words.py
+++ b/spacy/lang/sa/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 # Source: https://gist.github.com/Akhilesh28/fe8b8e180f64b72e64751bc31cb6d323

 STOP_WORDS = set(
diff --git a/spacy/tests/lang/cs/test_text.py b/spacy/tests/lang/cs/test_text.py
index d98961738..b834111b9 100644
--- a/spacy/tests/lang/cs/test_text.py
+++ b/spacy/tests/lang/cs/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


diff --git a/spacy/tests/lang/ne/test_text.py b/spacy/tests/lang/ne/test_text.py
index 794f8fbdc..7dd971132 100644
--- a/spacy/tests/lang/ne/test_text.py
+++ b/spacy/tests/lang/ne/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


diff --git a/spacy/tests/lang/sa/test_text.py b/spacy/tests/lang/sa/test_text.py
index 7c961bdae..41257a4d8 100644
--- a/spacy/tests/lang/sa/test_text.py
+++ b/spacy/tests/lang/sa/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


diff --git a/spacy/tests/regression/test_issue5838.py b/spacy/tests/regression/test_issue5838.py
index c008c5aec..4e4d98beb 100644
--- a/spacy/tests/regression/test_issue5838.py
+++ b/spacy/tests/regression/test_issue5838.py
@@ -1,15 +1,13 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.tokens import Span
 from spacy import displacy

-SAMPLE_TEXT = '''First line
+
+SAMPLE_TEXT = """First line
 Second line, with ent
 Third line
 Fourth line
-'''
+"""


 def test_issue5838():
@@ -18,8 +16,8 @@

     nlp = English()
     doc = nlp(SAMPLE_TEXT)
-    doc.ents = [Span(doc, 7, 8, label='test')]
+    doc.ents = [Span(doc, 7, 8, label="test")]

-    html = displacy.render(doc, style='ent')
-    found = html.count('</br>')
+    html = displacy.render(doc, style="ent")
+    found = html.count("</br>")
     assert found == 4
diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py
index 2dee26d82..3b96009a8 100644
--- a/spacy/tests/regression/test_issue5918.py
+++ b/spacy/tests/regression/test_issue5918.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities, EntityRuler