From 40bb918a4c8507f5c54a722e0388eda1da1e2b7a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 21 Jun 2020 22:34:10 +0200
Subject: [PATCH] Remove unicode declarations and tidy up

---
 spacy/lang/es/punctuation.py             |  3 -
 spacy/lang/gu/__init__.py                |  3 -
 spacy/lang/gu/examples.py                |  4 --
 spacy/lang/gu/stop_words.py              |  3 -
 spacy/lang/hy/__init__.py                |  3 -
 spacy/lang/hy/examples.py                |  3 -
 spacy/lang/hy/lex_attrs.py               |  3 -
 spacy/lang/hy/stop_words.py              |  3 -
 spacy/lang/hy/tag_map.py                 |  3 -
 spacy/lang/ja/bunsetu.py                 | 92 ++++++++++++++++--------
 spacy/lang/ja/syntax_iterators.py        | 29 ++++----
 spacy/lang/kn/examples.py                |  4 --
 spacy/lang/ml/__init__.py                |  3 -
 spacy/lang/ml/examples.py                |  4 --
 spacy/lang/ml/lex_attrs.py               |  3 -
 spacy/lang/ml/stop_words.py              |  4 --
 spacy/lang/pl/lemmatizer.py              |  3 -
 spacy/lang/sv/lex_attrs.py               |  3 -
 spacy/tests/lang/de/test_noun_chunks.py  |  3 -
 spacy/tests/lang/el/test_noun_chunks.py  |  3 -
 spacy/tests/lang/es/test_noun_chunks.py  |  3 -
 spacy/tests/lang/fa/test_noun_chunks.py  |  3 -
 spacy/tests/lang/fr/test_noun_chunks.py  |  3 -
 spacy/tests/lang/gu/test_text.py         |  3 -
 spacy/tests/lang/hy/test_text.py         |  3 -
 spacy/tests/lang/hy/test_tokenizer.py    |  3 -
 spacy/tests/lang/id/test_noun_chunks.py  |  3 -
 spacy/tests/lang/ja/test_serialize.py    |  4 --
 spacy/tests/lang/ml/test_text.py         |  3 -
 spacy/tests/lang/nb/test_noun_chunks.py  |  3 -
 spacy/tests/lang/sv/test_lex_attrs.py    |  3 -
 spacy/tests/lang/zh/test_serialize.py    |  3 -
 spacy/tests/regression/test_issue5152.py |  3 -
 spacy/tests/regression/test_issue5230.py |  1 -
 spacy/tests/regression/test_issue5458.py |  3 -
 35 files changed, 76 insertions(+), 147 deletions(-)

diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py
index f989221c2..e9552371e 100644
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py
index 1f080c7c2..bc8fc260c 100644
--- a/spacy/lang/gu/__init__.py
+++ b/spacy/lang/gu/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 
 from ...language import Language
diff --git a/spacy/lang/gu/examples.py b/spacy/lang/gu/examples.py
index 202a8d022..1cf75fd32 100644
--- a/spacy/lang/gu/examples.py
+++ b/spacy/lang/gu/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py
index 85d33763d..2c859681b 100644
--- a/spacy/lang/gu/stop_words.py
+++ b/spacy/lang/gu/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 એમ
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
index 6aaa965bb..8928e52ae 100644
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py
index 323f77b1c..69e354688 100644
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
index b556d679c..f84472d60 100644
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py
index d75aad6e2..46d0f6b51 100644
--- a/spacy/lang/hy/stop_words.py
+++ b/spacy/lang/hy/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 նա
diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py
index 722270110..09be1fd8d 100644
--- a/spacy/lang/hy/tag_map.py
+++ b/spacy/lang/hy/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
 
diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py
index 7c3eee336..e8c802246 100644
--- a/spacy/lang/ja/bunsetu.py
+++ b/spacy/lang/ja/bunsetu.py
@@ -1,21 +1,11 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-
-
 POS_PHRASE_MAP = {
     "NOUN": "NP",
     "NUM": "NP",
     "PRON": "NP",
     "PROPN": "NP",
-
     "VERB": "VP",
-
     "ADJ": "ADJP",
-
     "ADV": "ADVP",
-
     "CCONJ": "CCONJP",
 }
 
@@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
         dep = t.dep_
         head = t.head.i
         if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
+            print(
+                t.i,
+                t.orth_,
+                pos,
+                pos_type,
+                dep,
+                head,
+                bunsetu_may_end,
+                phrase_type,
+                phrase,
+                bunsetu,
+            )
 
         # DET is always an individual bunsetu
         if pos == "DET":
@@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):
 
         # entering new bunsetu
         elif pos_type and (
-            pos_type != phrase_type or  # different phrase type arises
-            bunsetu_may_end  # same phrase type but bunsetu already ended
+            pos_type != phrase_type
+            or bunsetu_may_end  # different phrase type arises  # same phrase type but bunsetu already ended
         ):
             # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
+            if (
+                phrase_type == "NP"
+                and pos_type == "VP"
+                and prev_dep == "compound"
+                and prev_head == t.i
+            ):
                 bunsetu.append(t)
                 phrase_type = "VP"
                 phrase.append(t)
             # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                prev_dep == 'compound' and prev_head == t.i or
-                dep == 'compound' and prev == head or
-                prev_dep == 'nmod' and prev_head == t.i
+            elif (
+                phrase_type == "VP"
+                and pos_type == "NP"
+                and (
+                    prev_dep == "compound"
+                    and prev_head == t.i
+                    or dep == "compound"
+                    and prev == head
+                    or prev_dep == "nmod"
+                    and prev_head == t.i
+                )
             ):
                 bunsetu.append(t)
                 phrase_type = "NP"
@@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
         # NOUN bunsetu
         elif phrase_type == "NP":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    (pos_type == "NP" or pos == "SYM")
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"compound", "nummod"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
@@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
         # VERB bunsetu
         elif phrase_type == "VP":
             bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
+            if (
+                not bunsetu_may_end
+                and pos == "VERB"
+                and prev_head == t.i
+                and prev_dep == "compound"
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
 
         # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
+        elif phrase_type == "ADJP" and tag != "連体詞":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    pos == "NOUN"
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"amod", "compound"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py
index cd1e4fde7..3f6e4bfa3 100644
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@@ -1,24 +1,22 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON, VERB
 
 # XXX this can probably be pruned a bit
 labels = [
-  "nsubj",
-  "nmod",
-  "dobj",
-  "nsubjpass",
-  "pcomp",
-  "pobj",
-  "obj",
-  "obl",
-  "dative",
-  "appos",
-  "attr",
-  "ROOT",
+    "nsubj",
+    "nmod",
+    "dobj",
+    "nsubjpass",
+    "pcomp",
+    "pobj",
+    "obj",
+    "obl",
+    "dative",
+    "appos",
+    "attr",
+    "ROOT",
 ]
 
+
 def noun_chunks(obj):
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
@@ -52,4 +50,5 @@ def noun_chunks(obj):
         seen.update(w.i for w in word.head.rights)
         yield unseen[0], word.i + 1, np_label
 
+
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/lang/kn/examples.py b/spacy/lang/kn/examples.py
index d82630432..3e055752e 100644
--- a/spacy/lang/kn/examples.py
+++ b/spacy/lang/kn/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py
index d052ded1b..e92a7617f 100644
--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 
 from ...language import Language
diff --git a/spacy/lang/ml/examples.py b/spacy/lang/ml/examples.py
index a2a0ed10e..9794eab29 100644
--- a/spacy/lang/ml/examples.py
+++ b/spacy/lang/ml/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py
index 468ad88f8..9ac19b6a7 100644
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py
index 8bd6a7e02..441e93586 100644
--- a/spacy/lang/ml/stop_words.py
+++ b/spacy/lang/ml/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
     """
 അത്
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index 8b8d7fe27..b80a1a143 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 
diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py
index 24d06a97a..f8ada9e2e 100644
--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py
index 8d76ddd79..ff9f8d5e5 100644
--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py
index 4f24865d0..38e72b0b2 100644
--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
index 66bbd8c3a..a7ec4e562 100644
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py
index a98aae061..767e91f6b 100644
--- a/spacy/tests/lang/fa/test_noun_chunks.py
+++ b/spacy/tests/lang/fa/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py
index ea93a5a35..5fd6897f7 100644
--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py
index aa8d442a2..2d251166f 100644
--- a/spacy/tests/lang/gu/test_text.py
+++ b/spacy/tests/lang/gu/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py
index cbdb77e4e..ac0f1e128 100644
--- a/spacy/tests/lang/hy/test_text.py
+++ b/spacy/tests/lang/hy/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 
 from spacy.lang.hy.lex_attrs import like_num
diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py
index 3eeb8b54e..e9efb224a 100644
--- a/spacy/tests/lang/hy/test_tokenizer.py
+++ b/spacy/tests/lang/hy/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py
index add76f9b9..445643933 100644
--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py
index 018e645bb..9e703e63d 100644
--- a/spacy/tests/lang/ja/test_serialize.py
+++ b/spacy/tests/lang/ja/test_serialize.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir
 
diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py
index 2883cf5bb..aced78461 100644
--- a/spacy/tests/lang/ml/test_text.py
+++ b/spacy/tests/lang/ml/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py
index 653491a64..c6a00354b 100644
--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/sv/test_lex_attrs.py b/spacy/tests/lang/sv/test_lex_attrs.py
index abe6b0f7b..656c4706b 100644
--- a/spacy/tests/lang/sv/test_lex_attrs.py
+++ b/spacy/tests/lang/sv/test_lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 from spacy.lang.sv.lex_attrs import like_num
diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index 56f092ed8..d84920c3e 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.zh import Chinese
 from ...util import make_tempdir
diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py
index 758ac9c14..a9a57746d 100644
--- a/spacy/tests/regression/test_issue5152.py
+++ b/spacy/tests/regression/test_issue5152.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 
 
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index b46bf9063..9ffa3862c 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -1,4 +1,3 @@
-# coding: utf8
 import warnings
 from unittest import TestCase
 import pytest
diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py
index 3281e2a8c..a7a2959df 100644
--- a/spacy/tests/regression/test_issue5458.py
+++ b/spacy/tests/regression/test_issue5458.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tests.util import get_doc