From 40bb918a4c8507f5c54a722e0388eda1da1e2b7a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 21 Jun 2020 22:34:10 +0200
Subject: [PATCH] Remove unicode declarations and tidy up

---
 spacy/lang/es/punctuation.py             |  3 -
 spacy/lang/gu/__init__.py                |  3 -
 spacy/lang/gu/examples.py                |  4 --
 spacy/lang/gu/stop_words.py              |  3 -
 spacy/lang/hy/__init__.py                |  3 -
 spacy/lang/hy/examples.py                |  3 -
 spacy/lang/hy/lex_attrs.py               |  3 -
 spacy/lang/hy/stop_words.py              |  3 -
 spacy/lang/hy/tag_map.py                 |  3 -
 spacy/lang/ja/bunsetu.py                 | 92 ++++++++++++++++--------
 spacy/lang/ja/syntax_iterators.py        | 29 ++++----
 spacy/lang/kn/examples.py                |  4 --
 spacy/lang/ml/__init__.py                |  3 -
 spacy/lang/ml/examples.py                |  4 --
 spacy/lang/ml/lex_attrs.py               |  3 -
 spacy/lang/ml/stop_words.py              |  4 --
 spacy/lang/pl/lemmatizer.py              |  3 -
 spacy/lang/sv/lex_attrs.py               |  3 -
 spacy/tests/lang/de/test_noun_chunks.py  |  3 -
 spacy/tests/lang/el/test_noun_chunks.py  |  3 -
 spacy/tests/lang/es/test_noun_chunks.py  |  3 -
 spacy/tests/lang/fa/test_noun_chunks.py  |  3 -
 spacy/tests/lang/fr/test_noun_chunks.py  |  3 -
 spacy/tests/lang/gu/test_text.py         |  3 -
 spacy/tests/lang/hy/test_text.py         |  3 -
 spacy/tests/lang/hy/test_tokenizer.py    |  3 -
 spacy/tests/lang/id/test_noun_chunks.py  |  3 -
 spacy/tests/lang/ja/test_serialize.py    |  4 --
 spacy/tests/lang/ml/test_text.py         |  3 -
 spacy/tests/lang/nb/test_noun_chunks.py  |  3 -
 spacy/tests/lang/sv/test_lex_attrs.py    |  3 -
 spacy/tests/lang/zh/test_serialize.py    |  3 -
 spacy/tests/regression/test_issue5152.py |  3 -
 spacy/tests/regression/test_issue5230.py |  1 -
 spacy/tests/regression/test_issue5458.py |  3 -
 35 files changed, 76 insertions(+), 147 deletions(-)

diff --git a/spacy/lang/es/punctuation.py b/spacy/lang/es/punctuation.py
index f989221c2..e9552371e 100644
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
diff --git a/spacy/lang/gu/__init__.py b/spacy/lang/gu/__init__.py
index 1f080c7c2..bc8fc260c 100644
--- a/spacy/lang/gu/__init__.py
+++ b/spacy/lang/gu/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 
 from ...language import Language
diff --git a/spacy/lang/gu/examples.py b/spacy/lang/gu/examples.py
index 202a8d022..1cf75fd32 100644
--- a/spacy/lang/gu/examples.py
+++ b/spacy/lang/gu/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
diff --git a/spacy/lang/gu/stop_words.py b/spacy/lang/gu/stop_words.py
index 85d33763d..2c859681b 100644
--- a/spacy/lang/gu/stop_words.py
+++ b/spacy/lang/gu/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 એમ
diff --git a/spacy/lang/hy/__init__.py b/spacy/lang/hy/__init__.py
index 6aaa965bb..8928e52ae 100644
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py
index 323f77b1c..69e354688 100644
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py
index b556d679c..f84472d60 100644
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py
index d75aad6e2..46d0f6b51 100644
--- a/spacy/lang/hy/stop_words.py
+++ b/spacy/lang/hy/stop_words.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 նա
diff --git a/spacy/lang/hy/tag_map.py b/spacy/lang/hy/tag_map.py
index 722270110..09be1fd8d 100644
--- a/spacy/lang/hy/tag_map.py
+++ b/spacy/lang/hy/tag_map.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
 
diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py
index 7c3eee336..e8c802246 100644
--- a/spacy/lang/ja/bunsetu.py
+++ b/spacy/lang/ja/bunsetu.py
@@ -1,21 +1,11 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-
-
 POS_PHRASE_MAP = {
     "NOUN": "NP",
     "NUM": "NP",
     "PRON": "NP",
     "PROPN": "NP",
-
     "VERB": "VP",
-
     "ADJ": "ADJP",
-
     "ADV": "ADVP",
-
     "CCONJ": "CCONJP",
 }
 
@@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
         dep = t.dep_
         head = t.head.i
         if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
+            print(
+                t.i,
+                t.orth_,
+                pos,
+                pos_type,
+                dep,
+                head,
+                bunsetu_may_end,
+                phrase_type,
+                phrase,
+                bunsetu,
+            )
 
         # DET is always an individual bunsetu
         if pos == "DET":
@@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):
 
         # entering new bunsetu
         elif pos_type and (
-            pos_type != phrase_type or  # different phrase type arises
-            bunsetu_may_end  # same phrase type but bunsetu already ended
+            pos_type != phrase_type
+            or bunsetu_may_end  # different phrase type arises  # same phrase type but bunsetu already ended
         ):
             # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
+            if (
+                phrase_type == "NP"
+                and pos_type == "VP"
+                and prev_dep == "compound"
+                and prev_head == t.i
+            ):
                 bunsetu.append(t)
                 phrase_type = "VP"
                 phrase.append(t)
             # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                prev_dep == 'compound' and prev_head == t.i or
-                dep == 'compound' and prev == head or
-                prev_dep == 'nmod' and prev_head == t.i
+            elif (
+                phrase_type == "VP"
+                and pos_type == "NP"
+                and (
+                    prev_dep == "compound"
+                    and prev_head == t.i
+                    or dep == "compound"
+                    and prev == head
+                    or prev_dep == "nmod"
+                    and prev_head == t.i
+                )
             ):
                 bunsetu.append(t)
                 phrase_type = "NP"
@@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
         # NOUN bunsetu
         elif phrase_type == "NP":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    (pos_type == "NP" or pos == "SYM")
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"compound", "nummod"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
@@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
         # VERB bunsetu
         elif phrase_type == "VP":
             bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
+            if (
+                not bunsetu_may_end
+                and pos == "VERB"
+                and prev_head == t.i
+                and prev_dep == "compound"
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
 
         # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
+        elif phrase_type == "ADJP" and tag != "連体詞":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    pos == "NOUN"
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"amod", "compound"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
diff --git a/spacy/lang/ja/syntax_iterators.py b/spacy/lang/ja/syntax_iterators.py
index cd1e4fde7..3f6e4bfa3 100644
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@@ -1,24 +1,22 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON, VERB
 
 # XXX this can probably be pruned a bit
 labels = [
-  "nsubj",
-  "nmod",
-  "dobj",
-  "nsubjpass",
-  "pcomp",
-  "pobj",
-  "obj",
-  "obl",
-  "dative",
-  "appos",
-  "attr",
-  "ROOT",
+    "nsubj",
+    "nmod",
+    "dobj",
+    "nsubjpass",
+    "pcomp",
+    "pobj",
+    "obj",
+    "obl",
+    "dative",
+    "appos",
+    "attr",
+    "ROOT",
 ]
 
+
 def noun_chunks(obj):
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
@@ -52,4 +50,5 @@ def noun_chunks(obj):
         seen.update(w.i for w in word.head.rights)
         yield unseen[0], word.i + 1, np_label
 
+
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
diff --git a/spacy/lang/kn/examples.py b/spacy/lang/kn/examples.py
index d82630432..3e055752e 100644
--- a/spacy/lang/kn/examples.py
+++ b/spacy/lang/kn/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
diff --git a/spacy/lang/ml/__init__.py b/spacy/lang/ml/__init__.py
index d052ded1b..e92a7617f 100644
--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 
 from ...language import Language
diff --git a/spacy/lang/ml/examples.py b/spacy/lang/ml/examples.py
index a2a0ed10e..9794eab29 100644
--- a/spacy/lang/ml/examples.py
+++ b/spacy/lang/ml/examples.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
 
diff --git a/spacy/lang/ml/lex_attrs.py b/spacy/lang/ml/lex_attrs.py
index 468ad88f8..9ac19b6a7 100644
--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
diff --git a/spacy/lang/ml/stop_words.py b/spacy/lang/ml/stop_words.py
index 8bd6a7e02..441e93586 100644
--- a/spacy/lang/ml/stop_words.py
+++ b/spacy/lang/ml/stop_words.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
     """
 അത്
diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py
index 8b8d7fe27..b80a1a143 100644
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 
diff --git a/spacy/lang/sv/lex_attrs.py b/spacy/lang/sv/lex_attrs.py
index 24d06a97a..f8ada9e2e 100644
--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
diff --git a/spacy/tests/lang/de/test_noun_chunks.py b/spacy/tests/lang/de/test_noun_chunks.py
index 8d76ddd79..ff9f8d5e5 100644
--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/el/test_noun_chunks.py b/spacy/tests/lang/el/test_noun_chunks.py
index 4f24865d0..38e72b0b2 100644
--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/es/test_noun_chunks.py b/spacy/tests/lang/es/test_noun_chunks.py
index 66bbd8c3a..a7ec4e562 100644
--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/fa/test_noun_chunks.py b/spacy/tests/lang/fa/test_noun_chunks.py
index a98aae061..767e91f6b 100644
--- a/spacy/tests/lang/fa/test_noun_chunks.py
+++ b/spacy/tests/lang/fa/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/fr/test_noun_chunks.py b/spacy/tests/lang/fr/test_noun_chunks.py
index ea93a5a35..5fd6897f7 100644
--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/gu/test_text.py b/spacy/tests/lang/gu/test_text.py
index aa8d442a2..2d251166f 100644
--- a/spacy/tests/lang/gu/test_text.py
+++ b/spacy/tests/lang/gu/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py
index cbdb77e4e..ac0f1e128 100644
--- a/spacy/tests/lang/hy/test_text.py
+++ b/spacy/tests/lang/hy/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 
 from spacy.lang.hy.lex_attrs import like_num
diff --git a/spacy/tests/lang/hy/test_tokenizer.py b/spacy/tests/lang/hy/test_tokenizer.py
index 3eeb8b54e..e9efb224a 100644
--- a/spacy/tests/lang/hy/test_tokenizer.py
+++ b/spacy/tests/lang/hy/test_tokenizer.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/id/test_noun_chunks.py b/spacy/tests/lang/id/test_noun_chunks.py
index add76f9b9..445643933 100644
--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/ja/test_serialize.py b/spacy/tests/lang/ja/test_serialize.py
index 018e645bb..9e703e63d 100644
--- a/spacy/tests/lang/ja/test_serialize.py
+++ b/spacy/tests/lang/ja/test_serialize.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir
 
diff --git a/spacy/tests/lang/ml/test_text.py b/spacy/tests/lang/ml/test_text.py
index 2883cf5bb..aced78461 100644
--- a/spacy/tests/lang/ml/test_text.py
+++ b/spacy/tests/lang/ml/test_text.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/nb/test_noun_chunks.py b/spacy/tests/lang/nb/test_noun_chunks.py
index 653491a64..c6a00354b 100644
--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
diff --git a/spacy/tests/lang/sv/test_lex_attrs.py b/spacy/tests/lang/sv/test_lex_attrs.py
index abe6b0f7b..656c4706b 100644
--- a/spacy/tests/lang/sv/test_lex_attrs.py
+++ b/spacy/tests/lang/sv/test_lex_attrs.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 from spacy.lang.sv.lex_attrs import like_num
diff --git a/spacy/tests/lang/zh/test_serialize.py b/spacy/tests/lang/zh/test_serialize.py
index 56f092ed8..d84920c3e 100644
--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.zh import Chinese
 from ...util import make_tempdir
diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py
index 758ac9c14..a9a57746d 100644
--- a/spacy/tests/regression/test_issue5152.py
+++ b/spacy/tests/regression/test_issue5152.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 
 
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index b46bf9063..9ffa3862c 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -1,4 +1,3 @@
-# coding: utf8
 import warnings
 from unittest import TestCase
 import pytest
diff --git a/spacy/tests/regression/test_issue5458.py b/spacy/tests/regression/test_issue5458.py
index 3281e2a8c..a7a2959df 100644
--- a/spacy/tests/regression/test_issue5458.py
+++ b/spacy/tests/regression/test_issue5458.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tests.util import get_doc