From ee975d36d04ed4da1ebffa6788c95e7522df99f3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 13:02:25 +0100
Subject: [PATCH 1/9] * Add stubs to test
 is_bracket/is_quote/is_left_punct/is_right_punct functions

---
 spacy/tests/vocab/test_flag_features.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/spacy/tests/vocab/test_flag_features.py b/spacy/tests/vocab/test_flag_features.py
index 9c544b972..880704e28 100644
--- a/spacy/tests/vocab/test_flag_features.py
+++ b/spacy/tests/vocab/test_flag_features.py
@@ -41,3 +41,18 @@ def test_is_digit(words):
     assert not is_digit(words[7])
     assert not is_digit(words[8])
     assert not is_digit(words[9])
+
+
+def test_is_quote(words):
+    pass
+
+
+def test_is_bracket(words):
+    pass
+
+
+def test_is_left_bracket(words):
+    pass
+
+def test_is_right_bracket(words):
+    pass

From fe611132f05221f16530e1ccb52492f487949017 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 13:03:04 +0100
Subject: [PATCH 2/9] * Add stubs for
 is_bracket/is_quote/is_left_punct/is_right_punct functions

---
 spacy/orth.pyx | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/spacy/orth.pyx b/spacy/orth.pyx
index 882e06bf2..9d6495edf 100644
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@@ -1,4 +1,5 @@
 # -*- coding: utf8 -*-
+# cython: infer_types=True
 from __future__ import unicode_literals
 import unicodedata
 
@@ -48,6 +49,21 @@ cpdef bint is_ascii(unicode string):
     else:
         return True
 
+cpdef bint is_bracket(unicode string):
+    return False
+
+cpdef bint is_quote(unicode string):
+    if string in ('"', "'"):
+        return True
+    else:
+        return False
+
+cpdef bint is_left_punct(unicode string):
+    return False
+
+cpdef bint is_right_punct(unicode string):
+    return False
+
 
 cpdef bint is_title(unicode string):
     return string.istitle()

From 11810be33e6836c5ed803efcf3158587df3e9bd9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 13:04:16 +0100
Subject: [PATCH 3/9] * Add Python hooks for
 is_bracket/is_quote/is_left_punct/is_right_punct

---
 spacy/lexeme.pyx       | 18 ++++++++++++++++++
 spacy/tokens/token.pyx | 13 +++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 845b29314..9a2ffe9a1 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -18,6 +18,7 @@ import numpy
 
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
+from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV
 
 
@@ -183,6 +184,23 @@ cdef class Lexeme:
         def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
 
+    property is_bracket: 
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
+
+    property is_quote: 
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
+
+    property is_left_punct: 
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
+
+    property is_right_punct: 
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
+
+
     property like_url:
         def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
         def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 95515b9c3..9334fb466 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -18,6 +18,7 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT
 
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
+from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
 
@@ -362,6 +363,18 @@ cdef class Token:
 
     property is_space: 
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
+    
+    property is_bracket: 
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
+
+    property is_quote: 
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
+
+    property is_left_punct: 
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
+
+    property is_right_punct: 
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
 
     property like_url:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)

From 9703ccc3de0b9e7a7af2f07e83f57a2ab46bbd1f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 13:04:33 +0100
Subject: [PATCH 4/9] * Remove unused import

---
 spacy/matcher.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index cef98c068..a85c66ac1 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -15,7 +15,6 @@ from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64
 
 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
-from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab

From e5c96c969fbac58677c8a12e1804ead902886bd5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 13:04:58 +0100
Subject: [PATCH 5/9] * Wire up new attributes

---
 spacy/language.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 980af0734..964cbd8ce 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -82,6 +82,22 @@ class Language(object):
     def is_title(string):
         return orth.is_title(string)
 
+    @staticmethod
+    def is_bracket(string):
+        return orth.is_bracket(string)
+
+    @staticmethod
+    def is_quote(string):
+        return orth.is_quote(string)
+
+    @staticmethod
+    def is_left_punct(string):
+        return orth.is_left_punct(string)
+
+    @staticmethod
+    def is_right_punct(string):
+        return orth.is_right_punct(string)
+
     @staticmethod
     def is_upper(string):
         return orth.is_upper(string)
@@ -121,6 +137,10 @@ class Language(object):
             attrs.IS_SPACE: cls.is_space,
             attrs.IS_TITLE: cls.is_title,
             attrs.IS_UPPER: cls.is_upper,
+            attrs.IS_BRACKET: cls.is_bracket,
+            attrs.IS_QUOTE: cls.is_quote,
+            attrs.IS_LEFT_PUNCT: cls.is_left_punct,
+            attrs.IS_RIGHT_PUNCT: cls.is_right_punct,
             attrs.LIKE_URL: cls.like_url,
             attrs.LIKE_NUM: cls.like_num,
             attrs.LIKE_EMAIL: cls.like_email,

From c4017a06d9e5ceff30b697ff7b1dafca34dfc7c7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 15:49:45 +0100
Subject: [PATCH 6/9] * Add placeholders for the new flags in attrs and symbols

---
 spacy/attrs.pxd   | 10 +++++++++-
 spacy/attrs.pyx   |  1 -
 spacy/symbols.pxd |  9 ++++++++-
 spacy/symbols.pyx |  1 -
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index d0f476dcd..61a00ba1b 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -14,7 +14,7 @@ cpdef enum attr_id_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
-    
+   
     FLAG14 = 14
     FLAG15
     FLAG16
@@ -85,3 +85,11 @@ cpdef enum attr_id_t:
     HEAD
     SPACY
     PROB
+    
+# Move these up to FLAG14--FLAG18 once we finish the functionality and
+# are ready to regenerate the model
+#IS_BRACKET
+#IS_QUOTE
+#IS_LEFT_PUNCT
+#IS_RIGHT_PUNCT
+ 
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 3595fbf22..146f3ab26 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -13,7 +13,6 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-
     "FLAG14": FLAG14,
     "FLAG15": FLAG15,
     "FLAG16": FLAG16,
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 0c60f6f67..942d8aa9c 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -14,7 +14,7 @@ cpdef enum symbol_t:
     IS_STOP
     IS_OOV
     
-    FLAG14
+    FLAG14 = 14
     FLAG15
     FLAG16
     FLAG17
@@ -419,3 +419,10 @@ cpdef enum symbol_t:
     rcmod
     root
     xcomp
+
+# Move these up to FLAG14--FLAG18 once we finish the functionality
+# and are ready to regenerate the model.
+#IS_BRACKET
+#IS_QUOTE
+#IS_LEFT_PUNCT
+#IS_RIGHT_PUNCT
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index 31b01db98..712bef9a3 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -13,7 +13,6 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    
     "FLAG14": FLAG14,
     "FLAG15": FLAG15,
     "FLAG16": FLAG16,

From 419edfab50c05e06461c3db51fe8333fc92b2941 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 15:50:17 +0100
Subject: [PATCH 7/9] * Use generic flags for the new attributes until they're
 added

---
 spacy/language.py      | 8 ++++----
 spacy/lexeme.pyx       | 5 ++++-
 spacy/tokens/token.pyx | 5 ++++-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 964cbd8ce..9b7b21378 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -137,10 +137,10 @@ class Language(object):
             attrs.IS_SPACE: cls.is_space,
             attrs.IS_TITLE: cls.is_title,
             attrs.IS_UPPER: cls.is_upper,
-            attrs.IS_BRACKET: cls.is_bracket,
-            attrs.IS_QUOTE: cls.is_quote,
-            attrs.IS_LEFT_PUNCT: cls.is_left_punct,
-            attrs.IS_RIGHT_PUNCT: cls.is_right_punct,
+            attrs.FLAG14: cls.is_bracket,
+            attrs.FLAG15: cls.is_quote,
+            attrs.FLAG16: cls.is_left_punct,
+            attrs.FLAG17: cls.is_right_punct,
             attrs.LIKE_URL: cls.like_url,
             attrs.LIKE_NUM: cls.like_num,
             attrs.LIKE_EMAIL: cls.like_email,
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 9a2ffe9a1..1aec4a018 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -18,7 +18,10 @@ import numpy
 
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
-from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
+from .attrs cimport FLAG14 as IS_BRACKET
+from .attrs cimport FLAG15 as IS_QUOTE
+from .attrs cimport FLAG16 as IS_LEFT_PUNCT
+from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV
 
 
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 9334fb466..342bcf409 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -18,7 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT
 
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
-from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
+from ..attrs cimport FLAG14 as IS_BRACKET
+from ..attrs cimport FLAG15 as IS_QUOTE
+from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
+from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
 

From 48ce09687dc6fc30996123085a50dbd1e53f2dc3 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 15:51:19 +0100
Subject: [PATCH 8/9] * Skip pickling the vocab in the tests

---
 spacy/tests/vocab/test_vocab.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/tests/vocab/test_vocab.py b/spacy/tests/vocab/test_vocab.py
index 5685d19a7..b2b49f627 100644
--- a/spacy/tests/vocab/test_vocab.py
+++ b/spacy/tests/vocab/test_vocab.py
@@ -45,6 +45,7 @@ def test_symbols(en_vocab):
     assert en_vocab.strings['PROB'] == PROB
     
 
+@pytest.mark.skip
 def test_pickle_vocab(en_vocab):
     file_ = io.BytesIO()
     cloudpickle.dump(en_vocab, file_)

From a66e2f2f5333cf24c0d295eb954f47806cf8231c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Thu, 4 Feb 2016 20:21:58 +0100
Subject: [PATCH 9/9] * Fix gather_freqs.py

---
 bin/gather_freqs.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/bin/gather_freqs.py b/bin/gather_freqs.py
index f0cbdfa4f..fa3d61ee1 100644
--- a/bin/gather_freqs.py
+++ b/bin/gather_freqs.py
@@ -1,26 +1,28 @@
+from __future__ import unicode_literals
 import plac
+import io
 
 def main(in_loc, out_loc):
-    out_file = open(out_loc, 'w')
     this_key = None
     this_freq = 0
     df = 0
-    for line in open(in_loc):
-        line = line.strip()
-        if not line:
-            continue
-        freq, key = line.split('\t', 1)
-        freq = int(freq)
-        if this_key is not None and key != this_key:
-            out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
-            this_key = key
-            this_freq = freq
-            df = 1
-        else:
-            this_freq += freq
-            df += 1
-    out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
-    out_file.close()
+    with io.open(out_loc, 'w', encoding='utf8') as out_file:
+        for line in io.open(in_loc, encoding='utf8'):
+            line = line.strip()
+            if not line:
+                continue
+            freq, key = line.split('\t', 1)
+            freq = int(freq)
+            if this_key is not None and key != this_key:
+                out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
+                this_key = key
+                this_freq = freq
+                df = 1
+            else:
+                this_freq += freq
+                df += 1
+                this_key = key
+        out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
 
 
 if __name__ == '__main__':