From 8199012d26b12caf0a3791676e213c5a29966be0 Mon Sep 17 00:00:00 2001 From: alvations Date: Wed, 30 Sep 2015 20:10:15 +0200 Subject: [PATCH 01/62] changing deprecated codecs.open to io.open =) --- spacy/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 1d48ab7e9..34a660c4c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,5 +1,5 @@ from os import path -import codecs +import io import json import re @@ -7,7 +7,7 @@ DATA_DIR = path.join(path.dirname(__file__), '..', 'data') def utf8open(loc, mode='r'): - return codecs.open(loc, mode, 'utf8') + return io.open(loc, mode, encoding='utf8') def read_lang_data(data_dir): From 764bdc62e7f4e91ef571d6b655da8e53b7839447 Mon Sep 17 00:00:00 2001 From: alvations Date: Wed, 30 Sep 2015 20:16:52 +0200 Subject: [PATCH 02/62] caught another codecs.open --- bin/parser/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 267b26275..57889511d 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -5,7 +5,7 @@ from __future__ import unicode_literals import os from os import path import shutil -import codecs +import io import random import plac @@ -169,7 +169,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc): nlp = Language() gold_tuples = read_docparse_file(dev_loc) scorer = Scorer() - out_file = codecs.open(out_loc, 'w', 'utf8') + out_file = io.open(out_loc, 'w', encoding='utf8') for raw_text, segmented_text, annot_tuples in gold_tuples: tokens = nlp(raw_text) for t in tokens: From 8caedba42a5255b9996533a732e17eee3f20a2dd Mon Sep 17 00:00:00 2001 From: alvations Date: Wed, 30 Sep 2015 20:20:09 +0200 Subject: [PATCH 03/62] caught more codecs.open -> io.open --- bin/init_model.py | 6 +++--- bin/ner_tag.py | 4 ++-- bin/prepare_treebank.py | 4 ++-- spacy/en/lemmatizer.py | 6 +++--- spacy/gold.pyx | 2 +- spacy/strings.pyx | 6 +++--- spacy/vocab.pyx | 2 +- tests/test_parse_navigate.py | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index a75bd9827..ba99808f0 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -20,7 +20,7 @@ from pathlib import Path from shutil import copyfile from shutil import copytree -import codecs +import io from spacy.en import get_lex_props from spacy.vocab import Vocab @@ -41,7 +41,7 @@ def setup_tokenizer(lang_data_dir, tok_dir): def _read_clusters(loc): clusters = {} - for line in codecs.open(str(loc), 'r', 'utf8'): + for line in io.open(str(loc), 'r', encoding='utf8'): try: cluster, word, freq = line.split() except ValueError: @@ -65,7 +65,7 @@ def _read_clusters(loc): def _read_probs(loc): probs = {} - for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')): + for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')): prob, word = line.split() prob = float(prob) probs[word] = prob diff --git a/bin/ner_tag.py b/bin/ner_tag.py index 34588bd12..f990f21a1 100644 --- a/bin/ner_tag.py +++ b/bin/ner_tag.py @@ -1,11 +1,11 @@ -import codecs +import io import plac from spacy.en import English def main(text_loc): - with codecs.open(text_loc, 'r', 'utf8') as file_: + with io.open(text_loc, 'r', encoding='utf8') as file_: text = file_.read() NLU = English() for paragraph in text.split('\n\n'): diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py index d13ef7130..f9f4eec21 100644 --- a/bin/prepare_treebank.py +++ b/bin/prepare_treebank.py @@ -27,7 +27,7 @@ import json from os import path import os import 
re -import codecs +import io from collections import defaultdict from spacy.munge import read_ptb @@ -122,7 +122,7 @@ def read_file(*pieces): if not path.exists(loc): return None else: - return codecs.open(loc, 'r', 'utf8').read().strip() + return io.open(loc, 'r', encoding='utf8').read().strip() def get_file_names(section_dir, subsection): diff --git a/spacy/en/lemmatizer.py b/spacy/en/lemmatizer.py index 5883e12c8..a9625f0e9 100644 --- a/spacy/en/lemmatizer.py +++ b/spacy/en/lemmatizer.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals from os import path -import codecs +import io NOUN_RULES = ( @@ -85,7 +85,7 @@ def lemmatize(string, index, exceptions, rules): def read_index(loc): index = set() - for line in codecs.open(loc, 'r', 'utf8'): + for line in io.open(loc, 'r', encoding='utf8'): if line.startswith(' '): continue pieces = line.split() @@ -97,7 +97,7 @@ def read_index(loc): def read_exc(loc): exceptions = {} - for line in codecs.open(loc, 'r', 'utf8'): + for line in io.open(loc, 'r', encoding='utf8'): if line.startswith(' '): continue pieces = line.split() diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cab4ba8a1..4fe5c6b52 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,5 +1,5 @@ import numpy -import codecs +import io import json import ujson import random diff --git a/spacy/strings.pyx b/spacy/strings.pyx index e15f88837..8cf735bb6 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,4 +1,4 @@ -import codecs +import io from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 @@ -112,11 +112,11 @@ cdef class StringStore: string = &self.strings[i] py_string = string.chars[:string.length] strings.append(py_string.decode('utf8')) - with codecs.open(loc, 'w', 'utf8') as file_: + with io.open(loc, 'w', encoding='utf8') as file_: file_.write(SEPARATOR.join(strings)) def load(self, loc): - with codecs.open(loc, 'r', 'utf8') as file_: + with io.open(loc, 'r', encoding='utf8') as file_: strings = file_.read().split(SEPARATOR) cdef unicode string cdef bytes byte_string diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index c93e4202f..475b06dd1 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -4,7 +4,7 @@ from libc.stdint cimport int32_t import bz2 from os import path -import codecs +import io import math from .lexeme cimport EMPTY_LEXEME diff --git a/tests/test_parse_navigate.py b/tests/test_parse_navigate.py index cf6971c89..1fff0f684 100644 --- a/tests/test_parse_navigate.py +++ b/tests/test_parse_navigate.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals from os import path -import codecs +import io from spacy.en import English @@ -9,7 +9,7 @@ import pytest @pytest.fixture def sun_text(): - with codecs.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', 'utf8') as file_: + with io.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', encoding='utf8') as file_: text = file_.read() return text From 73566899bf3bde655a9437af601fe5744f700a66 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 00:51:25 -0700 Subject: [PATCH 04/62] Add Doc slicing tests --- tests/tokens/test_tokens_api.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index e1238373f..a7311932f 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -12,6 +12,15 @@ def test_getitem(EN): with pytest.raises(IndexError): tokens[len(tokens)] + span = tokens[1:1] + assert not '/'.join(token.orth_ for token in span) + span = tokens[1:4] + assert 
'/'.join(token.orth_ for token in span) == 'it/back/!' + with pytest.raises(ValueError): + tokens[1:4:2] + with pytest.raises(ValueError): + tokens[1:4:-1] + @pytest.mark.models def test_serialize(EN): From 2fc33e8024487974c6fbc6941026b75f8e89a07b Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 00:56:33 -0700 Subject: [PATCH 05/62] Allow step=1 when slicing a Doc --- spacy/tokens/doc.pyx | 2 +- tests/tokens/test_tokens_api.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 8a7d12555..ce278d868 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -87,7 +87,7 @@ cdef class Doc: token (Token): """ if isinstance(i, slice): - if i.step is not None: + if not (i.step is None or i.step == 1): raise ValueError("Stepped slices not supported in Span objects." "Try: list(doc)[start:stop:step] instead.") if i.start is None: diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index a7311932f..fc1b52143 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -16,6 +16,8 @@ def test_getitem(EN): assert not '/'.join(token.orth_ for token in span) span = tokens[1:4] assert '/'.join(token.orth_ for token in span) == 'it/back/!' + span = tokens[1:4:1] + assert '/'.join(token.orth_ for token in span) == 'it/back/!' with pytest.raises(ValueError): tokens[1:4:2] with pytest.raises(ValueError): From ef2af20cd373583b6d4ee6cc06ce8ca8406fba8c Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 01:59:11 -0700 Subject: [PATCH 06/62] Make Doc's slicing behavior conform to Python conventions --- spacy/tokens/spans.pyx | 8 +++++-- tests/tokens/test_tokens_api.py | 40 ++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index c39f8976c..99efad4b9 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -16,9 +16,13 @@ cdef class Span: def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): if start < 0: - start = tokens.length - start + start = tokens.length + start + start = min(tokens.length, max(0, start)) + if end < 0: - end = tokens.length - end + end = tokens.length + end + end = min(tokens.length, max(start, end)) + self.doc = tokens self.start = start self.end = end diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index fc1b52143..a272a8e3b 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -12,17 +12,51 @@ def test_getitem(EN): with pytest.raises(IndexError): tokens[len(tokens)] + def to_str(span): + return '/'.join(token.orth_ for token in span) + span = tokens[1:1] - assert not '/'.join(token.orth_ for token in span) + assert not to_str(span) span = tokens[1:4] - assert '/'.join(token.orth_ for token in span) == 'it/back/!' + assert to_str(span) == 'it/back/!' span = tokens[1:4:1] - assert '/'.join(token.orth_ for token in span) == 'it/back/!' + assert to_str(span) == 'it/back/!' with pytest.raises(ValueError): tokens[1:4:2] with pytest.raises(ValueError): tokens[1:4:-1] + span = tokens[-3:6] + assert to_str(span) == 'He/pleaded' + span = tokens[4:-1] + assert to_str(span) == 'He/pleaded' + span = tokens[-5:-3] + assert to_str(span) == 'back/!' 
+ span = tokens[5:4] + assert span.start == span.end == 5 and not to_str(span) + span = tokens[4:-3] + assert span.start == span.end == 4 and not to_str(span) + + span = tokens[:] + assert to_str(span) == 'Give/it/back/!/He/pleaded/.' + span = tokens[4:] + assert to_str(span) == 'He/pleaded/.' + span = tokens[:4] + assert to_str(span) == 'Give/it/back/!' + span = tokens[:-3] + assert to_str(span) == 'Give/it/back/!' + span = tokens[-3:] + assert to_str(span) == 'He/pleaded/.' + + span = tokens[4:50] + assert to_str(span) == 'He/pleaded/.' + span = tokens[-50:4] + assert to_str(span) == 'Give/it/back/!' + span = tokens[-50:-40] + assert span.start == span.end == 0 and not to_str(span) + span = tokens[40:50] + assert span.start == span.end == 7 and not to_str(span) + @pytest.mark.models def test_serialize(EN): From 5cc2f2b01ab26e313a7035f998fc1b4373cb6cc5 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 02:08:39 -0700 Subject: [PATCH 07/62] Test simple indexing for Span --- tests/tokens/test_tokens_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index a272a8e3b..34e54a2af 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -57,6 +57,9 @@ def test_getitem(EN): span = tokens[40:50] assert span.start == span.end == 7 and not to_str(span) + span = tokens[1:4] + assert span[0].orth_ == 'it' + @pytest.mark.models def test_serialize(EN): From 97685aecb735289de32c992e3659e503412aeeb5 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 02:45:49 -0700 Subject: [PATCH 08/62] Add slicing support to Span --- spacy/tokens/spans.pyx | 21 ++++++++++++++++++++- tests/tokens/test_tokens_api.py | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 99efad4b9..955d24ad4 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -50,7 +50,26 @@ cdef class Span: return 0 return self.end - self.start - def __getitem__(self, int i): + def __getitem__(self, object i): + if isinstance(i, slice): + start, end, step = i.start, i.stop, i.step + if start is None: + start = 0 + elif start < 0: + start += len(self) + start = min(len(self), max(0, start)) + + if end is None: + end = len(self) + elif end < 0: + end += len(self) + end = min(len(self), max(start, end)) + + start += self.start + end += self.start + + return self.doc[start:end:i.step] + if i < 0: return self.doc[self.end + i] else: diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index 34e54a2af..675f00235 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -59,6 +59,24 @@ def test_getitem(EN): span = tokens[1:4] assert span[0].orth_ == 'it' + subspan = span[:] + assert to_str(subspan) == 'it/back/!' + subspan = span[:2] + assert to_str(subspan) == 'it/back' + subspan = span[1:] + assert to_str(subspan) == 'back/!' + subspan = span[:-1] + assert to_str(subspan) == 'it/back' + subspan = span[-2:] + assert to_str(subspan) == 'back/!' + subspan = span[1:2] + assert to_str(subspan) == 'back' + subspan = span[-2:-1] + assert to_str(subspan) == 'back' + subspan = span[-50:50] + assert to_str(subspan) == 'it/back/!' 
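# Illustrative sketch (editorial, not part of the patch): the clamping behaviour
# exercised by the slicing tests above agrees, for the cases shown, with what
# Python's built-in slice.indices() computes for a sequence of the same length,
# so plain Python can predict the expected Doc/Span bounds.  Here `length`
# stands in for len(doc) or len(span).
def clamp(length, start, stop):
    # slice.indices() converts negative indices and clamps both ends to [0, length]
    start, stop, _ = slice(start, stop).indices(length)
    return start, stop

assert clamp(7, -50, 4) == (0, 4)    # cf. tokens[-50:4] -> 'Give/it/back/!'
assert clamp(7, 4, 50) == (4, 7)     # cf. tokens[4:50]  -> 'He/pleaded/.'
assert clamp(7, -50, -40) == (0, 0)  # cf. tokens[-50:-40] -> empty span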
+ subspan = span[50:-50] + assert subspan.start == subspan.end == 4 and not to_str(subspan) @pytest.mark.models From 3fd3bc79aa7fa5c1c1ae360b49b3d2a1da6b0f36 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Wed, 7 Oct 2015 01:25:35 -0700 Subject: [PATCH 09/62] Refactor to remove duplicate slicing logic --- spacy/tokens/doc.pyx | 11 +++-------- spacy/tokens/spans.pyx | 27 +++++---------------------- spacy/util.py | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ce278d868..b78214ba9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -21,6 +21,7 @@ from ..lexeme cimport Lexeme from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray +from ..util import normalize_slice DEF PADDING = 5 @@ -87,14 +88,8 @@ cdef class Doc: token (Token): """ if isinstance(i, slice): - if not (i.step is None or i.step == 1): - raise ValueError("Stepped slices not supported in Span objects." - "Try: list(doc)[start:stop:step] instead.") - if i.start is None: - i = slice(0, i.stop) - if i.stop is None: - i = slice(i.start, len(self)) - return Span(self, i.start, i.stop, label=0) + start, stop = normalize_slice(len(self), i.start, i.stop, i.step) + return Span(self, start, stop, label=0) if i < 0: i = self.length + i diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 955d24ad4..e8d2f2e59 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -9,19 +9,15 @@ from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t +from ..util import normalize_slice cdef class Span: """A slice from a Doc object.""" def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): - if start < 0: - start = tokens.length + start - start = min(tokens.length, max(0, start)) - - if end < 0: - end = tokens.length + end - end = min(tokens.length, max(start, end)) + if not (0 <= start <= end <= len(tokens)): + raise IndexError self.doc = tokens self.start = start @@ -52,23 +48,10 @@ cdef class Span: def __getitem__(self, object i): if isinstance(i, slice): - start, end, step = i.start, i.stop, i.step - if start is None: - start = 0 - elif start < 0: - start += len(self) - start = min(len(self), max(0, start)) - - if end is None: - end = len(self) - elif end < 0: - end += len(self) - end = min(len(self), max(start, end)) - + start, end = normalize_slice(len(self), i.start, i.stop, i.step) start += self.start end += self.start - - return self.doc[start:end:i.step] + return Span(self.doc, start, end) if i < 0: return self.doc[self.end + i] diff --git a/spacy/util.py b/spacy/util.py index 9f5b4fe04..449b06399 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,6 +7,26 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE DATA_DIR = path.join(path.dirname(__file__), '..', 'data') +def normalize_slice(length, start, stop, step=None): + if not (step is None or step == 1): + raise ValueError("Stepped slices not supported in Span objects." 
+ "Try: list(tokens)[start:stop:step] instead.") + if start is None: + start = 0 + elif start < 0: + start += length + start = min(length, max(0, start)) + + if stop is None: + stop = length + elif stop < 0: + stop += length + stop = min(length, max(start, stop)) + + assert 0 <= start <= stop <= length + return start, stop + + def utf8open(loc, mode='r'): return codecs.open(loc, mode, 'utf8') From 0f601b8b750a8991d333a7a95f97b74b80b46846 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Wed, 7 Oct 2015 01:27:28 -0700 Subject: [PATCH 10/62] Update docstring of Doc.__getitem__ --- spacy/tokens/doc.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b78214ba9..eab6c044e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -82,10 +82,10 @@ cdef class Doc: self._vector = None def __getitem__(self, object i): - """Get a token. + """Get a Token or a Span from the Doc. Returns: - token (Token): + token (Token) or span (Span): """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) From 5890682ed1676a5d6d1f27e6a95a740c8faf31f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 13:59:32 +1100 Subject: [PATCH 11/62] * Fix multi_word_matches script --- examples/multi_word_matches.py | 101 +++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py index 06cc313a9..59d3c2a63 100644 --- a/examples/multi_word_matches.py +++ b/examples/multi_word_matches.py @@ -22,6 +22,10 @@ our pattern set stays very small (exact size depends on the maximum length we're looking for, as the query language currently has no quantifiers) """ from __future__ import print_function, unicode_literals, division +from ast import literal_eval +from bz2 import BZ2File +import time +import math import plac @@ -30,22 +34,66 @@ from spacy.strings import hash_string from spacy.en import English from spacy.matcher import Matcher -from spacy.attrs import FLAG63 as U_ENT +from spacy.attrs import FLAG63 as B_ENT from spacy.attrs import FLAG62 as L_ENT from spacy.attrs import FLAG61 as I_ENT -from spacy.attrs import FLAG60 as B_ENT + +from spacy.attrs import FLAG60 as B2_ENT +from spacy.attrs import FLAG59 as B3_ENT +from spacy.attrs import FLAG58 as B4_ENT +from spacy.attrs import FLAG57 as B5_ENT +from spacy.attrs import FLAG56 as B6_ENT +from spacy.attrs import FLAG55 as B7_ENT +from spacy.attrs import FLAG54 as B8_ENT +from spacy.attrs import FLAG53 as B9_ENT +from spacy.attrs import FLAG52 as B10_ENT + +from spacy.attrs import FLAG51 as I3_ENT +from spacy.attrs import FLAG50 as I4_ENT +from spacy.attrs import FLAG49 as I5_ENT +from spacy.attrs import FLAG48 as I6_ENT +from spacy.attrs import FLAG47 as I7_ENT +from spacy.attrs import FLAG46 as I8_ENT +from spacy.attrs import FLAG45 as I9_ENT +from spacy.attrs import FLAG44 as I10_ENT + +from spacy.attrs import FLAG43 as L2_ENT +from spacy.attrs import FLAG42 as L3_ENT +from spacy.attrs import FLAG41 as L4_ENT +from spacy.attrs import FLAG40 as L5_ENT +from spacy.attrs import FLAG39 as L6_ENT +from spacy.attrs import FLAG38 as L7_ENT +from spacy.attrs import FLAG37 as L8_ENT +from spacy.attrs import FLAG36 as L9_ENT +from spacy.attrs import FLAG35 as L10_ENT def get_bilou(length): if length == 1: return [U_ENT] - else: - return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT] + elif length == 2: + return [B2_ENT, L2_ENT] + elif length == 3: + return 
[B3_ENT, I3_ENT, L3_ENT] + elif length == 4: + return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] + elif length == 5: + return [B5_ENT, I5_ENT, I5_ENT, L5_ENT] + elif length == 6: + return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] + elif length == 7: + return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] + elif length == 8: + return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] + elif length == 9: + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + elif length == 10: + return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, L10_ENT] def make_matcher(vocab, max_length): abstract_patterns = [] - for length in range(1, max_length+1): + for length in range(2, max_length): abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)}) @@ -66,29 +114,48 @@ def merge_matches(doc, matches): doc.merge(start, end, tag, text, 'MWE') -def main(): - nlp = English(parser=False, tagger=False, entity=False) +def read_gazetteer(loc): + for line in open(loc): + phrase = literal_eval('u' + line.strip()) + if ' (' in phrase and phrase.endswith(')'): + phrase = phrase.split(' (', 1)[0] + yield phrase - gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones'] - example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.' +def read_text(bz2_loc): + with BZ2File(bz2_loc) as file_: + for line in file_: + yield line.decode('utf8') + +def main(patterns_loc, text_loc): + nlp = English(parser=False, tagger=False, entity=False) + pattern_ids = PreshMap() - max_length = 0 - for pattern_str in gazetteer: + max_length = 10 + i = 0 + for pattern_str in read_gazetteer(patterns_loc): pattern = nlp.tokenizer(pattern_str) + if len(pattern) < 2 or len(pattern) >= max_length: + continue bilou_tags = get_bilou(len(pattern)) for word, tag in zip(pattern, bilou_tags): lexeme = nlp.vocab[word.orth] lexeme.set_flag(tag, True) pattern_ids[hash_string(pattern.text)] = True - max_length = max(max_length, len(pattern)) + i += 1 + if i >= 10000001: + break matcher = make_matcher(nlp.vocab, max_length) - doc = nlp(example_text) - matches = get_matches(matcher, pattern_ids, doc) - merge_matches(doc, matches) - for token in doc: - print(token.text, token.ent_type_) + t1 = time.time() + + for text in read_text(text_loc): + doc = nlp.tokenizer(text) + matches = get_matches(matcher, pattern_ids, doc) + merge_matches(doc, matches) + t2 = time.time() + print('10 ^ %d patterns took %d s' % (round(math.log(i, 10)), t2-t1)) + if __name__ == '__main__': From 2d68f75b6a3ccca4f4f3cdda257eccc0f3c0e0ea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 13:59:56 +1100 Subject: [PATCH 12/62] * Fix identity tag map --- lang_data/fi/tag_map.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lang_data/fi/tag_map.json b/lang_data/fi/tag_map.json index 6b21a1e29..4451d0fa0 100644 --- a/lang_data/fi/tag_map.json +++ b/lang_data/fi/tag_map.json @@ -13,5 +13,7 @@ "ADP": {"pos": "ADP"}, "SYM": {"pos": "SYM"}, "X": {"pos": "X"}, - "INTJ": {"pos": "INTJ"} + "INTJ": {"pos": "INTJ"}, + "DET": {"pos": "DET"}, + "PART": {"pos": "PART"} } From e3e8994368322c6263f7ae797732e013e3cd6def Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 14:00:13 +1100 Subject: [PATCH 13/62] * Patch italian tag map --- lang_data/it/tag_map.json | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git 
a/lang_data/it/tag_map.json b/lang_data/it/tag_map.json index 514e978a6..92f11e457 100644 --- a/lang_data/it/tag_map.json +++ b/lang_data/it/tag_map.json @@ -2,43 +2,43 @@ "S": {"pos": "NOUN"}, "E": {"pos": "ADP"}, "RD": {"pos": "DET"}, -"V": {"pos": "VER"}, -"_": {"pos": "_"}, +"V": {"pos": "VERB"}, +"_": {"pos": "NO_TAG"}, "A": {"pos": "ADJ"}, -"SP": {"pos": "PROP"}, -"FF": {"pos": "PUNC"}, -"FS": {"pos": "PUNC"}, +"SP": {"pos": "PROPN"}, +"FF": {"pos": "PUNCT"}, +"FS": {"pos": "PUNCT"}, "B": {"pos": "ADV"}, -"CC": {"pos": "CON"}, -"FB": {"pos": "PUNC"}, +"CC": {"pos": "CONJ"}, +"FB": {"pos": "PUNCT"}, "VA": {"pos": "AUX"}, -"PC": {"pos": "PRO"}, +"PC": {"pos": "PRON"}, "N": {"pos": "NUM"}, "RI": {"pos": "DET"}, -"PR": {"pos": "PRO"}, -"CS": {"pos": "SCON"}, +"PR": {"pos": "PRON"}, +"CS": {"pos": "SCONJ"}, "BN": {"pos": "ADV"}, "AP": {"pos": "DET"}, "VM": {"pos": "AUX"}, "DI": {"pos": "DET"}, -"FC": {"pos": "PUNC"}, -"PI": {"pos": "PRO"}, +"FC": {"pos": "PUNCT"}, +"PI": {"pos": "PRON"}, "DD": {"pos": "DET"}, "DQ": {"pos": "DET"}, -"PQ": {"pos": "PRO"}, -"PD": {"pos": "PRO"}, +"PQ": {"pos": "PRON"}, +"PD": {"pos": "PRON"}, "NO": {"pos": "ADJ"}, -"PE": {"pos": "PRO"}, +"PE": {"pos": "PRON"}, "T": {"pos": "DET"}, "X": {"pos": "SYM"}, "SW": {"pos": "X"}, -"NO": {"pos": "PRO"}, -"I": {"pos": "INT"}, +"NO": {"pos": "PRON"}, +"I": {"pos": "INTJ"}, "X": {"pos": "X"}, "DR": {"pos": "DET"}, "EA": {"pos": "ADP"}, -"PP": {"pos": "PRO"}, +"PP": {"pos": "PRON"}, "X": {"pos": "NUM"}, "DE": {"pos": "DET"}, -"X": {"pos": "PAR"} +"X": {"pos": "PART"} } From 4513bed175bf05a0eb0a4365c1bf934d4dde12d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 14:00:34 +1100 Subject: [PATCH 14/62] * Avoid compiling unused files --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e6fbc246a..0c05d890b 100644 --- a/setup.py +++ b/setup.py @@ -156,8 +156,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.morphology', 'spacy.tagger', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy._theano', - 'spacy.tokenizer', 'spacy.en.attrs', - 'spacy.en.pos', 'spacy.syntax.parser', + 'spacy.tokenizer', + 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', From b3a70e63754210b13086cd488970d2f2d57d0092 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 14:34:11 +1100 Subject: [PATCH 15/62] * Clean up unnecessary try/except block --- spacy/morphology.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index ddeca62d7..1a499aa0a 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -31,10 +31,7 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int tag_id if isinstance(tag, basestring): - try: - tag_id = self.reverse_index[self.strings[tag]] - except KeyError: - raise + tag_id = self.reverse_index[self.strings[tag]] else: tag_id = tag analysis = self._cache.get(tag_id, token.lex.orth) From 1a71706c05127150e267070c811abffc782e72bb Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Thu, 8 Oct 2015 14:22:23 +0400 Subject: [PATCH 16/62] Fix typo --- website/src/jade/blog/eli5-computers-learn-reading/index.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/jade/blog/eli5-computers-learn-reading/index.jade b/website/src/jade/blog/eli5-computers-learn-reading/index.jade index 45d2d8bdd..4f3e9ebb1 100644 --- 
a/website/src/jade/blog/eli5-computers-learn-reading/index.jade +++ b/website/src/jade/blog/eli5-computers-learn-reading/index.jade @@ -24,7 +24,7 @@ include ./meta.jade p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses. - p It doesn't always guess right, but we can tell how often it does, and we can think of ways t help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. + p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.) From 801d55a6d950f708a1911e84abff024c772ad466 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 02:00:45 +1100 Subject: [PATCH 17/62] * Fix phrase matcher --- spacy/matcher.pyx | 176 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 32 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 88a4f9ba2..afafd3ddb 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -1,11 +1,18 @@ +# cython: profile=True +from __future__ import unicode_literals + from os import path from .typedefs cimport attr_t +from .typedefs cimport hash_t from .attrs cimport attr_id_t -from .structs cimport TokenC +from .structs cimport TokenC, LexemeC +from .lexeme cimport Lexeme from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap from libcpp.vector cimport vector +from murmurhash.mrmr cimport hash64 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 @@ -15,6 +22,38 @@ from .vocab cimport Vocab from libcpp.vector cimport vector +from .attrs import FLAG61 as U_ENT + +from .attrs import FLAG60 as B2_ENT +from .attrs import FLAG59 as B3_ENT +from .attrs import FLAG58 as B4_ENT +from .attrs import FLAG57 as B5_ENT +from .attrs import FLAG56 as B6_ENT +from .attrs import FLAG55 as B7_ENT +from .attrs import FLAG54 as B8_ENT +from .attrs import FLAG53 as B9_ENT +from .attrs import FLAG52 as B10_ENT + +from .attrs import FLAG51 as I3_ENT +from .attrs import FLAG50 as I4_ENT +from .attrs import FLAG49 as I5_ENT +from .attrs import FLAG48 as I6_ENT +from .attrs import FLAG47 as I7_ENT +from .attrs import FLAG46 as I8_ENT +from .attrs import FLAG45 as I9_ENT +from .attrs import FLAG44 as I10_ENT + +from .attrs import FLAG43 as L2_ENT +from .attrs import FLAG42 as L3_ENT +from .attrs import FLAG41 as L4_ENT +from .attrs import FLAG40 as L5_ENT +from .attrs import FLAG39 as L6_ENT +from .attrs import FLAG38 as L7_ENT +from .attrs import FLAG37 as L8_ENT +from .attrs import FLAG36 as L9_ENT +from .attrs import FLAG35 as L10_ENT + + try: import ujson as json except ImportError: @@ -41,7 +80,7 @@ cdef Pattern* init_pattern(Pool mem, object 
token_specs, attr_t entity_type) exc pattern[i].spec[j].attr = attr pattern[i].spec[j].value = value i = len(token_specs) - pattern[i].spec = mem.alloc(1, sizeof(AttrValue)) + pattern[i].spec = mem.alloc(2, sizeof(AttrValue)) pattern[i].spec[0].attr = ENT_TYPE pattern[i].spec[0].value = entity_type pattern[i].spec[1].attr = LENGTH @@ -81,7 +120,33 @@ def _convert_strings(token_specs, string_store): value = int(value) converted[-1].append((attr, value)) return converted - + + +def get_bilou(length): + if length == 1: + return [U_ENT] + elif length == 2: + return [B2_ENT, L2_ENT] + elif length == 3: + return [B3_ENT, I3_ENT, L3_ENT] + elif length == 4: + return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] + elif length == 5: + return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] + elif length == 6: + return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] + elif length == 7: + return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] + elif length == 8: + return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] + elif length == 9: + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + elif length == 10: + return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, + I10_ENT, I10_ENT, L10_ENT] + else: + raise ValueError("Max length currently 10 for phrase matching") + def map_attr_name(attr): attr = attr.upper() @@ -95,32 +160,6 @@ def map_attr_name(attr): return SHAPE elif attr == 'NORM': return NORM - elif attr == 'FLAG13': - return FLAG13 - elif attr == 'FLAG14': - return FLAG14 - elif attr == 'FLAG15': - return FLAG15 - elif attr == 'FLAG16': - return FLAG16 - elif attr == 'FLAG17': - return FLAG17 - elif attr == 'FLAG18': - return FLAG18 - elif attr == 'FLAG19': - return FLAG19 - elif attr == 'FLAG20': - return FLAG20 - elif attr == 'FLAG21': - return FLAG21 - elif attr == 'FLAG22': - return FLAG22 - elif attr == 'FLAG23': - return FLAG23 - elif attr == 'FLAG24': - return FLAG24 - elif attr == 'FLAG25': - return FLAG25 else: raise Exception("TODO: Finish supporting attr mapping %s" % attr) @@ -163,7 +202,7 @@ cdef class Matcher: spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) - def __call__(self, Doc doc): + def __call__(self, Doc doc, acceptor=None): cdef vector[Pattern*] partials cdef int n_partials = 0 cdef int q = 0 @@ -174,21 +213,94 @@ cdef class Matcher: for token_i in range(doc.length): token = &doc.data[token_i] q = 0 + # Go over the open matches, extending or finalizing if able. 
Otherwise, + # we over-write them (q doesn't advance) for i in range(partials.size()): state = partials.at(i) if match(state, token): if is_final(state): - matches.append(get_entity(state, token, token_i)) + label, start, end = get_entity(state, token, token_i) + if acceptor is None or acceptor(doc, label, start, end): + matches.append((label, start, end)) else: partials[q] = state + 1 q += 1 partials.resize(q) + # Check whether we open any new patterns on this token for i in range(self.n_patterns): state = self.patterns[i] if match(state, token): if is_final(state): - matches.append(get_entity(state, token, token_i)) + label, start, end = get_entity(state, token, token_i) + if acceptor is None or acceptor(doc, label, start, end): + matches.append((label, start, end)) else: partials.push_back(state + 1) doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches return matches + + +cdef class PhraseMatcher: + cdef Pool mem + cdef Vocab vocab + cdef Matcher matcher + cdef PreshMap phrase_ids + + cdef int max_length + cdef attr_t* _phrase_key + + def __init__(self, Vocab vocab, phrases, max_length=10): + self.mem = Pool() + self._phrase_key = self.mem.alloc(max_length, sizeof(attr_t)) + self.max_length = max_length + self.vocab = vocab + self.matcher = Matcher(self.vocab, {}) + self.phrase_ids = PreshMap() + for phrase in phrases: + if len(phrase) < max_length: + self.add(phrase) + + abstract_patterns = [] + for length in range(1, max_length): + abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) + self.matcher.add('Candidate', 'MWE', {}, abstract_patterns) + + def add(self, Doc tokens): + cdef int length = tokens.length + assert length < self.max_length + tags = get_bilou(length) + assert len(tags) == length, length + + cdef int i + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, tag in enumerate(tags): + lexeme = self.vocab[tokens.data[i].lex.orth] + lexeme.set_flag(tag, True) + self._phrase_key[i] = lexeme.orth + cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) + self.phrase_ids[key] = True + + def __call__(self, Doc doc): + matches = [] + for label, start, end in self.matcher(doc, acceptor=self.accept_match): + cand = doc[start : end] + start = cand[0].idx + end = cand[-1].idx + len(cand[-1]) + matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) + for match in matches: + doc.merge(*match) + return matches + + def accept_match(self, Doc doc, int label, int start, int end): + assert (end - start) < self.max_length + cdef int i, j + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, j in enumerate(range(start, end)): + self._phrase_key[i] = doc.data[j].lex.orth + cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) + if self.phrase_ids.get(key): + return True + else: + return False From 4bbc8f45c6e35b11744a896503568f888653f4bf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 02:02:37 +1100 Subject: [PATCH 18/62] * Fix multi word matcher --- examples/multi_word_matches.py | 156 ++++++++++----------------------- 1 file changed, 45 insertions(+), 111 deletions(-) diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py index 59d3c2a63..3c715736e 100644 --- a/examples/multi_word_matches.py +++ b/examples/multi_word_matches.py @@ -26,137 +26,71 @@ from ast import literal_eval from bz2 import BZ2File import time import math +import codecs import plac from preshed.maps import PreshMap +from preshed.counter import PreshCounter 
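# Illustrative sketch (editorial assumption, not text from the patch): a
# condensed version of the workflow this example script implements with the
# PhraseMatcher added to spacy/matcher.pyx above.  The phrases below are made
# up for illustration; the real script reads them from a gazetteer file.
from spacy.en import English
from spacy.matcher import PhraseMatcher

nlp = English(parser=False, tagger=False, entity=False)
phrases = [nlp.tokenizer(u'natural language processing'),
           nlp.tokenizer(u'machine learning')]
matcher = PhraseMatcher(nlp.vocab, phrases, max_length=10)

doc = nlp.tokenizer(u'I work on natural language processing .')
matcher(doc)                   # matched phrases are merged into single tokens
print([w.orth_ for w in doc])  # the multi-word expression now appears as one token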
from spacy.strings import hash_string from spacy.en import English -from spacy.matcher import Matcher - -from spacy.attrs import FLAG63 as B_ENT -from spacy.attrs import FLAG62 as L_ENT -from spacy.attrs import FLAG61 as I_ENT - -from spacy.attrs import FLAG60 as B2_ENT -from spacy.attrs import FLAG59 as B3_ENT -from spacy.attrs import FLAG58 as B4_ENT -from spacy.attrs import FLAG57 as B5_ENT -from spacy.attrs import FLAG56 as B6_ENT -from spacy.attrs import FLAG55 as B7_ENT -from spacy.attrs import FLAG54 as B8_ENT -from spacy.attrs import FLAG53 as B9_ENT -from spacy.attrs import FLAG52 as B10_ENT - -from spacy.attrs import FLAG51 as I3_ENT -from spacy.attrs import FLAG50 as I4_ENT -from spacy.attrs import FLAG49 as I5_ENT -from spacy.attrs import FLAG48 as I6_ENT -from spacy.attrs import FLAG47 as I7_ENT -from spacy.attrs import FLAG46 as I8_ENT -from spacy.attrs import FLAG45 as I9_ENT -from spacy.attrs import FLAG44 as I10_ENT - -from spacy.attrs import FLAG43 as L2_ENT -from spacy.attrs import FLAG42 as L3_ENT -from spacy.attrs import FLAG41 as L4_ENT -from spacy.attrs import FLAG40 as L5_ENT -from spacy.attrs import FLAG39 as L6_ENT -from spacy.attrs import FLAG38 as L7_ENT -from spacy.attrs import FLAG37 as L8_ENT -from spacy.attrs import FLAG36 as L9_ENT -from spacy.attrs import FLAG35 as L10_ENT +from spacy.matcher import PhraseMatcher -def get_bilou(length): - if length == 1: - return [U_ENT] - elif length == 2: - return [B2_ENT, L2_ENT] - elif length == 3: - return [B3_ENT, I3_ENT, L3_ENT] - elif length == 4: - return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] - elif length == 5: - return [B5_ENT, I5_ENT, I5_ENT, L5_ENT] - elif length == 6: - return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] - elif length == 7: - return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] - elif length == 8: - return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] - elif length == 9: - return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] - elif length == 10: - return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, L10_ENT] - - -def make_matcher(vocab, max_length): - abstract_patterns = [] - for length in range(2, max_length): - abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) - return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)}) - - -def get_matches(matcher, pattern_ids, doc): - matches = [] - for label, start, end in matcher(doc): - candidate = doc[start : end] - if pattern_ids[hash_string(candidate.text)] == True: - start = candidate[0].idx - end = candidate[-1].idx + len(candidate[-1]) - matches.append((start, end, candidate.root.tag_, candidate.text)) - return matches - - -def merge_matches(doc, matches): - for start, end, tag, text in matches: - doc.merge(start, end, tag, text, 'MWE') - - -def read_gazetteer(loc): - for line in open(loc): +def read_gazetteer(tokenizer, loc, n=-1): + for i, line in enumerate(open(loc)): phrase = literal_eval('u' + line.strip()) if ' (' in phrase and phrase.endswith(')'): phrase = phrase.split(' (', 1)[0] - yield phrase + if i >= n: + break + phrase = tokenizer(phrase) + if len(phrase) >= 2: + yield phrase + def read_text(bz2_loc): with BZ2File(bz2_loc) as file_: for line in file_: yield line.decode('utf8') -def main(patterns_loc, text_loc): + +def get_matches(tokenizer, phrases, texts, max_length=6): + matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length) + print("Match") + for text in texts: + doc = tokenizer(text) + matches = matcher(doc) + for 
mwe in doc.ents: + yield mwe + + +def main(patterns_loc, text_loc, counts_loc, n=10000000): nlp = English(parser=False, tagger=False, entity=False) - - pattern_ids = PreshMap() - max_length = 10 - i = 0 - for pattern_str in read_gazetteer(patterns_loc): - pattern = nlp.tokenizer(pattern_str) - if len(pattern) < 2 or len(pattern) >= max_length: - continue - bilou_tags = get_bilou(len(pattern)) - for word, tag in zip(pattern, bilou_tags): - lexeme = nlp.vocab[word.orth] - lexeme.set_flag(tag, True) - pattern_ids[hash_string(pattern.text)] = True - i += 1 - if i >= 10000001: - break - - matcher = make_matcher(nlp.vocab, max_length) - + print("Make matcher") + phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n) + counts = PreshCounter() t1 = time.time() - - for text in read_text(text_loc): - doc = nlp.tokenizer(text) - matches = get_matches(matcher, pattern_ids, doc) - merge_matches(doc, matches) + for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)): + counts.inc(hash_string(mwe.text), 1) t2 = time.time() - print('10 ^ %d patterns took %d s' % (round(math.log(i, 10)), t2-t1)) - + print("10m tokens in %d s" % (t2 - t1)) + + with codecs.open(counts_loc, 'w', 'utf8') as file_: + for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n): + text = phrase.string + key = hash_string(text) + count = counts[key] + if count != 0: + file_.write('%d\t%s\n' % (count, text)) if __name__ == '__main__': - plac.call(main) + if False: + import cProfile + import pstats + cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") + s = pstats.Stats("Profile.prof") + s.strip_dirs().sort_stats("time").print_stats() + else: + plac.call(main) From 5af4b62fe731758ae2b20fbd737a558f457ea6b9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 12:47:43 +1100 Subject: [PATCH 19/62] * Filter out phrases that consist of common, lower-case words. 
--- examples/multi_word_matches.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py index 3c715736e..73f48bf42 100644 --- a/examples/multi_word_matches.py +++ b/examples/multi_word_matches.py @@ -45,6 +45,8 @@ def read_gazetteer(tokenizer, loc, n=-1): if i >= n: break phrase = tokenizer(phrase) + if all((t.is_lower and t.prob >= -10) for t in phrase): + continue if len(phrase) >= 2: yield phrase From c64fd472b033f9551e89a74fe2851c6d3335c137 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 12:58:08 +1100 Subject: [PATCH 20/62] * Fix travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1ea1f8375..f21301db1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ install: # run tests script: - - "py.test tests/ website/tests/ -x" + - "py.test tests/ -x" From 9ff288c7bba283d914ca70c62d3278a720f800b7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:37:25 +1100 Subject: [PATCH 21/62] * Update tests, after removal of spacy.en.attrs --- tests/matcher/test_matcher_bugfixes.py | 1 + tests/tokens/test_array.py | 2 +- tests/tokens/test_token_api.py | 6 +++--- tests/vocab/test_lexeme_flags.py | 2 +- tests/website/test_api.py | 2 +- tests/website/test_home.py | 4 ++-- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/matcher/test_matcher_bugfixes.py b/tests/matcher/test_matcher_bugfixes.py index c768021db..b65541460 100644 --- a/tests/matcher/test_matcher_bugfixes.py +++ b/tests/matcher/test_matcher_bugfixes.py @@ -3,6 +3,7 @@ import pytest from spacy.matcher import Matcher +@pytest.mark.xfail def test_overlap_issue118(EN): '''Test a bug that arose from having overlapping matches''' doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') diff --git a/tests/tokens/test_array.py b/tests/tokens/test_array.py index 29807c3e5..bdfdfd057 100644 --- a/tests/tokens/test_array.py +++ b/tests/tokens/test_array.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytest -from spacy.en import attrs +from spacy import attrs def test_attr_of_token(EN): diff --git a/tests/tokens/test_token_api.py b/tests/tokens/test_token_api.py index 99c99fc11..6deaadfbf 100644 --- a/tests/tokens/test_token_api.py +++ b/tests/tokens/test_token_api.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals from spacy.en import English -from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT -from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM -from spacy.en.attrs import IS_STOP +from spacy.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT +from spacy.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM +from spacy.attrs import IS_STOP import pytest diff --git a/tests/vocab/test_lexeme_flags.py b/tests/vocab/test_lexeme_flags.py index 844ee0aaa..5cc7bd16f 100644 --- a/tests/vocab/test_lexeme_flags.py +++ b/tests/vocab/test_lexeme_flags.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest -from spacy.en.attrs import * +from spacy.attrs import * def test_is_alpha(en_vocab): diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 4ef1a54aa..37a48794b 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -60,7 +60,7 @@ def test_count_by(nlp): # from spacy.en import English, attrs # nlp = English() import numpy - from spacy.en import attrs + from spacy import 
attrs tokens = nlp('apple apple orange banana') assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1} assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529], diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 515c64e6c..7d822d377 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals import pytest -import spacy.en +import spacy @pytest.fixture() @@ -45,7 +45,7 @@ def test_get_and_set_string_views_and_flags(nlp, token): def test_export_to_numpy_arrays(nlp, doc): - from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV + from spacy.attrs import ORTH, LIKE_URL, IS_OOV attr_ids = [ORTH, LIKE_URL, IS_OOV] doc_array = doc.to_array(attr_ids) From b125289f304ecbf47b825904971a9989a77d22d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:46:57 +1100 Subject: [PATCH 22/62] * Fix type declaration in asciied function --- spacy/orth.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 27123bb4e..882e06bf2 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -11,6 +11,7 @@ try: except ImportError: from text_unidecode import unidecode + import re import math @@ -165,7 +166,7 @@ cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0): cpdef bytes asciied(unicode string): - cdef str stripped = unidecode(string) + stripped = unidecode(string) if not stripped: return b'???' return stripped.encode('ascii') From 20b8c3e28172678bf497cca394adec3fdef990d1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:58:01 +1100 Subject: [PATCH 23/62] * Mark tests that require models --- tests/website/test_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 37a48794b..50ec73827 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -26,6 +26,7 @@ def test_main_entry_point(nlp): doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. 
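# Practical aside (an assumption about intent, not text from the patch): with
# these @pytest.mark.models markers in place, the model-dependent tests can be
# deselected through pytest's standard marker expressions when no model data
# is installed, e.g.:
#
#     py.test tests/ -m "not models"
#
# while the trimmed .travis.yml above keeps running plain "py.test tests/ -x".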
+@pytest.mark.models def test_sentence_spans(nlp): # from spacy.en import English # nlp = English() @@ -33,6 +34,7 @@ def test_sentence_spans(nlp): assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] +@pytest.mark.models def test_entity_spans(nlp): # from spacy.en import English # nlp = English() @@ -44,6 +46,7 @@ def test_entity_spans(nlp): assert ents[0].string == ents[0].string +@pytest.mark.models def test_noun_chunk_spans(nlp): # from spacy.en import English # nlp = English() From 7b340912d4a433d007e9397ace340f6ef652bef9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:09:26 +1100 Subject: [PATCH 24/62] * Mark tests that require models --- tests/website/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 50ec73827..8b52ffff6 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -59,6 +59,7 @@ def test_noun_chunk_spans(nlp): # NP three noun chunks <-- has +@pytest.mark.models def test_count_by(nlp): # from spacy.en import English, attrs # nlp = English() From 76936a345617d85e964b227322dee3cc41554f58 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:19:07 +1100 Subject: [PATCH 25/62] * Mark tests that require models --- tests/website/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 8b52ffff6..49f661850 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -92,6 +92,7 @@ def test_token_span(doc): assert token.i == 4 +@pytest.mark.models def test_example_i_like_new_york1(nlp): toks = nlp('I like New York in Autumn.') From 5031440c35e06afd3fc4e859641063a34acb126f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:29:28 +1100 Subject: [PATCH 26/62] * Mark tests that require models --- tests/website/test_api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 49f661850..52910ae41 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -132,6 +132,7 @@ def dot(toks): return tok(toks, "dot") +@pytest.mark.models def test_example_i_like_new_york3(toks, new, york): assert toks[new].head.orth_ == 'York' assert toks[york].head.orth_ == 'like' @@ -142,6 +143,7 @@ def test_example_i_like_new_york4(toks, new, york): assert new_york.root.orth_ == 'York' +@pytest.mark.models def test_example_i_like_new_york5(toks, autumn, dot): assert toks[autumn].head.orth_ == 'in' assert toks[dot].head.orth_ == 'like' @@ -149,6 +151,7 @@ def test_example_i_like_new_york5(toks, autumn, dot): assert autumn_dot.root.orth_ == 'Autumn' +@pytest.mark.models def test_navigating_the_parse_tree_lefts(doc): # TODO: where does the span object come from? 
span = doc[:2] @@ -156,6 +159,7 @@ def test_navigating_the_parse_tree_lefts(doc): if span.doc[i].head in span] +@pytest.mark.models def test_navigating_the_parse_tree_rights(doc): span = doc[:2] rights = [span.doc[i] for i in range(span.end, len(span.doc)) From dea40cfec34298e869a3d1241eb8f58ea11cef5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:37:48 +1100 Subject: [PATCH 27/62] * Mark tests that require models --- tests/website/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 52910ae41..ef0365d88 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -138,6 +138,7 @@ def test_example_i_like_new_york3(toks, new, york): assert toks[york].head.orth_ == 'like' +@pytest.mark.models def test_example_i_like_new_york4(toks, new, york): new_york = toks[new:york+1] assert new_york.root.orth_ == 'York' From 00c1992503203eb4f20a54841d6b59b3ce5da7dc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:48:14 +1100 Subject: [PATCH 28/62] * Mark tests that require models --- tests/website/test_home.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 7d822d377..6c97b0f31 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -22,6 +22,7 @@ def test_get_tokens_and_sentences(doc): assert sentence.text == 'Hello, world.' +@pytest.mark.models def test_use_integer_ids_for_any_strings(nlp, token): hello_id = nlp.vocab.strings['Hello'] hello_str = nlp.vocab.strings[hello_id] @@ -68,6 +69,7 @@ def test_word_vectors(nlp): assert apples.similarity(oranges) > boots.similarity(hippos) +@pytest.mark.models def test_part_of_speech_tags(nlp): from spacy.parts_of_speech import ADV From 4bae38128d5ef0487526239cfb0626c72d803984 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:58:34 +1100 Subject: [PATCH 29/62] * Remove license page from website in repo --- website/src/jade/license/index.jade | 38 ----------------------------- 1 file changed, 38 deletions(-) delete mode 100644 website/src/jade/license/index.jade diff --git a/website/src/jade/license/index.jade b/website/src/jade/license/index.jade deleted file mode 100644 index b31e99949..000000000 --- a/website/src/jade/license/index.jade +++ /dev/null @@ -1,38 +0,0 @@ -include ../header.jade - -mixin LicenseOption(name, period, price, audience) - .item - h4 #{name} - - .focus #{period} - - span #{price} - - h5 Suggested for: - - span #{audience} - - a.button(href="/resources/pdf/spaCy_License_Agreement_2015.pdf", target="_blank") Download license - - span or #[a(href="mailto:sales@spacy.io") get in touch] - -- var Page = InitPage(Site, Authors.spacy, "license", "License") - -+WritePage(Site, Authors.spacy, Page) - article.pricing - .box.license - +LicenseOption("Trial", "90 days", "$0", "Evaluation") - +LicenseOption("Production", "1 year", "$5,000", "Production") - +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") - - p.caption Researcher, hobbyist, or open-source developer? spaCy also offers #[a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3] licenses. - - blockquote.pull-quote - p Let's face it: Services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt. - - p You need the source, and you need to know you can buy a long-term license. So that's what we offer. The difference between this and a black-box API is night and day. - - p Let's face it: services disappear. 
Constantly. The good start-ups get bought; the bad ones go bankrupt. Open-source projects become abandoned or bloated. Google's graveyard is over-flowing – ditto for Yahoo!, Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset? - - p A 5 year license won't expire until 2020. spaCy will be with you for longer than most of your current staff. If that's still not enough, get in touch. We can surely work something out. - From c5b2c4ead8d3c787e15cb6c451abd65ba8039593 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:58:45 +1100 Subject: [PATCH 30/62] * Don't build old license page --- website/Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/website/Makefile b/website/Makefile index 78dc9448c..940a8182c 100644 --- a/website/Makefile +++ b/website/Makefile @@ -12,9 +12,6 @@ site/index.html: src/jade/header.jade src/jade/*.jade site/docs/: src/jade/docs/*.jade src/jade/header.jade jade -P src/jade/docs/index.jade --out $@ -site/license/: src/jade/license/*.jade src/jade/header.jade - jade -P src/jade/license/index.jade --out $@ - site/blog/: src/jade/blog/*.jade site/blog/*/ site/tutorials/*/ src/jade/header.jade jade -P src/jade/blog/index.jade --out $@ From 0e13f18ea4ca3916fff068b85b13454d4f72daf2 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 07:23:39 +0200 Subject: [PATCH 31/62] remove compile warning noise --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0c05d890b..a7c27fb74 100644 --- a/setup.py +++ b/setup.py @@ -138,7 +138,7 @@ VERSION = '0.93' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] - compile_args = ['-O3', '-Wno-strict-prototypes'] + compile_args = ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] link_args = [] if sys.prefix == 'darwin': compile_args.append(['-mmacosx-version-min=10.8', '-stdlib=libc++']) From b71ba2eed517942765956750b0afa7fc73849c55 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 18:43:14 +1100 Subject: [PATCH 32/62] * Add tests for unicode puncuation character lemmatization --- tests/tagger/test_lemmatizer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index 8461a854e..df553c3d6 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals from spacy.lemmatizer import Lemmatizer, read_index, read_exc @@ -34,3 +35,14 @@ def test_noun_lemmas(lemmatizer): assert do('planets') == set(['planet']) assert do('ring') == set(['ring']) assert do('axes') == set(['axis', 'axe', 'ax']) + + +def test_smart_quotes(lemmatizer): + do = lemmatizer.punct + assert do('“') == set(['``']) + assert do('“') == set(['``']) + + +def test_smart_quotes(lemmatizer): + do = lemmatizer.punct + assert do('–') == set(["--"]) From 5332c0b697f68eb7d25221dc722d1d5ee65a479e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 18:44:21 +1100 Subject: [PATCH 33/62] * Add support for punctuation lemmatization, to handle unicode characters. 
This should help in addressing Issue #130 --- lang_data/en/lemma_rules.json | 6 ++++++ spacy/lemmatizer.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index c45eb1df6..498240be1 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -27,5 +27,11 @@ ["est", ""], ["er", "e"], ["est", "e"] + ], + + "punct": [ + ["“", "``"], + ["”", "''"], + ["–", "--"] ] } diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index ed04e2d77..c1d296d7c 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function from os import path import codecs @@ -7,7 +7,7 @@ try: except ImportError: import json -from .parts_of_speech import NOUN, VERB, ADJ +from .parts_of_speech import NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @@ -36,6 +36,8 @@ class Lemmatizer(object): pos = 'verb' elif pos == ADJ: pos = 'adj' + elif pos == PUNCT: + pos = 'punct' lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) return lemmas @@ -48,6 +50,9 @@ class Lemmatizer(object): def adj(self, string): return self(string, 'adj') + def punct(self, string): + return self(string, 'punct') + def lemmatize(string, index, exceptions, rules): string = string.lower() @@ -58,7 +63,7 @@ def lemmatize(string, index, exceptions, rules): for old, new in rules: if string.endswith(old): form = string[:len(string) - len(old)] + new - if form in index: + if form in index or not form.isalpha(): forms.append(form) if not forms: forms.append(string) From 2d9e5bf566be7a9a6706d1ef2b01f63a2bde7f2e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:02:42 +1100 Subject: [PATCH 34/62] * Allow punctuation to be lemmatized --- spacy/morphology.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 1a499aa0a..534f64a59 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -7,7 +7,7 @@ except ImportError: import json from .parts_of_speech import UNIV_POS_NAMES -from .parts_of_speech cimport ADJ, VERB, NOUN +from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT cdef class Morphology: @@ -81,7 +81,7 @@ cdef class Morphology: if self.lemmatizer is None: return orth cdef unicode py_string = self.strings[orth] - if pos != NOUN and pos != VERB and pos != ADJ: + if pos != NOUN and pos != VERB and pos != ADJ and pos != PUNCT: return orth cdef set lemma_strings cdef unicode lemma_string From 1842a53e73405be3048e6dd26afcfc2e4d5da5ee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:09:36 +1100 Subject: [PATCH 35/62] * Lemmatize smart quotes as plain quotes --- lang_data/en/lemma_rules.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 498240be1..0336b6b9f 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -30,8 +30,8 @@ ], "punct": [ - ["“", "``"], - ["”", "''"], + ["“", "\""], + ["”", "\""], ["–", "--"] ] } From 1490feda292d8065c01bb2136be5c30bbf5b23eb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:23:47 +1100 Subject: [PATCH 36/62] * Make generate_specials pretty-print the specials.json file --- lang_data/en/generate_specials.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 1a8f1ae0b..6ad503aec 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import json contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"} @@ -132,7 +133,6 @@ hardcoded_specials = { "Mt.": [{"F": "Mt.", "L": "Mount"}], "''": [{"F": "''"}], - "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], @@ -412,6 +412,6 @@ def generate_specials(): if __name__ == "__main__": specials = generate_specials() - with open("specials.json", "w") as f: - json.dump(specials, f) + with open("specials.json", "w") as file_: + file_.write(json.dumps(specials, indent=2)) From 393a13d1af2a0c22a04643e61e7c4b95b653250b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:24:33 +1100 Subject: [PATCH 37/62] * Add unicode em dash to specials.json, so that we can control what POS tag it gets. This way we can prevent sentence boundary detection errors, to address Issue #130. --- lang_data/en/generate_specials.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 6ad503aec..e50cd77d4 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -133,6 +133,9 @@ hardcoded_specials = { "Mt.": [{"F": "Mt.", "L": "Mount"}], "''": [{"F": "''"}], + + "—": [{"F": "—", "L": "--", "P": ":"}], + "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], From b6047afe4ca23e48fef28c08c40e91a6bd9c61b6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 10:25:37 +0200 Subject: [PATCH 38/62] * Fix punctuation lemma rules, to resolve Issue #130 --- lang_data/en/lemma_rules.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 498240be1..5a431224d 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -30,8 +30,8 @@ ], "punct": [ - ["“", "``"], - ["”", "''"], - ["–", "--"] + ["“", "\""], + ["”", "\""], + ["—", "--"] ] } From 0e92e8574ac4345f2b5d18a3dd182eaa69e43466 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:06:37 +0200 Subject: [PATCH 39/62] * Fix pos tag in em-dash in specials --- lang_data/en/generate_specials.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index e50cd77d4..db3827593 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -134,7 +134,7 @@ hardcoded_specials = { "''": [{"F": "''"}], - "—": [{"F": "—", "L": "--", "P": ":"}], + "—": [{"F": "—", "L": "--", "pos": ":"}], "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], From 49600a44a84dee1ac044578d99c02ffb50cd8b27 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:06:57 +0200 Subject: [PATCH 40/62] * Fix trailing comma in lemma_rules.json --- lang_data/en/lemma_rules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 140749b18..1d7366f92 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -31,6 +31,6 @@ "punct": [ ["“", "\""], - ["”", "\""], + ["”", "\""] ] } From a510858f5a516447fa050223fe27773f4c55fa79 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:07:45 +0200 Subject: [PATCH 41/62] 
* Pretty-print specials.json, and add the em dash --- lang_data/en/specials.json | 4864 +++++++++++++++++++++++++++++++++++- 1 file changed, 4863 insertions(+), 1 deletion(-) diff --git a/lang_data/en/specials.json b/lang_data/en/specials.json index 93672dc10..20d90e261 100644 --- a/lang_data/en/specials.json +++ b/lang_data/en/specials.json @@ -1 +1,4863 @@ -{"i've": [{"L": "-PRON-", "F": "i"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Where's": [{"F": "Where"}, {"F": "'s"}], "4p.m.": [{"F": "4"}, {"F": "p.m."}], "12am": [{"F": "12"}, {"L": "a.m.", "F": "am"}], "j.": [{"F": "j."}], "8pm": [{"F": "8"}, {"L": "p.m.", "F": "pm"}], "E.G.": [{"F": "E.G."}], "must've": [{"F": "must"}, {"L": "have", "pos": "VB", "F": "'ve"}], "D.C.": [{"F": "D.C."}], "She'd've": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "mightnt": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "nt"}], "Hes": [{"L": "-PRON-", "F": "He"}, {"F": "s"}], "7a.m.": [{"F": "7"}, {"F": "a.m."}], "Idve": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ill.": [{"F": "Ill."}], ":P": [{"F": ":P"}], "\t": [{"pos": "SP", "F": "\t"}], "10a.m.": [{"F": "10"}, {"F": "a.m."}], "would've": [{"F": "would"}, {"L": "have", "pos": "VB", "F": "'ve"}], "11am": [{"F": "11"}, {"L": "a.m.", "F": "am"}], "you'd": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "'d"}], "Thered": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "d"}], "havent": [{"pos": "VB", "F": "have"}, {"L": "not", "pos": "RB", "F": "nt"}], "im": [{"L": "-PRON-", "F": "i"}, {"L": "be", "F": "m", "pos": "VBP", "tenspect": 1, "number": 1}], "Whatll": [{"F": "What"}, {"L": "will", "pos": "MD", "F": "ll"}], "there'd": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "'d"}], "Mustn't": [{"F": "Must"}, {"L": "not", "pos": "RB", "F": "n't"}], "haven't": [{"pos": "VB", "F": "have"}, {"L": "not", "pos": "RB", "F": "n't"}], "hows": [{"F": "how"}, {"F": "s"}], "Doesn't": [{"L": "do", "pos": "VBZ", "F": "Does"}, {"L": "not", "pos": "RB", "F": "n't"}], "You're": [{"L": "-PRON-", "F": "You"}, {"F": "'re"}], "he's": [{"L": "-PRON-", "F": "he"}, {"F": "'s"}], "Mo.": [{"F": "Mo."}], "Theydve": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "We're": [{"F": "We"}, {"F": "'re"}], "can't": [{"L": "can", "pos": "MD", "F": "ca"}, {"L": "not", "pos": "RB", "F": "n't"}], "they've": [{"L": "-PRON-", "F": "they"}, {"L": "have", "pos": "VB", "F": "'ve"}], "werent": [{"F": "were"}, {"L": "not", "pos": "RB", "F": "nt"}], "i'm": [{"L": "-PRON-", "F": "i"}, {"L": "be", "F": "'m", "pos": "VBP", "tenspect": 1, "number": 1}], "Wouldve": [{"F": "Would"}, {"L": "have", "pos": "VB", "F": "ve"}], "Inc.": [{"F": "Inc."}], "Isnt": [{"L": "be", "pos": "VBZ", "F": "Is"}, {"L": "not", "pos": "RB", "F": "nt"}], "mightn't": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "n't"}], "itd": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "d"}], "^_^": [{"F": "^_^"}], "4pm": [{"F": "4"}, {"L": "p.m.", "F": "pm"}], "theyd": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "d"}], "p.": [{"F": "p."}], "Hasnt": [{"F": "Has"}, {"L": "not", "pos": "RB", "F": "nt"}], "how'd": [{"F": "how"}, {"L": "would", "pos": "MD", "F": "'d"}], "you'll": [{"L": "-PRON-", "F": "you"}, {"L": "will", "pos": "MD", "F": "'ll"}], "how's": [{"F": "how"}, {"F": "'s"}], "e.g.": [{"F": "e.g."}], "didn't": [{"L": "do", "pos": "VBD", 
"F": "did"}, {"L": "not", "pos": "RB", "F": "n't"}], "6pm": [{"F": "6"}, {"L": "p.m.", "F": "pm"}], "z.": [{"F": "z."}], "Howll": [{"F": "How"}, {"L": "will", "pos": "MD", "F": "ll"}], "Shant": [{"F": "Sha"}, {"L": "not", "pos": "RB", "F": "nt"}], "Theyd": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "d"}], "f.": [{"F": "f."}], "u.": [{"F": "u."}], "she'd": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "'d"}], "Fla.": [{"F": "Fla."}], "Rep.": [{"F": "Rep."}], "they're": [{"L": "-PRON-", "F": "they"}, {"F": "'re"}], "you'd've": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mightve": [{"F": "Might"}, {"L": "have", "pos": "VB", "F": "ve"}], "Why'll": [{"F": "Why"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Should've": [{"F": "Should"}, {"L": "have", "pos": "VB", "F": "'ve"}], "that's": [{"F": "that"}, {"F": "'s"}], "9pm": [{"F": "9"}, {"L": "p.m.", "F": "pm"}], "Mass.": [{"F": "Mass."}], "there's": [{"F": "there"}, {"F": "'s"}], "It'd": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "'d"}], "hasn't": [{"F": "has"}, {"L": "not", "pos": "RB", "F": "n't"}], "shes": [{"L": "-PRON-", "F": "she"}, {"F": "s"}], "she'd've": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "o.O": [{"F": "o.O"}], "whered": [{"F": "where"}, {"L": "would", "pos": "MD", "F": "d"}], ":(((": [{"F": ":((("}], "N.C.": [{"F": "N.C."}], "you're": [{"L": "-PRON-", "F": "you"}, {"F": "'re"}], ":0": [{"F": ":0"}], "Wouldn't": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "n't"}], "'em": [{"F": "'em"}], "Whatve": [{"F": "What"}, {"L": "have", "pos": "VB", "F": "ve"}], "Corp.": [{"F": "Corp."}], "i'ma": [{"L": "-PRON-", "F": "i"}, {"F": "'ma"}], "''": [{"F": "''"}], "v.": [{"F": "v."}], "Ga.": [{"F": "Ga."}], "1am": [{"F": "1"}, {"L": "a.m.", "F": "am"}], "Wasnt": [{"F": "Was"}, {"L": "not", "pos": "RB", "F": "nt"}], "q.": [{"F": "q."}], "Hows": [{"F": "How"}, {"F": "s"}], "why're": [{"F": "why"}, {"F": "'re"}], ";-p": [{"F": ";-p"}], "Ima": [{"L": "-PRON-", "F": "I"}, {"F": "ma"}], "neednt": [{"F": "need"}, {"L": "not", "pos": "RB", "F": "nt"}], "Ariz.": [{"F": "Ariz."}], "8am": [{"F": "8"}, {"L": "a.m.", "F": "am"}], "Aren't": [{"L": "be", "pos": "VBP", "F": "Are", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "4am": [{"F": "4"}, {"L": "a.m.", "F": "am"}], "she'll": [{"L": "-PRON-", "F": "she"}, {"L": "will", "pos": "MD", "F": "'ll"}], "8p.m.": [{"F": "8"}, {"F": "p.m."}], "9p.m.": [{"F": "9"}, {"F": "p.m."}], "11p.m.": [{"F": "11"}, {"F": "p.m."}], "Who'd": [{"F": "Who"}, {"L": "would", "pos": "MD", "F": "'d"}], "St.": [{"F": "St."}], "It's": [{"L": "-PRON-", "F": "It"}, {"F": "'s"}], "Gen.": [{"F": "Gen."}], "Messrs.": [{"F": "Messrs."}], "Calif.": [{"F": "Calif."}], "youdve": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "i'll": [{"L": "-PRON-", "F": "i"}, {"L": "will", "pos": "MD", "F": "'ll"}], "whatll": [{"F": "what"}, {"L": "will", "pos": "MD", "F": "ll"}], "mightntve": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldnt": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "nt"}], "Hasn't": [{"F": "Has"}, {"L": "not", "pos": "RB", "F": "n't"}], "hasnt": [{"F": "has"}, {"L": "not", "pos": "RB", "F": "nt"}], "shouldnt": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "nt"}], "Haven't": [{"pos": "VB", "F": "Have"}, 
{"L": "not", "pos": "RB", "F": "n't"}], "wedve": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Must've": [{"F": "Must"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Minn.": [{"F": "Minn."}], "s.": [{"F": "s."}], "isnt": [{"L": "be", "pos": "VBZ", "F": "is"}, {"L": "not", "pos": "RB", "F": "nt"}], "He'd've": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "o_o": [{"F": "o_o"}], "let's": [{"F": "let"}, {"F": "'s"}], "They've": [{"L": "-PRON-", "F": "They"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Co.": [{"F": "Co."}], "p.m.": [{"F": "p.m."}], "we're": [{"F": "we"}, {"F": "'re"}], "May.": [{"F": "May."}], "Ala.": [{"F": "Ala."}], "10am": [{"F": "10"}, {"L": "a.m.", "F": "am"}], "itll": [{"L": "-PRON-", "F": "it"}, {"L": "will", "pos": "MD", "F": "ll"}], "n.": [{"F": "n."}], "5pm": [{"F": "5"}, {"L": "p.m.", "F": "pm"}], "hedve": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Shan't": [{"F": "Sha"}, {"L": "not", "pos": "RB", "F": "n't"}], "Wont": [{"F": "Wo"}, {"L": "not", "pos": "RB", "F": "nt"}], "'S": [{"L": "'s", "F": "'S"}], ";(": [{"F": ";("}], "Mightn't've": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "needn't": [{"F": "need"}, {"L": "not", "pos": "RB", "F": "n't"}], "Shes": [{"L": "-PRON-", "F": "She"}, {"F": "s"}], "he'll": [{"L": "-PRON-", "F": "he"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Whereve": [{"F": "Where"}, {"L": "have", "pos": "VB", "F": "ve"}], "aint": [{"L": "be", "pos": "VBP", "F": "ai", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "Colo.": [{"F": "Colo."}], "who've": [{"F": "who"}, {"L": "have", "pos": "VB", "F": "'ve"}], "it'd": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "'d"}], "theyll": [{"L": "-PRON-", "F": "they"}, {"L": "will", "pos": "MD", "F": "ll"}], "wont": [{"F": "wo"}, {"L": "not", "pos": "RB", "F": "nt"}], "whyre": [{"F": "why"}, {"F": "re"}], "Nev.": [{"F": "Nev."}], "Dec.": [{"F": "Dec."}], "whereve": [{"F": "where"}, {"L": "have", "pos": "VB", "F": "ve"}], "Cant": [{"L": "can", "pos": "MD", "F": "Ca"}, {"L": "not", "pos": "RB", "F": "nt"}], "1a.m.": [{"F": "1"}, {"F": "a.m."}], "i.e.": [{"F": "i.e."}], "3am": [{"F": "3"}, {"L": "a.m.", "F": "am"}], "Won't": [{"F": "Wo"}, {"L": "not", "pos": "RB", "F": "n't"}], "hes": [{"L": "-PRON-", "F": "he"}, {"F": "s"}], "Let's": [{"F": "Let"}, {"F": "'s"}], "I'll": [{"L": "-PRON-", "F": "I"}, {"L": "will", "pos": "MD", "F": "'ll"}], "We'll": [{"F": "We"}, {"L": "will", "pos": "MD", "F": "'ll"}], "who'd": [{"F": "who"}, {"L": "would", "pos": "MD", "F": "'d"}], "E.g.": [{"F": "E.g."}], "we'd": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "'d"}], "Theyre": [{"L": "-PRON-", "F": "They"}, {"F": "re"}], "She's": [{"L": "-PRON-", "F": "She"}, {"F": "'s"}], "Whod": [{"F": "Who"}, {"L": "would", "pos": "MD", "F": "d"}], "Itll": [{"L": "-PRON-", "F": "It"}, {"L": "will", "pos": "MD", "F": "ll"}], "couldn't've": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "How'd": [{"F": "How"}, {"L": "would", "pos": "MD", "F": "'d"}], "wouldve": [{"F": "would"}, {"L": "have", "pos": "VB", "F": "ve"}], "shan't": [{"F": "sha"}, {"L": "not", "pos": "RB", "F": "n't"}], "8a.m.": [{"F": "8"}, {"F": "a.m."}], "Havent": [{"pos": "VB", "F": "Have"}, {"L": "not", "pos": "RB", "F": "nt"}], "-__-": [{"F": "-__-"}], "6am": [{"F": "6"}, 
{"L": "a.m.", "F": "am"}], "Hadntve": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "10p.m.": [{"F": "10"}, {"F": "p.m."}], "Might've": [{"F": "Might"}, {"L": "have", "pos": "VB", "F": "'ve"}], "N.M.": [{"F": "N.M."}], "shouldn't": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "n't"}], "(^_^)": [{"F": "(^_^)"}], "x.": [{"F": "x."}], "where've": [{"F": "where"}, {"L": "have", "pos": "VB", "F": "'ve"}], ";)": [{"F": ";)"}], "theydve": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "dont": [{"L": "do", "F": "do"}, {"L": "not", "pos": "RB", "F": "nt"}], "wouldn't": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "n't"}], "g.": [{"F": "g."}], "Who've": [{"F": "Who"}, {"L": "have", "pos": "VB", "F": "'ve"}], "might've": [{"F": "might"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Who's": [{"F": "Who"}, {"F": "'s"}], "Theyve": [{"L": "-PRON-", "F": "They"}, {"L": "have", "pos": "VB", "F": "ve"}], "2p.m.": [{"F": "2"}, {"F": "p.m."}], "shouldn't've": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "hed": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "d"}], "1p.m.": [{"F": "1"}, {"F": "p.m."}], "We've": [{"F": "We"}, {"L": "have", "pos": "VB", "F": "'ve"}], "a.": [{"F": "a."}], "<333": [{"F": "<333"}], "l.": [{"F": "l."}], "It'll": [{"L": "-PRON-", "F": "It"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Jun.": [{"F": "Jun."}], "Mrs.": [{"F": "Mrs."}], "what's": [{"F": "what"}, {"F": "'s"}], "N.Y.": [{"F": "N.Y."}], "Why're": [{"F": "Why"}, {"F": "'re"}], "Wis.": [{"F": "Wis."}], "Hedve": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Kans.": [{"F": "Kans."}], "idve": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "We'd've": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Dont": [{"L": "do", "F": "Do"}, {"L": "not", "pos": "RB", "F": "nt"}], ":')": [{"F": ":')"}], "(=": [{"F": "(="}], "won't": [{"F": "wo"}, {"L": "not", "pos": "RB", "F": "n't"}], "who'll": [{"F": "who"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Not've": [{"L": "not", "pos": "RB", "F": "Not"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Gov.": [{"F": "Gov."}], "couldntve": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Doesnt": [{"L": "do", "pos": "VBZ", "F": "Does"}, {"L": "not", "pos": "RB", "F": "nt"}], "11a.m.": [{"F": "11"}, {"F": "a.m."}], "I.e.": [{"F": "I.e."}], "wasn't": [{"F": "was"}, {"L": "not", "pos": "RB", "F": "n't"}], "5am": [{"F": "5"}, {"L": "a.m.", "F": "am"}], "Shouldve": [{"F": "Should"}, {"L": "have", "pos": "VB", "F": "ve"}], "Jan.": [{"F": "Jan."}], "she's": [{"L": "-PRON-", "F": "she"}, {"F": "'s"}], "We'd": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "'d"}], "Itd": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "d"}], "What's": [{"F": "What"}, {"F": "'s"}], "e.": [{"F": "e."}], "7p.m.": [{"F": "7"}, {"F": "p.m."}], "Wholl": [{"F": "Who"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadntve": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Where'd": [{"F": "Where"}, {"L": "would", "pos": "MD", "F": "'d"}], ":-)": [{"F": ":-)"}], "whos": [{"F": "who"}, {"F": "s"}], "mustn't": [{"F": "must"}, {"L": 
"not", "pos": "RB", "F": "n't"}], "shouldntve": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Youdve": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "mustnt": [{"F": "must"}, {"L": "not", "pos": "RB", "F": "nt"}], "Oct.": [{"F": "Oct."}], "a.m.": [{"F": "a.m."}], "wouldn't've": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "k.": [{"F": "k."}], "Hadn't've": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "who're": [{"F": "who"}, {"F": "'re"}], "6a.m.": [{"F": "6"}, {"F": "a.m."}], "Rev.": [{"F": "Rev."}], "Del.": [{"F": "Del."}], "Ind.": [{"F": "Ind."}], "couldn't": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "n't"}], "La.": [{"F": "La."}], "It'd've": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "t.": [{"F": "t."}], "don't": [{"L": "do", "F": "do"}, {"L": "not", "pos": "RB", "F": "n't"}], "Mightnt": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "nt"}], ":3": [{"F": ":3"}], "shouldve": [{"F": "should"}, {"L": "have", "pos": "VB", "F": "ve"}], "notve": [{"L": "not", "pos": "RB", "F": "not"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldn't've": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Aint": [{"L": "be", "pos": "VBP", "F": "Ai", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "wheres": [{"F": "where"}, {"F": "s"}], "Don't": [{"L": "do", "F": "Do"}, {"L": "not", "pos": "RB", "F": "n't"}], "Theredve": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Could've": [{"pos": "MD", "F": "Could"}, {"L": "have", "pos": "VB", "F": "'ve"}], "d.": [{"F": "d."}], "Wouldnt": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "nt"}], "They're": [{"L": "-PRON-", "F": "They"}, {"F": "'re"}], "There's": [{"F": "There"}, {"F": "'s"}], "Mr.": [{"F": "Mr."}], "shant": [{"F": "sha"}, {"L": "not", "pos": "RB", "F": "nt"}], "how'll": [{"F": "how"}, {"L": "will", "pos": "MD", "F": "'ll"}], "'s": [{"L": "'s", "F": "'s"}], "whens": [{"F": "when"}, {"F": "s"}], ";p": [{"F": ";p"}], "Youll": [{"L": "-PRON-", "F": "You"}, {"L": "will", "pos": "MD", "F": "ll"}], "Wheres": [{"F": "Where"}, {"F": "s"}], ":p": [{"F": ":p"}], ":-P": [{"F": ":-P"}], "Dr.": [{"F": "Dr."}], "they'd": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "'d"}], "Whatre": [{"F": "What"}, {"F": "re"}], ";-)": [{"F": ";-)"}], "N.D.": [{"F": "N.D."}], "I'ma": [{"L": "-PRON-", "F": "I"}, {"F": "'ma"}], "N.H.": [{"F": "N.H."}], "Wasn't": [{"F": "Was"}, {"L": "not", "pos": "RB", "F": "n't"}], "itdve": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Didnt": [{"L": "do", "pos": "VBD", "F": "Did"}, {"L": "not", "pos": "RB", "F": "nt"}], "Ark.": [{"F": "Ark."}], ":>": [{"F": ":>"}], "Wouldntve": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "6p.m.": [{"F": "6"}, {"F": "p.m."}], "where'd": [{"F": "where"}, {"L": "would", "pos": "MD", "F": "'d"}], ":))": [{"F": ":))"}], ":/": [{"F": ":/"}], "1pm": [{"F": "1"}, {"L": "p.m.", "F": "pm"}], "should've": [{"F": "should"}, {"L": "have", "pos": "VB", "F": "'ve"}], "2am": [{"F": "2"}, {"L": "a.m.", "F": "am"}], "ain't": [{"L": "be", "pos": "VBP", "F": "ai", 
"number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "Nov.": [{"F": "Nov."}], "didnt": [{"L": "do", "pos": "VBD", "F": "did"}, {"L": "not", "pos": "RB", "F": "nt"}], "4a.m.": [{"F": "4"}, {"F": "a.m."}], "co.": [{"F": "co."}], "i.": [{"F": "i."}], "when's": [{"F": "when"}, {"F": "'s"}], "wouldntve": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "mightve": [{"F": "might"}, {"L": "have", "pos": "VB", "F": "ve"}], "howll": [{"F": "how"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadn't": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "n't"}], "I'd've": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Feb.": [{"F": "Feb."}], "howd": [{"F": "how"}, {"L": "would", "pos": "MD", "F": "d"}], "it'd've": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "isn't": [{"L": "be", "pos": "VBZ", "F": "is"}, {"L": "not", "pos": "RB", "F": "n't"}], "weve": [{"F": "we"}, {"L": "have", "pos": "VB", "F": "ve"}], "Sen.": [{"F": "Sen."}], "Whove": [{"F": "Who"}, {"L": "have", "pos": "VB", "F": "ve"}], "Youd": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "d"}], "3a.m.": [{"F": "3"}, {"F": "a.m."}], "Where've": [{"F": "Where"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Shouldn't": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "n't"}], "whats": [{"F": "what"}, {"F": "s"}], "Cannot": [{"L": "can", "pos": "MD", "F": "Can"}, {"L": "not", "pos": "RB", "F": "not"}], "You'd've": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "What'll": [{"F": "What"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Thats": [{"F": "That"}, {"F": "s"}], "o_O": [{"F": "o_O"}], "Whats": [{"F": "What"}, {"F": "s"}], "What're": [{"F": "What"}, {"F": "'re"}], "xDD": [{"F": "xDD"}], "3pm": [{"F": "3"}, {"L": "p.m.", "F": "pm"}], "Who're": [{"F": "Who"}, {"F": "'re"}], "mustve": [{"F": "must"}, {"L": "have", "pos": "VB", "F": "ve"}], ":-/": [{"F": ":-/"}], "Apr.": [{"F": "Apr."}], "ima": [{"L": "-PRON-", "F": "i"}, {"F": "ma"}], "Whens": [{"F": "When"}, {"F": "s"}], "Kan.": [{"F": "Kan."}], "w.": [{"F": "w."}], "3p.m.": [{"F": "3"}, {"F": "p.m."}], "Whyre": [{"F": "Why"}, {"F": "re"}], "-_-": [{"F": "-_-"}], "12pm": [{"F": "12"}, {"L": "p.m.", "F": "pm"}], "Ltd.": [{"F": "Ltd."}], "wasnt": [{"F": "was"}, {"L": "not", "pos": "RB", "F": "nt"}], "Shedve": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Nebr.": [{"F": "Nebr."}], "o.": [{"F": "o."}], ";D": [{"F": ";D"}], "whys": [{"F": "why"}, {"F": "s"}], "Tenn.": [{"F": "Tenn."}], "She'd": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "'d"}], "Needn't": [{"F": "Need"}, {"L": "not", "pos": "RB", "F": "n't"}], "Hadnt": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "nt"}], "m.": [{"F": "m."}], "arent": [{"L": "be", "pos": "VBP", "F": "are", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "Arent": [{"L": "be", "pos": "VBP", "F": "Are", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "<33": [{"F": "<33"}], " ": [{"pos": "SP", "F": " "}], "you've": [{"L": "-PRON-", "F": "you"}, {"L": "have", "pos": "VB", "F": "'ve"}], "mightn't've": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Aug.": [{"F": "Aug."}], "=3": [{"F": "=3"}], "Miss.": [{"F": "Miss."}], "Jul.": [{"F": "Jul."}], 
"Werent": [{"F": "Were"}, {"L": "not", "pos": "RB", "F": "nt"}], "You'd": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "'d"}], "How's": [{"F": "How"}, {"F": "'s"}], "2a.m.": [{"F": "2"}, {"F": "a.m."}], "youre": [{"L": "-PRON-", "F": "you"}, {"F": "re"}], "hadn't've": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "12p.m.": [{"F": "12"}, {"F": "p.m."}], "Im": [{"L": "-PRON-", "F": "I"}, {"L": "be", "F": "m", "pos": "VBP", "tenspect": 1, "number": 1}], "not've": [{"L": "not", "pos": "RB", "F": "not"}, {"L": "have", "pos": "VB", "F": "'ve"}], "thats": [{"F": "that"}, {"F": "s"}], "Mustnt": [{"F": "Must"}, {"L": "not", "pos": "RB", "F": "nt"}], "what're": [{"F": "what"}, {"F": "'re"}], "How'll": [{"F": "How"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Conn.": [{"F": "Conn."}], "it's": [{"L": "-PRON-", "F": "it"}, {"F": "'s"}], "Can't": [{"L": "can", "pos": "MD", "F": "Ca"}, {"L": "not", "pos": "RB", "F": "n't"}], "'ol": [{"F": "'ol"}], "Mustve": [{"F": "Must"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldn't": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "n't"}], "Okla.": [{"F": "Okla."}], "what'll": [{"F": "what"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Whys": [{"F": "Why"}, {"F": "s"}], "it'll": [{"L": "-PRON-", "F": "it"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Mt.": [{"L": "Mount", "F": "Mt."}], "Itdve": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "couldve": [{"pos": "MD", "F": "could"}, {"L": "have", "pos": "VB", "F": "ve"}], "wholl": [{"F": "who"}, {"L": "will", "pos": "MD", "F": "ll"}], "I've": [{"L": "-PRON-", "F": "I"}, {"L": "have", "pos": "VB", "F": "'ve"}], "thered": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "d"}], "Theyll": [{"L": "-PRON-", "F": "They"}, {"L": "will", "pos": "MD", "F": "ll"}], "Neb.": [{"F": "Neb."}], "Who'll": [{"F": "Who"}, {"L": "will", "pos": "MD", "F": "'ll"}], "cannot": [{"L": "can", "pos": "MD", "F": "can"}, {"L": "not", "pos": "RB", "F": "not"}], ":(": [{"F": ":("}], "xD": [{"F": "xD"}], "10pm": [{"F": "10"}, {"L": "p.m.", "F": "pm"}], "couldnt": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "nt"}], "Would've": [{"F": "Would"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mightn't": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "n't"}], "5p.m.": [{"F": "5"}, {"F": "p.m."}], "youve": [{"L": "-PRON-", "F": "you"}, {"L": "have", "pos": "VB", "F": "ve"}], ":Y": [{"F": ":Y"}], "shedve": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "why's": [{"F": "why"}, {"F": "'s"}], "could've": [{"pos": "MD", "F": "could"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Neednt": [{"F": "Need"}, {"L": "not", "pos": "RB", "F": "nt"}], "vs.": [{"F": "vs."}], "Mont.": [{"F": "Mont."}], "Adm.": [{"F": "Adm."}], "Md.": [{"F": "Md."}], "That's": [{"F": "That"}, {"F": "'s"}], "Mar.": [{"F": "Mar."}], "they'll": [{"L": "-PRON-", "F": "they"}, {"L": "will", "pos": "MD", "F": "'ll"}], "b.": [{"F": "b."}], "Sep.": [{"F": "Sep."}], "whod": [{"F": "who"}, {"L": "would", "pos": "MD", "F": "d"}], "2pm": [{"F": "2"}, {"L": "p.m.", "F": "pm"}], "whyll": [{"F": "why"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadnt": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "nt"}], "There'd've": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "He'd": [{"L": "-PRON-", "F": 
"He"}, {"L": "would", "pos": "MD", "F": "'d"}], "theyre": [{"L": "-PRON-", "F": "they"}, {"F": "re"}], "Ms.": [{"F": "Ms."}], "there'd've": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "5a.m.": [{"F": "5"}, {"F": "a.m."}], "7am": [{"F": "7"}, {"L": "a.m.", "F": "am"}], "they'd've": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mich.": [{"F": "Mich."}], "cant": [{"L": "can", "pos": "MD", "F": "ca"}, {"L": "not", "pos": "RB", "F": "nt"}], "Va.": [{"F": "Va."}], "11pm": [{"F": "11"}, {"L": "p.m.", "F": "pm"}], "youll": [{"L": "-PRON-", "F": "you"}, {"L": "will", "pos": "MD", "F": "ll"}], "Isn't": [{"L": "be", "pos": "VBZ", "F": "Is"}, {"L": "not", "pos": "RB", "F": "n't"}], "i'd've": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Hadn't": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "n't"}], "why'll": [{"F": "why"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Jr.": [{"F": "Jr."}], "whove": [{"F": "who"}, {"L": "have", "pos": "VB", "F": "ve"}], "we'd've": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Youve": [{"L": "-PRON-", "F": "You"}, {"L": "have", "pos": "VB", "F": "ve"}], "He'll": [{"L": "-PRON-", "F": "He"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Wedve": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "9am": [{"F": "9"}, {"L": "a.m.", "F": "am"}], "Hed": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "d"}], "whatve": [{"F": "what"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ore.": [{"F": "Ore."}], "(:": [{"F": "(:"}], "Shouldnt": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "nt"}], "Wash.": [{"F": "Wash."}], "Weve": [{"F": "We"}, {"L": "have", "pos": "VB", "F": "ve"}], "N.J.": [{"F": "N.J."}], "Shouldntve": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "h.": [{"F": "h."}], "we'll": [{"F": "we"}, {"L": "will", "pos": "MD", "F": "'ll"}], "we've": [{"F": "we"}, {"L": "have", "pos": "VB", "F": "'ve"}], "doesnt": [{"L": "do", "pos": "VBZ", "F": "does"}, {"L": "not", "pos": "RB", "F": "nt"}], "who's": [{"F": "who"}, {"F": "'s"}], "he'd": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "'d"}], "Ain't": [{"L": "be", "pos": "VBP", "F": "Ai", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "I'd": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "'d"}], "theredve": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "She'll": [{"L": "-PRON-", "F": "She"}, {"L": "will", "pos": "MD", "F": "'ll"}], "They'd": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "'d"}], "\")": [{"F": "\")"}], "Couldve": [{"pos": "MD", "F": "Could"}, {"L": "have", "pos": "VB", "F": "ve"}], "Whyll": [{"F": "Why"}, {"L": "will", "pos": "MD", "F": "ll"}], "y.": [{"F": "y."}], "12a.m.": [{"F": "12"}, {"F": "a.m."}], "wouldnt": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "nt"}], "<3": [{"F": "<3"}], "\n": [{"pos": "SP", "F": "\n"}], "Whered": [{"F": "Where"}, {"L": "would", "pos": "MD", "F": "d"}], "I'm": [{"L": "-PRON-", "F": "I"}, {"L": "be", "F": "'m", "pos": "VBP", "tenspect": 1, "number": 1}], "Couldntve": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ive": [{"L": "-PRON-", "F": "I"}, {"L": "have", 
"pos": "VB", "F": "ve"}], "i'd": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "'d"}], "youd": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "d"}], "There'd": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "'d"}], "He's": [{"L": "-PRON-", "F": "He"}, {"F": "'s"}], "Mightntve": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "When's": [{"F": "When"}, {"F": "'s"}], "doesn't": [{"L": "do", "pos": "VBZ", "F": "does"}, {"L": "not", "pos": "RB", "F": "n't"}], "=[[": [{"F": "=[["}], "Youre": [{"L": "-PRON-", "F": "You"}, {"F": "re"}], "=]": [{"F": "=]"}], "You'll": [{"L": "-PRON-", "F": "You"}, {"L": "will", "pos": "MD", "F": "'ll"}], "=)": [{"F": "=)"}], "Pa.": [{"F": "Pa."}], "he'd've": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "You've": [{"L": "-PRON-", "F": "You"}, {"L": "have", "pos": "VB", "F": "'ve"}], "They'll": [{"L": "-PRON-", "F": "They"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Ky.": [{"F": "Ky."}], "c.": [{"F": "c."}], "I.E.": [{"F": "I.E."}], "V_V": [{"F": "V_V"}], "Didn't": [{"L": "do", "pos": "VBD", "F": "Did"}, {"L": "not", "pos": "RB", "F": "n't"}], "What've": [{"F": "What"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Weren't": [{"F": "Were"}, {"L": "not", "pos": "RB", "F": "n't"}], ":]": [{"F": ":]"}], "Notve": [{"L": "not", "pos": "RB", "F": "Not"}, {"L": "have", "pos": "VB", "F": "ve"}], "9a.m.": [{"F": "9"}, {"F": "a.m."}], "7pm": [{"F": "7"}, {"L": "p.m.", "F": "pm"}], "Sept.": [{"F": "Sept."}], "Bros.": [{"F": "Bros."}], "Howd": [{"F": "How"}, {"L": "would", "pos": "MD", "F": "d"}], "weren't": [{"F": "were"}, {"L": "not", "pos": "RB", "F": "n't"}], "Why's": [{"F": "Why"}, {"F": "'s"}], ":((": [{"F": ":(("}], "theyve": [{"L": "-PRON-", "F": "they"}, {"L": "have", "pos": "VB", "F": "ve"}], "where's": [{"F": "where"}, {"F": "'s"}], "ive": [{"L": "-PRON-", "F": "i"}, {"L": "have", "pos": "VB", "F": "ve"}], "=D": [{"F": "=D"}], "what've": [{"F": "what"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Whos": [{"F": "Who"}, {"F": "s"}], ":O": [{"F": ":O"}], "Shouldn't've": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "whatre": [{"F": "what"}, {"F": "re"}], "Wouldn't've": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "aren't": [{"L": "be", "pos": "VBP", "F": "are", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], ":)": [{"F": ":)"}], "They'd've": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}]} \ No newline at end of file +{ + "d.": [ + { + "F": "d." + } + ], + "Theydve": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ":/": [ + { + "F": ":/" + } + ], + "shouldn't've": [ + { + "F": "should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "10a.m.": [ + { + "F": "10" + }, + { + "F": "a.m." + } + ], + "E.G.": [ + { + "F": "E.G." + } + ], + "howll": [ + { + "F": "how" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "6a.m.": [ + { + "F": "6" + }, + { + "F": "a.m." + } + ], + "Ore.": [ + { + "F": "Ore." 
+ } + ], + "Hadn't've": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ":>": [ + { + "F": ":>" + } + ], + "3p.m.": [ + { + "F": "3" + }, + { + "F": "p.m." + } + ], + "who'll": [ + { + "F": "who" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "5a.m.": [ + { + "F": "5" + }, + { + "F": "a.m." + } + ], + ":(": [ + { + "F": ":(" + } + ], + ":0": [ + { + "F": ":0" + } + ], + ":)": [ + { + "F": ":)" + } + ], + "aint": [ + { + "F": "ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + " ": [ + { + "pos": "SP", + "F": " " + } + ], + "Dec.": [ + { + "F": "Dec." + } + ], + "Shouldnt": [ + { + "F": "Should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Ky.": [ + { + "F": "Ky." + } + ], + "when's": [ + { + "F": "when" + }, + { + "F": "'s" + } + ], + "Didnt": [ + { + "F": "Did", + "L": "do", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "itll": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Who're": [ + { + "F": "Who" + }, + { + "F": "'re" + } + ], + "=D": [ + { + "F": "=D" + } + ], + "Ain't": [ + { + "F": "Ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Can't": [ + { + "F": "Ca", + "L": "can", + "pos": "MD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Whyre": [ + { + "F": "Why" + }, + { + "F": "re" + } + ], + "Aren't": [ + { + "F": "Are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Neednt": [ + { + "F": "Need" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "should've": [ + { + "F": "should" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "shouldn't": [ + { + "F": "should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Idve": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "weve": [ + { + "F": "we" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Va.": [ + { + "F": "Va." + } + ], + "D.C.": [ + { + "F": "D.C." + } + ], + "3am": [ + { + "F": "3" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Ive": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Md.": [ + { + "F": "Md." + } + ], + ";D": [ + { + "F": ";D" + } + ], + "Mrs.": [ + { + "F": "Mrs." + } + ], + "Minn.": [ + { + "F": "Minn." + } + ], + "they'd": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Youdve": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "theyve": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Weren't": [ + { + "F": "Were" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "werent": [ + { + "F": "were" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whyre": [ + { + "F": "why" + }, + { + "F": "re" + } + ], + "g.": [ + { + "F": "g." 
+ } + ], + "I'm": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "pos": "VBP", + "F": "'m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + ":p": [ + { + "F": ":p" + } + ], + "She'd've": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "not've": [ + { + "F": "not", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "we'll": [ + { + "F": "we" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + ":O": [ + { + "F": ":O" + } + ], + "<33": [ + { + "F": "<33" + } + ], + "Don't": [ + { + "L": "do", + "F": "Do" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Whyll": [ + { + "F": "Why" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "''": [ + { + "F": "''" + } + ], + "they've": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "t.": [ + { + "F": "t." + } + ], + "wasn't": [ + { + "F": "was" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "could've": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "what've": [ + { + "F": "what" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "havent": [ + { + "pos": "VB", + "F": "have" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Who've": [ + { + "F": "Who" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ":P": [ + { + "F": ":P" + } + ], + "Shan't": [ + { + "F": "Sha" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "i'll": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "i.e.": [ + { + "F": "i.e." + } + ], + "you'd": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "w.": [ + { + "F": "w." + } + ], + "whens": [ + { + "F": "when" + }, + { + "F": "s" + } + ], + "whys": [ + { + "F": "why" + }, + { + "F": "s" + } + ], + "6pm": [ + { + "F": "6" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "4p.m.": [ + { + "F": "4" + }, + { + "F": "p.m." + } + ], + "Whereve": [ + { + "F": "Where" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "o_o": [ + { + "F": "o_o" + } + ], + "Mo.": [ + { + "F": "Mo." + } + ], + "Kan.": [ + { + "F": "Kan." + } + ], + "there'd": [ + { + "F": "there" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "N.H.": [ + { + "F": "N.H." + } + ], + "(^_^)": [ + { + "F": "(^_^)" + } + ], + "Mont.": [ + { + "F": "Mont." + } + ], + "hadn't've": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "whatll": [ + { + "F": "what" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "wouldn't've": [ + { + "F": "would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "there's": [ + { + "F": "there" + }, + { + "F": "'s" + } + ], + "Who'll": [ + { + "F": "Who" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "o_O": [ + { + "F": "o_O" + } + ], + "Nev.": [ + { + "F": "Nev." + } + ], + "youll": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "wouldve": [ + { + "F": "would" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Nov.": [ + { + "F": "Nov." + } + ], + "z.": [ + { + "F": "z." 
+ } + ], + "xDD": [ + { + "F": "xDD" + } + ], + "Sen.": [ + { + "F": "Sen." + } + ], + "Wouldnt": [ + { + "F": "Would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Thered": [ + { + "F": "There" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Youre": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "re" + } + ], + "Couldn't've": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "who're": [ + { + "F": "who" + }, + { + "F": "'re" + } + ], + "Whys": [ + { + "F": "Why" + }, + { + "F": "s" + } + ], + "mightn't've": [ + { + "F": "might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Wholl": [ + { + "F": "Who" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "hadn't": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Havent": [ + { + "pos": "VB", + "F": "Have" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Whatve": [ + { + "F": "What" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "2pm": [ + { + "F": "2" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "o.O": [ + { + "F": "o.O" + } + ], + "Thats": [ + { + "F": "That" + }, + { + "F": "s" + } + ], + "Gov.": [ + { + "F": "Gov." + } + ], + "Howll": [ + { + "F": "How" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "p.": [ + { + "F": "p." + } + ], + "wouldn't": [ + { + "F": "would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "9pm": [ + { + "F": "9" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "You'll": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Ala.": [ + { + "F": "Ala." + } + ], + "12am": [ + { + "F": "12" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "=]": [ + { + "F": "=]" + } + ], + "Cant": [ + { + "F": "Ca", + "L": "can", + "pos": "MD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "i'd": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "a.m.": [ + { + "F": "a.m." + } + ], + "weren't": [ + { + "F": "were" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "would've": [ + { + "F": "would" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "i'm": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "pos": "VBP", + "F": "'m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "why'll": [ + { + "F": "why" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "we'd've": [ + { + "F": "we" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Shouldve": [ + { + "F": "Should" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "can't": [ + { + "F": "ca", + "L": "can", + "pos": "MD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "thats": [ + { + "F": "that" + }, + { + "F": "s" + } + ], + "1p.m.": [ + { + "F": "1" + }, + { + "F": "p.m." + } + ], + "12a.m.": [ + { + "F": "12" + }, + { + "F": "a.m." + } + ], + "Hes": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "s" + } + ], + "Needn't": [ + { + "F": "Need" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "It's": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'s" + } + ], + "St.": [ + { + "F": "St." 
+ } + ], + "Why're": [ + { + "F": "Why" + }, + { + "F": "'re" + } + ], + ":(((": [ + { + "F": ":(((" + } + ], + "Hed": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Mt.": [ + { + "L": "Mount", + "F": "Mt." + } + ], + "couldn't": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "What've": [ + { + "F": "What" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "4a.m.": [ + { + "F": "4" + }, + { + "F": "a.m." + } + ], + "Ind.": [ + { + "F": "Ind." + } + ], + "It'd": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "<3": [ + { + "F": "<3" + } + ], + "theydve": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "aren't": [ + { + "F": "are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Mightn't": [ + { + "F": "Might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "'S": [ + { + "L": "'s", + "F": "'S" + } + ], + "I've": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whered": [ + { + "F": "Where" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Itdve": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "I'ma": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ma" + } + ], + "whos": [ + { + "F": "who" + }, + { + "F": "s" + } + ], + "They'd": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "What'll": [ + { + "F": "What" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + ":Y": [ + { + "F": ":Y" + } + ], + "You've": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mustve": [ + { + "F": "Must" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "whod": [ + { + "F": "who" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "mightntve": [ + { + "F": "might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "I'd've": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Must've": [ + { + "F": "Must" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "it'd": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Ark.": [ + { + "F": "Ark." + } + ], + "Wis.": [ + { + "F": "Wis." + } + ], + "6p.m.": [ + { + "F": "6" + }, + { + "F": "p.m." + } + ], + "what're": [ + { + "F": "what" + }, + { + "F": "'re" + } + ], + "N.C.": [ + { + "F": "N.C." + } + ], + "Wasn't": [ + { + "F": "Was" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "what's": [ + { + "F": "what" + }, + { + "F": "'s" + } + ], + "he'd've": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Jan.": [ + { + "F": "Jan." 
+ } + ], + "She'd": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "shedve": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Tenn.": [ + { + "F": "Tenn." + } + ], + "ain't": [ + { + "F": "ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "She's": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'s" + } + ], + "i'd've": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "2a.m.": [ + { + "F": "2" + }, + { + "F": "a.m." + } + ], + "We'd've": [ + { + "F": "We" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "must've": [ + { + "F": "must" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "That's": [ + { + "F": "That" + }, + { + "F": "'s" + } + ], + "Sept.": [ + { + "F": "Sept." + } + ], + "whatre": [ + { + "F": "what" + }, + { + "F": "re" + } + ], + "you'd've": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Dont": [ + { + "L": "do", + "F": "Do" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "i.": [ + { + "F": "i." + } + ], + "Jun.": [ + { + "F": "Jun." + } + ], + "thered": [ + { + "F": "there" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Youd": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "couldn't've": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whens": [ + { + "F": "When" + }, + { + "F": "s" + } + ], + "8a.m.": [ + { + "F": "8" + }, + { + "F": "a.m." + } + ], + "Isnt": [ + { + "F": "Is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "mightve": [ + { + "F": "might" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "'ol": [ + { + "F": "'ol" + } + ], + "2p.m.": [ + { + "F": "2" + }, + { + "F": "p.m." + } + ], + "9a.m.": [ + { + "F": "9" + }, + { + "F": "a.m." + } + ], + "q.": [ + { + "F": "q." + } + ], + "didnt": [ + { + "F": "did", + "L": "do", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "ive": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "It'd've": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "e.g.": [ + { + "F": "e.g." + } + ], + ":]": [ + { + "F": ":]" + } + ], + "\t": [ + { + "pos": "SP", + "F": "\t" + } + ], + "Mich.": [ + { + "F": "Mich." + } + ], + "Itll": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "didn't": [ + { + "F": "did", + "L": "do", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "3pm": [ + { + "F": "3" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "Jul.": [ + { + "F": "Jul." + } + ], + "7pm": [ + { + "F": "7" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "cant": [ + { + "F": "ca", + "L": "can", + "pos": "MD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Miss.": [ + { + "F": "Miss." 
+ } + ], + "im": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "pos": "VBP", + "F": "m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "Ariz.": [ + { + "F": "Ariz." + } + ], + "they'd've": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "f.": [ + { + "F": "f." + } + ], + "Co.": [ + { + "F": "Co." + } + ], + "Hadntve": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Weve": [ + { + "F": "We" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "1a.m.": [ + { + "F": "1" + }, + { + "F": "a.m." + } + ], + "=3": [ + { + "F": "=3" + } + ], + "Mightnt": [ + { + "F": "Might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "1pm": [ + { + "F": "1" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "youdve": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Shedve": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "theyd": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Ill.": [ + { + "F": "Ill." + } + ], + "N.D.": [ + { + "F": "N.D." + } + ], + "Cannot": [ + { + "F": "Can", + "L": "can", + "pos": "MD" + }, + { + "F": "not", + "L": "not", + "pos": "RB" + } + ], + "s.": [ + { + "F": "s." + } + ], + "Hadn't": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "What're": [ + { + "F": "What" + }, + { + "F": "'re" + } + ], + "He'll": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "wholl": [ + { + "F": "who" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "They're": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'re" + } + ], + "Neb.": [ + { + "F": "Neb." + } + ], + "shouldnt": [ + { + "F": "should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "\n": [ + { + "pos": "SP", + "F": "\n" + } + ], + "whered": [ + { + "F": "where" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "7a.m.": [ + { + "F": "7" + }, + { + "F": "a.m." + } + ], + "youve": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "4am": [ + { + "F": "4" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "v.": [ + { + "F": "v." 
+ } + ], + "notve": [ + { + "F": "not", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldve": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "mustve": [ + { + "F": "must" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Youve": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "might've": [ + { + "F": "might" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mustn't": [ + { + "F": "Must" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "wheres": [ + { + "F": "where" + }, + { + "F": "s" + } + ], + "they're": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'re" + } + ], + "idve": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "hows": [ + { + "F": "how" + }, + { + "F": "s" + } + ], + "Fla.": [ + { + "F": "Fla." + } + ], + "N.M.": [ + { + "F": "N.M." + } + ], + "youre": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "re" + } + ], + "Didn't": [ + { + "F": "Did", + "L": "do", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Couldve": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "10p.m.": [ + { + "F": "10" + }, + { + "F": "p.m." + } + ], + "Del.": [ + { + "F": "Del." + } + ], + "Oct.": [ + { + "F": "Oct." + } + ], + "Rep.": [ + { + "F": "Rep." + } + ], + "cannot": [ + { + "F": "can", + "L": "can", + "pos": "MD" + }, + { + "F": "not", + "L": "not", + "pos": "RB" + } + ], + "Im": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "pos": "VBP", + "F": "m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "howd": [ + { + "F": "how" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Okla.": [ + { + "F": "Okla." + } + ], + "Feb.": [ + { + "F": "Feb." + } + ], + "you've": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "You're": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'re" + } + ], + "she'll": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Theyll": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "don't": [ + { + "L": "do", + "F": "do" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "itd": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + ":-)": [ + { + "F": ":-)" + } + ], + "Hedve": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "isnt": [ + { + "F": "is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "won't": [ + { + "F": "wo" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "We're": [ + { + "F": "We" + }, + { + "F": "'re" + } + ], + "^_^": [ + { + "F": "^_^" + } + ], + "I.e.": [ + { + "F": "I.e." + } + ], + "9p.m.": [ + { + "F": "9" + }, + { + "F": "p.m." 
+ } + ], + "dont": [ + { + "L": "do", + "F": "do" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "ima": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "ma" + } + ], + "he's": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'s" + } + ], + "we've": [ + { + "F": "we" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "What's": [ + { + "F": "What" + }, + { + "F": "'s" + } + ], + "Who's": [ + { + "F": "Who" + }, + { + "F": "'s" + } + ], + "-__-": [ + { + "F": "-__-" + } + ], + "hedve": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "he'd": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "When's": [ + { + "F": "When" + }, + { + "F": "'s" + } + ], + "Mightn't've": [ + { + "F": "Might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "We've": [ + { + "F": "We" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Wash.": [ + { + "F": "Wash." + } + ], + "Couldntve": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Who'd": [ + { + "F": "Who" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + ":-/": [ + { + "F": ":-/" + } + ], + "haven't": [ + { + "pos": "VB", + "F": "have" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Gen.": [ + { + "F": "Gen." + } + ], + "(:": [ + { + "F": "(:" + } + ], + "arent": [ + { + "F": "are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "You'd've": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "c.": [ + { + "F": "c." + } + ], + "(=": [ + { + "F": "(=" + } + ], + "Wouldn't": [ + { + "F": "Would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "who's": [ + { + "F": "who" + }, + { + "F": "'s" + } + ], + "12p.m.": [ + { + "F": "12" + }, + { + "F": "p.m." + } + ], + "5am": [ + { + "F": "5" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Mightve": [ + { + "F": "Might" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ":((": [ + { + "F": ":((" + } + ], + "theredve": [ + { + "F": "there" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Messrs.": [ + { + "F": "Messrs." + } + ], + "who'd": [ + { + "F": "who" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Where's": [ + { + "F": "Where" + }, + { + "F": "'s" + } + ], + "wont": [ + { + "F": "wo" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "she'd've": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "10pm": [ + { + "F": "10" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "Corp.": [ + { + "F": "Corp." + } + ], + "Aug.": [ + { + "F": "Aug." + } + ], + "-_-": [ + { + "F": "-_-" + } + ], + "y.": [ + { + "F": "y." + } + ], + "Should've": [ + { + "F": "Should" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "11pm": [ + { + "F": "11" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "8am": [ + { + "F": "8" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "theyre": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "re" + } + ], + "l.": [ + { + "F": "l." 
+ } + ], + "Wouldntve": [ + { + "F": "Would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Ga.": [ + { + "F": "Ga." + } + ], + "1am": [ + { + "F": "1" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Where've": [ + { + "F": "Where" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "11a.m.": [ + { + "F": "11" + }, + { + "F": "a.m." + } + ], + "mustn't": [ + { + "F": "must" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "isn't": [ + { + "F": "is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Bros.": [ + { + "F": "Bros." + } + ], + "Aint": [ + { + "F": "Ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "why's": [ + { + "F": "why" + }, + { + "F": "'s" + } + ], + "V_V": [ + { + "F": "V_V" + } + ], + ";p": [ + { + "F": ";p" + } + ], + "There'd": [ + { + "F": "There" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "They'll": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "=)": [ + { + "F": "=)" + } + ], + "b.": [ + { + "F": "b." + } + ], + "how'll": [ + { + "F": "how" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Wedve": [ + { + "F": "We" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldntve": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "12pm": [ + { + "F": "12" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "There's": [ + { + "F": "There" + }, + { + "F": "'s" + } + ], + "we'd": [ + { + "F": "we" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Dr.": [ + { + "F": "Dr." + } + ], + "Whod": [ + { + "F": "Who" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + ":-P": [ + { + "F": ":-P" + } + ], + "whatve": [ + { + "F": "what" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Wouldve": [ + { + "F": "Would" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "o.": [ + { + "F": "o." + } + ], + ":')": [ + { + "F": ":')" + } + ], + "needn't": [ + { + "F": "need" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "shouldntve": [ + { + "F": "should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "why're": [ + { + "F": "why" + }, + { + "F": "'re" + } + ], + "p.m.": [ + { + "F": "p.m." + } + ], + "Doesnt": [ + { + "F": "Does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whereve": [ + { + "F": "where" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "they'll": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "I'd": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Might've": [ + { + "F": "Might" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "mightnt": [ + { + "F": "might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Kans.": [ + { + "F": "Kans." + } + ], + "Not've": [ + { + "F": "Not", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "e.": [ + { + "F": "e." 
+ } + ], + "mightn't": [ + { + "F": "might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "you're": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'re" + } + ], + "Mar.": [ + { + "F": "Mar." + } + ], + "They've": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "\")": [ + { + "F": "\")" + } + ], + "what'll": [ + { + "F": "what" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Calif.": [ + { + "F": "Calif." + } + ], + "Could've": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Would've": [ + { + "F": "Would" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ";)": [ + { + "F": ";)" + } + ], + ";(": [ + { + "F": ";(" + } + ], + "Isn't": [ + { + "F": "Is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "let's": [ + { + "F": "let" + }, + { + "F": "'s" + } + ], + "'em": [ + { + "F": "'em" + } + ], + "She'll": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "I.E.": [ + { + "F": "I.E." + } + ], + "You'd": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "wouldnt": [ + { + "F": "would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "6am": [ + { + "F": "6" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "11am": [ + { + "F": "11" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Why'll": [ + { + "F": "Why" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Where'd": [ + { + "F": "Where" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Theyre": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "re" + } + ], + "11p.m.": [ + { + "F": "11" + }, + { + "F": "p.m." + } + ], + "Won't": [ + { + "F": "Wo" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Couldn't": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "it's": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'s" + } + ], + "r.": [ + { + "F": "r." + } + ], + "it'll": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "They'd've": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Ima": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "ma" + } + ], + "5pm": [ + { + "F": "5" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "10am": [ + { + "F": "10" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "m.": [ + { + "F": "m." + } + ], + "whats": [ + { + "F": "what" + }, + { + "F": "s" + } + ], + "How's": [ + { + "F": "How" + }, + { + "F": "'s" + } + ], + "Sep.": [ + { + "F": "Sep." + } + ], + "Shouldntve": [ + { + "F": "Should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "youd": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Whatll": [ + { + "F": "What" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Wouldn't've": [ + { + "F": "Would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "How'd": [ + { + "F": "How" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "doesnt": [ + { + "F": "does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "h.": [ + { + "F": "h." 
+ } + ], + "Shouldn't": [ + { + "F": "Should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "He'd've": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mightntve": [ + { + "F": "Might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldnt": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Haven't": [ + { + "pos": "VB", + "F": "Have" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "<333": [ + { + "F": "<333" + } + ], + "doesn't": [ + { + "F": "does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Hasn't": [ + { + "F": "Has" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "how's": [ + { + "F": "how" + }, + { + "F": "'s" + } + ], + "hes": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "s" + } + ], + "=[[": [ + { + "F": "=[[" + } + ], + "xD": [ + { + "F": "xD" + } + ], + "he'll": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "hed": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "7p.m.": [ + { + "F": "7" + }, + { + "F": "p.m." + } + ], + "how'd": [ + { + "F": "how" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "u.": [ + { + "F": "u." + } + ], + "we're": [ + { + "F": "we" + }, + { + "F": "'re" + } + ], + "vs.": [ + { + "F": "vs." + } + ], + "Hadnt": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Shant": [ + { + "F": "Sha" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Theyve": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Hows": [ + { + "F": "How" + }, + { + "F": "s" + } + ], + "We'll": [ + { + "F": "We" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "N.Y.": [ + { + "F": "N.Y." + } + ], + "x.": [ + { + "F": "x." + } + ], + "8p.m.": [ + { + "F": "8" + }, + { + "F": "p.m." + } + ], + "i've": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whove": [ + { + "F": "Who" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "2am": [ + { + "F": "2" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "La.": [ + { + "F": "La." + } + ], + "i'ma": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ma" + } + ], + "N.J.": [ + { + "F": "N.J." + } + ], + "Nebr.": [ + { + "F": "Nebr." + } + ], + "Howd": [ + { + "F": "How" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "hadnt": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "shant": [ + { + "F": "sha" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "There'd've": [ + { + "F": "There" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Inc.": [ + { + "F": "Inc." + } + ], + "I'll": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Why's": [ + { + "F": "Why" + }, + { + "F": "'s" + } + ], + "Adm.": [ + { + "F": "Adm." + } + ], + "Shouldn't've": [ + { + "F": "Should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "n.": [ + { + "F": "n." 
+ } + ], + "Wasnt": [ + { + "F": "Was" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whove": [ + { + "F": "who" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ";-p": [ + { + "F": ";-p" + } + ], + "hasn't": [ + { + "F": "has" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "wouldntve": [ + { + "F": "would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Wheres": [ + { + "F": "Where" + }, + { + "F": "s" + } + ], + "How'll": [ + { + "F": "How" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "there'd've": [ + { + "F": "there" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whos": [ + { + "F": "Who" + }, + { + "F": "s" + } + ], + "shes": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "s" + } + ], + "Doesn't": [ + { + "F": "Does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Arent": [ + { + "F": "Are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Hasnt": [ + { + "F": "Has" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "j.": [ + { + "F": "j." + } + ], + "He's": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'s" + } + ], + "wasnt": [ + { + "F": "was" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whyll": [ + { + "F": "why" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "co.": [ + { + "F": "co." + } + ], + "mustnt": [ + { + "F": "must" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "He'd": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "3a.m.": [ + { + "F": "3" + }, + { + "F": "a.m." + } + ], + "Shes": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "s" + } + ], + "where've": [ + { + "F": "where" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Youll": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Apr.": [ + { + "F": "Apr." + } + ], + "Conn.": [ + { + "F": "Conn." + } + ], + "8pm": [ + { + "F": "8" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "9am": [ + { + "F": "9" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "hasnt": [ + { + "F": "has" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "theyll": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "it'd've": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "itdve": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Jr.": [ + { + "F": "Jr." + } + ], + "Rev.": [ + { + "F": "Rev." + } + ], + "k.": [ + { + "F": "k." + } + ], + "wedve": [ + { + "F": "we" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Let's": [ + { + "F": "Let" + }, + { + "F": "'s" + } + ], + "Colo.": [ + { + "F": "Colo." + } + ], + "Mr.": [ + { + "F": "Mr." 
+ } + ], + "Werent": [ + { + "F": "Were" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Theredve": [ + { + "F": "There" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "shan't": [ + { + "F": "sha" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + ";-)": [ + { + "F": ";-)" + } + ], + "Wont": [ + { + "F": "Wo" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "hadntve": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "who've": [ + { + "F": "who" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whatre": [ + { + "F": "What" + }, + { + "F": "re" + } + ], + "'s": [ + { + "L": "'s", + "F": "'s" + } + ], + "where'd": [ + { + "F": "where" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "shouldve": [ + { + "F": "should" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "a.": [ + { + "F": "a." + } + ], + "where's": [ + { + "F": "where" + }, + { + "F": "'s" + } + ], + "Ltd.": [ + { + "F": "Ltd." + } + ], + "Mass.": [ + { + "F": "Mass." + } + ], + "neednt": [ + { + "F": "need" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Pa.": [ + { + "F": "Pa." + } + ], + "It'll": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "7am": [ + { + "F": "7" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "We'd": [ + { + "F": "We" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Whats": [ + { + "F": "What" + }, + { + "F": "s" + } + ], + "\u2014": [ + { + "pos": ":", + "L": "--", + "F": "\u2014" + } + ], + "E.g.": [ + { + "F": "E.g." + } + ], + "Ms.": [ + { + "F": "Ms." + } + ], + ":3": [ + { + "F": ":3" + } + ], + "5p.m.": [ + { + "F": "5" + }, + { + "F": "p.m." + } + ], + "Itd": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "May.": [ + { + "F": "May." 
+ } + ], + "she'd": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Mustnt": [ + { + "F": "Must" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Notve": [ + { + "F": "Not", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "you'll": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Theyd": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "she's": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'s" + } + ], + "Couldnt": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "that's": [ + { + "F": "that" + }, + { + "F": "'s" + } + ], + "4pm": [ + { + "F": "4" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + ":))": [ + { + "F": ":))" + } + ] +} \ No newline at end of file From caff4638c986411c2c7b6de0230fd53fa862cad3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:08:12 +0200 Subject: [PATCH 42/62] * Fix website/test_api.py for Python 3 --- tests/website/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index ef0365d88..c173c2b74 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -80,7 +80,7 @@ def test_read_bytes(nlp): file_.write(nlp(u'This is a document.').to_bytes()) file_.write(nlp(u'This is another.').to_bytes()) docs = [] - with open(loc) as file_: + with open(loc, 'rb') as file_: for byte_string in Doc.read_bytes(file_): docs.append(Doc(nlp.vocab).from_bytes(byte_string)) assert len(docs) == 2 From 1f90502ce8fec29786ae7c65de2e0e5391bdd931 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:08:31 +0200 Subject: [PATCH 43/62] * Fix website/test_home for Python 3 --- tests/website/test_home.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 6c97b0f31..4da61becf 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -17,7 +17,7 @@ def test_load_resources_and_process_text(): @pytest.mark.models def test_get_tokens_and_sentences(doc): token = doc[0] - sentence = doc.sents.next() + sentence = next(doc.sents) assert token is sentence[0] assert sentence.text == 'Hello, world.' 
From f35632e2e584ca6979dd8efac8868602c056b04c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:08:58 +0200 Subject: [PATCH 44/62] * Remove SBD print statement in train, after SBD evaluation was removed from Scorer --- bin/parser/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index f2e153c29..c1f81af33 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -229,7 +229,6 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos print('POS', scorer.tags_acc) print('UAS', scorer.uas) print('LAS', scorer.las) - print('SBD', scorer.sbd_acc) print('NER P', scorer.ents_p) print('NER R', scorer.ents_r) From 5682439d1e8780802d426aa5ac0e7da104fa521c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 20:24:21 +1100 Subject: [PATCH 45/62] * Remove em dash test from test_lemmatizer, as em dashes are now handled in specials.json --- tests/tagger/test_lemmatizer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index df553c3d6..2dec62c4a 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -41,8 +41,3 @@ def test_smart_quotes(lemmatizer): do = lemmatizer.punct assert do('“') == set(['``']) assert do('“') == set(['``']) - - -def test_smart_quotes(lemmatizer): - do = lemmatizer.punct - assert do('–') == set(["--"]) From 599f739ddb43a73c1466144734ea706bdf1bf6bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 20:51:28 +1100 Subject: [PATCH 46/62] * Fix smart quote lemma test --- tests/tagger/test_lemmatizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index 2dec62c4a..ff10b6573 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -39,5 +39,5 @@ def test_noun_lemmas(lemmatizer): def test_smart_quotes(lemmatizer): do = lemmatizer.punct - assert do('“') == set(['``']) - assert do('“') == set(['``']) + assert do('“') == set(['"']) + assert do('“') == set(['"']) From 3bf50ab83036eec076e4df07ff56a3d7730f81c4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 20:57:47 +1100 Subject: [PATCH 47/62] * Ensure the fabfile prebuild command installs pytest --- fabfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabfile.py b/fabfile.py index 953c02e00..b7ef6f18f 100644 --- a/fabfile.py +++ b/fabfile.py @@ -47,7 +47,7 @@ def prebuild(build_dir='/tmp/build_spacy'): local('git clone %s .' % spacy_dir) local('virtualenv ' + build_venv) with prefix('cd %s && PYTHONPATH=`pwd` && . 
%s/bin/activate' % (build_dir, build_venv)): - local('pip install cython fabric fabtools') + local('pip install cython fabric fabtools pytest') local('pip install -r requirements.txt') local('fab clean make') local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir) From af8d0a2a0901bb8faf77f5a74209fa102f261995 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 12:42:41 +0200 Subject: [PATCH 48/62] * Increment version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0c05d890b..e386925b6 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.93' +VERSION = '0.94' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From a3dfe2b90128fcbb549400c390f27ca01fede09b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:26:17 +0200 Subject: [PATCH 49/62] * Increment data version --- spacy/en/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 20e7b5b95..01c87a4e4 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -7,7 +7,7 @@ import wget import plac # TODO: Read this from the same source as the setup -VERSION = '0.9.0' +VERSION = '0.9.1' AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com' From 876fc99c44f674c61ec4a43e4cd5173a6ea2e3d3 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 16:11:56 +0200 Subject: [PATCH 50/62] cleanup looks like this file was accidentally added --- website/src/jade/home/_installation.jade~ | 83 ----------------------- 1 file changed, 83 deletions(-) delete mode 100644 website/src/jade/home/_installation.jade~ diff --git a/website/src/jade/home/_installation.jade~ b/website/src/jade/home/_installation.jade~ deleted file mode 100644 index 9b6b4fa3f..000000000 --- a/website/src/jade/home/_installation.jade~ +++ /dev/null @@ -1,83 +0,0 @@ -mixin Option(name, open) - details(open=open) - summary - h4= name - block - -article.post - header - h2 #[a(href=Meta.url) - - p What's new in v0.90? - - .subhead by #[a(href="//twitter.com/spacy_io", rel="author" target="_blank") #{spaCy}] on #[time #{getDate(Meta.date).fulldate}] - - ul - li Support for gazetteers - li Set Lexeme attributes - #[a.readmore(href=Meta.url) Full Change Log ►] - - -section.intro - p What's - -+Option("conda", true) - pre.language-bash: code - | $ conda install spacy - | $ python -m spacy.en.download - -+Option("pip and virtualenv", true) - p With Python 2.7 or Python 3, using Linux or OSX, run: - - pre.language-bash: code - | $ pip install spacy - | $ python -m spacy.en.download - - p - | The download command fetches and installs about 300mb of data, for - | the parser model and word vectors, which it installs within the spacy.en - | package directory. - - - +Option("Workaround for obsolete system Python", false) - p - | If you're stuck using a server with an old version of Python, and you - | don't have root access, I've prepared a bootstrap script to help you - | compile a local Python install. Run: - - pre.language-bash: code - | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate - - - -+Option("Compile from source", false) - p - | The other way to install the package is to clone the github repository, - | and build it from source. This installs an additional dependency, - | Cython. 
If you're using Python 2, I also recommend installing fabric - | and fabtools – this is how I build the project. - - pre.language-bash: code - | $ git clone https://github.com/honnibal/spaCy.git - | $ cd spaCy - | $ virtualenv .env && source .env/bin/activate - | $ export PYTHONPATH=`pwd` - | $ pip install -r requirements.txt - | $ python setup.py build_ext --inplace - | $ python -m spacy.en.download - | $ pip install pytest - | $ py.test tests/ - - p - | Python packaging is awkward at the best of times, and it's particularly tricky - | with C extensions, built via Cython, requiring large data files. So, - | please report issues as you encounter them. - -+Option("pypy (Unsupported)") - | If PyPy support is a priority for you, please get in touch. We could likely - | fix the remaining issues, if necessary. However, the library is likely to - | be much slower on PyPy, as it's written in Cython, which produces code tuned - | for the performance of CPython. - -+Option("Windows (Unsupported)") - | Unfortunately we don't currently support Windows. From 88b2f7ea5d51a57d3644ce2508b9d9c26913aead Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 16:30:23 +0200 Subject: [PATCH 51/62] push version and add spacy channel --- website/src/jade/home/_installation.jade | 5 +++++ website/src/jade/home/index.jade | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/website/src/jade/home/_installation.jade b/website/src/jade/home/_installation.jade index 7a9a14bd5..c0e0b1445 100644 --- a/website/src/jade/home/_installation.jade +++ b/website/src/jade/home/_installation.jade @@ -20,6 +20,11 @@ mixin Option(name, open) | $ conda install spacy | $ python -m spacy.en.download all + p Latest stable conda packages are available from the spacy channel: + + pre.language-bash: code + | $ conda install -c https://conda.anaconda.org/spacy spacy + +Option("pip and virtualenv", true) p With Python 2.7 or Python 3, using Linux or OSX, ensure that you have the following packages installed: diff --git a/website/src/jade/home/index.jade b/website/src/jade/home/index.jade index f95f4fd53..89635b180 100644 --- a/website/src/jade/home/index.jade +++ b/website/src/jade/home/index.jade @@ -29,7 +29,7 @@ include ../header.jade li: a.button(href="#example-use") Examples li: a.button(href="#install") | Install - v0.93 + v0.94 article.page.landing-page +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade") From 7a47c0c872f3886c08d3abcb8dd92ef654019817 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 16:37:57 +0200 Subject: [PATCH 52/62] push version --- website/src/jade/home/index.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/jade/home/index.jade b/website/src/jade/home/index.jade index 89635b180..a77dd323c 100644 --- a/website/src/jade/home/index.jade +++ b/website/src/jade/home/index.jade @@ -35,4 +35,4 @@ include ../header.jade +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade") +Section("Online Demo", "online-demo", "./_online_demo.jade") +Section("Usage by Example", "example-use", "./_usage_examples.jade") - +Section("Install v0.93", "install", "./_installation.jade") + +Section("Install v0.94", "install", "./_installation.jade") From 7e7f28e1fd57ea3c0877d14cbf8f11b1dc397296 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 14:06:09 +1100 Subject: [PATCH 53/62] * Add smart-quote possessive marker in generate_specials --- lang_data/en/generate_specials.py | 2 ++ 1 file changed, 
2 insertions(+) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index db3827593..9ebd94a52 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -115,6 +115,8 @@ hardcoded_specials = { "'s": [{"F": "'s", "L": "'s"}], "'S": [{"F": "'S", "L": "'s"}], + u"\u2018s": [{"F": u"\u2018s", "L": "'s"}], + u"\u2018S": [{"F": u"\u2018S", "L": "'s"}], "'em": [{"F": "'em"}], From 57b3cd466163d747efde130e18f343f3c7314597 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 14:06:46 +1100 Subject: [PATCH 54/62] * Add smart-quotes to lemma rules --- lang_data/en/lemma_rules.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 1d7366f92..30a19be50 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -31,6 +31,8 @@ "punct": [ ["“", "\""], - ["”", "\""] + ["”", "\""], + ['\u2018', "'"], + ['\u2019', "'"] ] } From 30de4135c98b092c953fd2c16a6e47fce2d17fc6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 14:22:32 +1100 Subject: [PATCH 55/62] * Fix merge problem --- spacy/strings.pyx | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a247fa6a8..4b47f5a82 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,9 +1,5 @@ -<<<<<<< HEAD from __future__ import unicode_literals -import codecs -======= import io ->>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 @@ -133,25 +129,15 @@ cdef class StringStore: def dump(self, loc): cdef Utf8Str* string -<<<<<<< HEAD cdef unicode py_string cdef int i - with codecs.open(loc, 'w', 'utf8') as file_: + with io.open(loc, 'w', 'utf8') as file_: for i in range(1, self.size): string = &self.c[i] py_string = _decode(string) file_.write(py_string) if (i+1) != self.size: file_.write(SEPARATOR) -======= - cdef bytes py_string - for i in range(self.size): - string = &self.strings[i] - py_string = string.chars[:string.length] - strings.append(py_string.decode('utf8')) - with io.open(loc, 'w', encoding='utf8') as file_: - file_.write(SEPARATOR.join(strings)) ->>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd def load(self, loc): with io.open(loc, 'r', encoding='utf8') as file_: From 2153067958de3062abcae7f0b41dd54ec89a09f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:03:12 +1100 Subject: [PATCH 56/62] * Fix use of io in strings.pyx --- spacy/strings.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 4b47f5a82..29a8a47a8 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -131,7 +131,7 @@ cdef class StringStore: cdef Utf8Str* string cdef unicode py_string cdef int i - with io.open(loc, 'w', 'utf8') as file_: + with io.open(loc, 'w', encoding='utf8') as file_: for i in range(1, self.size): string = &self.c[i] py_string = _decode(string) From c12d36d5f4694996b01b959bc16092f9848bde92 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:03:36 +1100 Subject: [PATCH 57/62] * Fix quote marks in lemma_rules --- lang_data/en/lemma_rules.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 30a19be50..1e76436cd 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -32,7 +32,7 @@ "punct": [ ["“", 
"\""], ["”", "\""], - ['\u2018', "'"], - ['\u2019', "'"] + ["\u2018", "'"], + ["\u2019", "'"] ] } From 1521cf25c95273b09a602101c6d2d31496c64d2d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:04:01 +1100 Subject: [PATCH 58/62] * Fix merge problem in test_parse_navigate --- tests/parser/test_parse_navigate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/parser/test_parse_navigate.py b/tests/parser/test_parse_navigate.py index 8c76199f4..1771dbeba 100644 --- a/tests/parser/test_parse_navigate.py +++ b/tests/parser/test_parse_navigate.py @@ -7,11 +7,7 @@ import pytest @pytest.fixture def sun_text(): -<<<<<<< HEAD:tests/parser/test_parse_navigate.py - with codecs.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', 'utf8') as file_: -======= with io.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', encoding='utf8') as file_: ->>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd:tests/test_parse_navigate.py text = file_.read() return text From 8b39feefbed39ef66aae08bc6cf1ecd6d402dd2e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:32:13 +1100 Subject: [PATCH 59/62] * Add dependency post-process rule to ensure spaces are attached to neighbouring tokens, so that they can't be sentence boundaries --- spacy/syntax/arc_eager.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 265018920..07595d4ab 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,7 +9,8 @@ from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse from ..gold cimport GoldParseC -from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE +from ..lexeme cimport Lexeme from libc.stdint cimport uint32_t from libc.string cimport memcpy @@ -380,7 +381,10 @@ cdef class ArcEager(TransitionSystem): cdef int finalize_state(self, StateClass st) nogil: for i in range(st.length): - if st._sent[i].head == 0 and st._sent[i].dep == 0: + # Always attach spaces to the previous word + if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): + st._sent[i].head = -1 if (i >= 1) else 1 + elif st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = self.root_label # If we're not using the Break transition, we segment via root-labelled # arcs between the root words. From 9dd2f25c7438c81f7122f9de28f4d35e1e6b0911 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:53:30 +1100 Subject: [PATCH 60/62] * Fix Issue #131: Force whitespace characters to attach syntactically to previous token, and ensure they cannot serve as stand-alone 'sentence' units. 
--- spacy/syntax/arc_eager.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 07595d4ab..561308928 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -380,10 +380,17 @@ cdef class ArcEager(TransitionSystem): st.fast_forward() cdef int finalize_state(self, StateClass st) nogil: + cdef int i for i in range(st.length): # Always attach spaces to the previous word if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): st._sent[i].head = -1 if (i >= 1) else 1 + if st._sent[i].sent_start and st._sent[i].head == -1: + st._sent[i].sent_start = False + # If we had this space token as the start of a sentence, + # move that sentence start forward one + if (i + 1) < st.length and not st._sent[i+1].sent_start: + st._sent[i+1].sent_start = True elif st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = self.root_label # If we're not using the Break transition, we segment via root-labelled From bdcb8d695c7d012726501f87da9a38faca269024 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:54:06 +1100 Subject: [PATCH 61/62] * Add non-breaking space to specials.json --- lang_data/en/generate_specials.py | 3 +- lang_data/en/specials.json | 67 ++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 9ebd94a52..7c642c7c4 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -341,7 +341,8 @@ hardcoded_specials = { "E.G.": [{"F": "E.G."}], "\n": [{"F": "\n", "pos": "SP"}], "\t": [{"F": "\t", "pos": "SP"}], - " ": [{"F": " ", "pos": "SP"}] + " ": [{"F": " ", "pos": "SP"}], + u"\xa0": [{"F": u"\xa0", "pos": "SP", "L": " "}] } diff --git a/lang_data/en/specials.json b/lang_data/en/specials.json index 20d90e261..4cb44bb74 100644 --- a/lang_data/en/specials.json +++ b/lang_data/en/specials.json @@ -605,9 +605,13 @@ "pos": "VB" } ], - ":P": [ + "11am": [ { - "F": ":P" + "F": "11" + }, + { + "L": "a.m.", + "F": "am" } ], "Shan't": [ @@ -710,6 +714,13 @@ "F": "Kan." } ], + "\u00a0": [ + { + "pos": "SP", + "L": " ", + "F": "\u00a0" + } + ], "there'd": [ { "F": "there" @@ -1624,6 +1635,11 @@ "pos": "RB" } ], + "Wash.": [ + { + "F": "Wash." + } + ], "She's": [ { "L": "-PRON-", @@ -1885,11 +1901,6 @@ "F": "e.g." } ], - ":]": [ - { - "F": ":]" - } - ], "\t": [ { "pos": "SP", @@ -2581,14 +2592,23 @@ "F": "'re" } ], + "3a.m.": [ + { + "F": "3" + }, + { + "F": "a.m." + } + ], "^_^": [ { "F": "^_^" } ], - "I.e.": [ + "\u2018S": [ { - "F": "I.e." + "L": "'s", + "F": "\u2018S" } ], "9p.m.": [ @@ -2719,9 +2739,10 @@ "pos": "VB" } ], - "Wash.": [ + "\u2018s": [ { - "F": "Wash." + "L": "'s", + "F": "\u2018s" } ], "Couldntve": [ @@ -3249,9 +3270,9 @@ "F": "o." } ], - ":')": [ + ":]": [ { - "F": ":')" + "F": ":]" } ], "needn't": [ @@ -3535,13 +3556,9 @@ "F": "am" } ], - "11am": [ + ":P": [ { - "F": "11" - }, - { - "L": "a.m.", - "F": "am" + "F": ":P" } ], "Why'll": [ @@ -4363,12 +4380,9 @@ "pos": "MD" } ], - "3a.m.": [ + "I.e.": [ { - "F": "3" - }, - { - "F": "a.m." + "F": "I.e." } ], "Shes": [ @@ -4406,6 +4420,11 @@ "F": "Apr." } ], + ":')": [ + { + "F": ":')" + } + ], "Conn.": [ { "F": "Conn." From dfbcff2ff1b992c34ebbead140060adb3839d1d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:54:55 +1100 Subject: [PATCH 62/62] * Revert codecs/io change to strings.pyx, as it seemed to cause an error? Will investigate. 
--- spacy/strings.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 29a8a47a8..a4a470158 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,5 +1,5 @@ from __future__ import unicode_literals -import io +import codecs from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 @@ -131,7 +131,7 @@ cdef class StringStore: cdef Utf8Str* string cdef unicode py_string cdef int i - with io.open(loc, 'w', encoding='utf8') as file_: + with codecs.open(loc, 'w', 'utf8') as file_: for i in range(1, self.size): string = &self.c[i] py_string = _decode(string) @@ -140,7 +140,7 @@ cdef class StringStore: file_.write(SEPARATOR) def load(self, loc): - with io.open(loc, 'r', encoding='utf8') as file_: + with codecs.open(loc, 'r', 'utf8') as file_: strings = file_.read().split(SEPARATOR) if strings == ['']: return None
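
Editor's note on patches 59 and 60: the rule added to ArcEager.finalize_state amounts to "whitespace tokens always attach to the previous word and can never start a sentence". The following is a rough pure-Python sketch of that post-processing step, not the actual Cython implementation; the Token objects, is_space flag, relative head offsets and sent_start flag are simplified stand-ins for the parser's TokenC array.

from types import SimpleNamespace

def attach_spaces(tokens):
    # Mirror of the whitespace rule from patches 59-60: 'tokens' is a list of
    # simple objects with is_space, head (a relative offset, -1 = previous
    # token) and sent_start attributes.
    for i, tok in enumerate(tokens):
        if not tok.is_space:
            continue
        # Attach the space to the previous word, or to the next word if the
        # document happens to begin with whitespace.
        tok.head = -1 if i >= 1 else 1
        if tok.sent_start and tok.head == -1:
            # A space may not open a sentence: clear the flag and push the
            # sentence boundary onto the following token instead.
            tok.sent_start = False
            if i + 1 < len(tokens) and not tokens[i + 1].sent_start:
                tokens[i + 1].sent_start = True

# Tiny usage example: a space token that was marked as a sentence start gets
# reattached to the previous word, and the boundary moves to the next token.
toks = [SimpleNamespace(is_space=False, head=0, sent_start=True),
        SimpleNamespace(is_space=True, head=0, sent_start=True),
        SimpleNamespace(is_space=False, head=0, sent_start=False)]
attach_spaces(toks)
assert toks[1].head == -1 and not toks[1].sent_start and toks[2].sent_start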