From 8199012d26b12caf0a3791676e213c5a29966be0 Mon Sep 17 00:00:00 2001 From: alvations Date: Wed, 30 Sep 2015 20:10:15 +0200 Subject: [PATCH 01/62] changing deprecated codecs.open to io.open =) --- spacy/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 1d48ab7e9..34a660c4c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,5 +1,5 @@ from os import path -import codecs +import io import json import re @@ -7,7 +7,7 @@ DATA_DIR = path.join(path.dirname(__file__), '..', 'data') def utf8open(loc, mode='r'): - return codecs.open(loc, mode, 'utf8') + return io.open(loc, mode, encoding='utf8') def read_lang_data(data_dir): From 764bdc62e7f4e91ef571d6b655da8e53b7839447 Mon Sep 17 00:00:00 2001 From: alvations Date: Wed, 30 Sep 2015 20:16:52 +0200 Subject: [PATCH 02/62] caught another codecs.open --- bin/parser/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 267b26275..57889511d 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -5,7 +5,7 @@ from __future__ import unicode_literals import os from os import path import shutil -import codecs +import io import random import plac @@ -169,7 +169,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc): nlp = Language() gold_tuples = read_docparse_file(dev_loc) scorer = Scorer() - out_file = codecs.open(out_loc, 'w', 'utf8') + out_file = io.open(out_loc, 'w', encoding='utf8') for raw_text, segmented_text, annot_tuples in gold_tuples: tokens = nlp(raw_text) for t in tokens: From 8caedba42a5255b9996533a732e17eee3f20a2dd Mon Sep 17 00:00:00 2001 From: alvations Date: Wed, 30 Sep 2015 20:20:09 +0200 Subject: [PATCH 03/62] caught more codecs.open -> io.open --- bin/init_model.py | 6 +++--- bin/ner_tag.py | 4 ++-- bin/prepare_treebank.py | 4 ++-- spacy/en/lemmatizer.py | 6 +++--- spacy/gold.pyx | 2 +- spacy/strings.pyx | 6 +++--- spacy/vocab.pyx | 2 +- tests/test_parse_navigate.py | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index a75bd9827..ba99808f0 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -20,7 +20,7 @@ from pathlib import Path from shutil import copyfile from shutil import copytree -import codecs +import io from spacy.en import get_lex_props from spacy.vocab import Vocab @@ -41,7 +41,7 @@ def setup_tokenizer(lang_data_dir, tok_dir): def _read_clusters(loc): clusters = {} - for line in codecs.open(str(loc), 'r', 'utf8'): + for line in io.open(str(loc), 'r', encoding='utf8'): try: cluster, word, freq = line.split() except ValueError: @@ -65,7 +65,7 @@ def _read_clusters(loc): def _read_probs(loc): probs = {} - for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')): + for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')): prob, word = line.split() prob = float(prob) probs[word] = prob diff --git a/bin/ner_tag.py b/bin/ner_tag.py index 34588bd12..f990f21a1 100644 --- a/bin/ner_tag.py +++ b/bin/ner_tag.py @@ -1,11 +1,11 @@ -import codecs +import io import plac from spacy.en import English def main(text_loc): - with codecs.open(text_loc, 'r', 'utf8') as file_: + with io.open(text_loc, 'r', encoding='utf8') as file_: text = file_.read() NLU = English() for paragraph in text.split('\n\n'): diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py index d13ef7130..f9f4eec21 100644 --- a/bin/prepare_treebank.py +++ b/bin/prepare_treebank.py @@ -27,7 +27,7 @@ import json from os import path import os import 
re -import codecs +import io from collections import defaultdict from spacy.munge import read_ptb @@ -122,7 +122,7 @@ def read_file(*pieces): if not path.exists(loc): return None else: - return codecs.open(loc, 'r', 'utf8').read().strip() + return io.open(loc, 'r', encoding='utf8').read().strip() def get_file_names(section_dir, subsection): diff --git a/spacy/en/lemmatizer.py b/spacy/en/lemmatizer.py index 5883e12c8..a9625f0e9 100644 --- a/spacy/en/lemmatizer.py +++ b/spacy/en/lemmatizer.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals from os import path -import codecs +import io NOUN_RULES = ( @@ -85,7 +85,7 @@ def lemmatize(string, index, exceptions, rules): def read_index(loc): index = set() - for line in codecs.open(loc, 'r', 'utf8'): + for line in io.open(loc, 'r', encoding='utf8'): if line.startswith(' '): continue pieces = line.split() @@ -97,7 +97,7 @@ def read_index(loc): def read_exc(loc): exceptions = {} - for line in codecs.open(loc, 'r', 'utf8'): + for line in io.open(loc, 'r', encoding='utf8'): if line.startswith(' '): continue pieces = line.split() diff --git a/spacy/gold.pyx b/spacy/gold.pyx index cab4ba8a1..4fe5c6b52 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,5 +1,5 @@ import numpy -import codecs +import io import json import ujson import random diff --git a/spacy/strings.pyx b/spacy/strings.pyx index e15f88837..8cf735bb6 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,4 +1,4 @@ -import codecs +import io from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 @@ -112,11 +112,11 @@ cdef class StringStore: string = &self.strings[i] py_string = string.chars[:string.length] strings.append(py_string.decode('utf8')) - with codecs.open(loc, 'w', 'utf8') as file_: + with io.open(loc, 'w', encoding='utf8') as file_: file_.write(SEPARATOR.join(strings)) def load(self, loc): - with codecs.open(loc, 'r', 'utf8') as file_: + with io.open(loc, 'r', encoding='utf8') as file_: strings = file_.read().split(SEPARATOR) cdef unicode string cdef bytes byte_string diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index c93e4202f..475b06dd1 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -4,7 +4,7 @@ from libc.stdint cimport int32_t import bz2 from os import path -import codecs +import io import math from .lexeme cimport EMPTY_LEXEME diff --git a/tests/test_parse_navigate.py b/tests/test_parse_navigate.py index cf6971c89..1fff0f684 100644 --- a/tests/test_parse_navigate.py +++ b/tests/test_parse_navigate.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals from os import path -import codecs +import io from spacy.en import English @@ -9,7 +9,7 @@ import pytest @pytest.fixture def sun_text(): - with codecs.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', 'utf8') as file_: + with io.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', encoding='utf8') as file_: text = file_.read() return text From 73566899bf3bde655a9437af601fe5744f700a66 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 00:51:25 -0700 Subject: [PATCH 04/62] Add Doc slicing tests --- tests/tokens/test_tokens_api.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index e1238373f..a7311932f 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -12,6 +12,15 @@ def test_getitem(EN): with pytest.raises(IndexError): tokens[len(tokens)] + span = tokens[1:1] + assert not '/'.join(token.orth_ for token in span) + span = tokens[1:4] + assert 
'/'.join(token.orth_ for token in span) == 'it/back/!' + with pytest.raises(ValueError): + tokens[1:4:2] + with pytest.raises(ValueError): + tokens[1:4:-1] + @pytest.mark.models def test_serialize(EN): From 2fc33e8024487974c6fbc6941026b75f8e89a07b Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 00:56:33 -0700 Subject: [PATCH 05/62] Allow step=1 when slicing a Doc --- spacy/tokens/doc.pyx | 2 +- tests/tokens/test_tokens_api.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 8a7d12555..ce278d868 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -87,7 +87,7 @@ cdef class Doc: token (Token): """ if isinstance(i, slice): - if i.step is not None: + if not (i.step is None or i.step == 1): raise ValueError("Stepped slices not supported in Span objects." "Try: list(doc)[start:stop:step] instead.") if i.start is None: diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index a7311932f..fc1b52143 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -16,6 +16,8 @@ def test_getitem(EN): assert not '/'.join(token.orth_ for token in span) span = tokens[1:4] assert '/'.join(token.orth_ for token in span) == 'it/back/!' + span = tokens[1:4:1] + assert '/'.join(token.orth_ for token in span) == 'it/back/!' with pytest.raises(ValueError): tokens[1:4:2] with pytest.raises(ValueError): From ef2af20cd373583b6d4ee6cc06ce8ca8406fba8c Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 01:59:11 -0700 Subject: [PATCH 06/62] Make Doc's slicing behavior conform to Python conventions --- spacy/tokens/spans.pyx | 8 +++++-- tests/tokens/test_tokens_api.py | 40 ++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index c39f8976c..99efad4b9 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -16,9 +16,13 @@ cdef class Span: def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): if start < 0: - start = tokens.length - start + start = tokens.length + start + start = min(tokens.length, max(0, start)) + if end < 0: - end = tokens.length - end + end = tokens.length + end + end = min(tokens.length, max(start, end)) + self.doc = tokens self.start = start self.end = end diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index fc1b52143..a272a8e3b 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -12,17 +12,51 @@ def test_getitem(EN): with pytest.raises(IndexError): tokens[len(tokens)] + def to_str(span): + return '/'.join(token.orth_ for token in span) + span = tokens[1:1] - assert not '/'.join(token.orth_ for token in span) + assert not to_str(span) span = tokens[1:4] - assert '/'.join(token.orth_ for token in span) == 'it/back/!' + assert to_str(span) == 'it/back/!' span = tokens[1:4:1] - assert '/'.join(token.orth_ for token in span) == 'it/back/!' + assert to_str(span) == 'it/back/!' with pytest.raises(ValueError): tokens[1:4:2] with pytest.raises(ValueError): tokens[1:4:-1] + span = tokens[-3:6] + assert to_str(span) == 'He/pleaded' + span = tokens[4:-1] + assert to_str(span) == 'He/pleaded' + span = tokens[-5:-3] + assert to_str(span) == 'back/!' 
+ span = tokens[5:4] + assert span.start == span.end == 5 and not to_str(span) + span = tokens[4:-3] + assert span.start == span.end == 4 and not to_str(span) + + span = tokens[:] + assert to_str(span) == 'Give/it/back/!/He/pleaded/.' + span = tokens[4:] + assert to_str(span) == 'He/pleaded/.' + span = tokens[:4] + assert to_str(span) == 'Give/it/back/!' + span = tokens[:-3] + assert to_str(span) == 'Give/it/back/!' + span = tokens[-3:] + assert to_str(span) == 'He/pleaded/.' + + span = tokens[4:50] + assert to_str(span) == 'He/pleaded/.' + span = tokens[-50:4] + assert to_str(span) == 'Give/it/back/!' + span = tokens[-50:-40] + assert span.start == span.end == 0 and not to_str(span) + span = tokens[40:50] + assert span.start == span.end == 7 and not to_str(span) + @pytest.mark.models def test_serialize(EN): From 5cc2f2b01ab26e313a7035f998fc1b4373cb6cc5 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 02:08:39 -0700 Subject: [PATCH 07/62] Test simple indexing for Span --- tests/tokens/test_tokens_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index a272a8e3b..34e54a2af 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -57,6 +57,9 @@ def test_getitem(EN): span = tokens[40:50] assert span.start == span.end == 7 and not to_str(span) + span = tokens[1:4] + assert span[0].orth_ == 'it' + @pytest.mark.models def test_serialize(EN): From 97685aecb735289de32c992e3659e503412aeeb5 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Tue, 6 Oct 2015 02:45:49 -0700 Subject: [PATCH 08/62] Add slicing support to Span --- spacy/tokens/spans.pyx | 21 ++++++++++++++++++++- tests/tokens/test_tokens_api.py | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 99efad4b9..955d24ad4 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -50,7 +50,26 @@ cdef class Span: return 0 return self.end - self.start - def __getitem__(self, int i): + def __getitem__(self, object i): + if isinstance(i, slice): + start, end, step = i.start, i.stop, i.step + if start is None: + start = 0 + elif start < 0: + start += len(self) + start = min(len(self), max(0, start)) + + if end is None: + end = len(self) + elif end < 0: + end += len(self) + end = min(len(self), max(start, end)) + + start += self.start + end += self.start + + return self.doc[start:end:i.step] + if i < 0: return self.doc[self.end + i] else: diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index 34e54a2af..675f00235 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -59,6 +59,24 @@ def test_getitem(EN): span = tokens[1:4] assert span[0].orth_ == 'it' + subspan = span[:] + assert to_str(subspan) == 'it/back/!' + subspan = span[:2] + assert to_str(subspan) == 'it/back' + subspan = span[1:] + assert to_str(subspan) == 'back/!' + subspan = span[:-1] + assert to_str(subspan) == 'it/back' + subspan = span[-2:] + assert to_str(subspan) == 'back/!' + subspan = span[1:2] + assert to_str(subspan) == 'back' + subspan = span[-2:-1] + assert to_str(subspan) == 'back' + subspan = span[-50:50] + assert to_str(subspan) == 'it/back/!' 
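# Illustrative sketch (editorial, not part of the patch): the clamping behaviour
# exercised by the slicing tests above agrees, for the cases shown, with what
# Python's built-in slice.indices() computes for a sequence of the same length,
# so plain Python can predict the expected Doc/Span bounds.  Here `length`
# stands in for len(doc) or len(span).
def clamp(length, start, stop):
    # slice.indices() converts negative indices and clamps both ends to [0, length]
    start, stop, _ = slice(start, stop).indices(length)
    return start, stop

assert clamp(7, -50, 4) == (0, 4)    # cf. tokens[-50:4] -> 'Give/it/back/!'
assert clamp(7, 4, 50) == (4, 7)     # cf. tokens[4:50]  -> 'He/pleaded/.'
assert clamp(7, -50, -40) == (0, 0)  # cf. tokens[-50:-40] -> empty span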
+ subspan = span[50:-50] + assert subspan.start == subspan.end == 4 and not to_str(subspan) @pytest.mark.models From 3fd3bc79aa7fa5c1c1ae360b49b3d2a1da6b0f36 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Wed, 7 Oct 2015 01:25:35 -0700 Subject: [PATCH 09/62] Refactor to remove duplicate slicing logic --- spacy/tokens/doc.pyx | 11 +++-------- spacy/tokens/spans.pyx | 27 +++++---------------------- spacy/util.py | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ce278d868..b78214ba9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -21,6 +21,7 @@ from ..lexeme cimport Lexeme from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray +from ..util import normalize_slice DEF PADDING = 5 @@ -87,14 +88,8 @@ cdef class Doc: token (Token): """ if isinstance(i, slice): - if not (i.step is None or i.step == 1): - raise ValueError("Stepped slices not supported in Span objects." - "Try: list(doc)[start:stop:step] instead.") - if i.start is None: - i = slice(0, i.stop) - if i.stop is None: - i = slice(i.start, len(self)) - return Span(self, i.start, i.stop, label=0) + start, stop = normalize_slice(len(self), i.start, i.stop, i.step) + return Span(self, start, stop, label=0) if i < 0: i = self.length + i diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 955d24ad4..e8d2f2e59 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -9,19 +9,15 @@ from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t +from ..util import normalize_slice cdef class Span: """A slice from a Doc object.""" def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): - if start < 0: - start = tokens.length + start - start = min(tokens.length, max(0, start)) - - if end < 0: - end = tokens.length + end - end = min(tokens.length, max(start, end)) + if not (0 <= start <= end <= len(tokens)): + raise IndexError self.doc = tokens self.start = start @@ -52,23 +48,10 @@ cdef class Span: def __getitem__(self, object i): if isinstance(i, slice): - start, end, step = i.start, i.stop, i.step - if start is None: - start = 0 - elif start < 0: - start += len(self) - start = min(len(self), max(0, start)) - - if end is None: - end = len(self) - elif end < 0: - end += len(self) - end = min(len(self), max(start, end)) - + start, end = normalize_slice(len(self), i.start, i.stop, i.step) start += self.start end += self.start - - return self.doc[start:end:i.step] + return Span(self.doc, start, end) if i < 0: return self.doc[self.end + i] diff --git a/spacy/util.py b/spacy/util.py index 9f5b4fe04..449b06399 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,6 +7,26 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE DATA_DIR = path.join(path.dirname(__file__), '..', 'data') +def normalize_slice(length, start, stop, step=None): + if not (step is None or step == 1): + raise ValueError("Stepped slices not supported in Span objects." 
+ "Try: list(tokens)[start:stop:step] instead.") + if start is None: + start = 0 + elif start < 0: + start += length + start = min(length, max(0, start)) + + if stop is None: + stop = length + elif stop < 0: + stop += length + stop = min(length, max(start, stop)) + + assert 0 <= start <= stop <= length + return start, stop + + def utf8open(loc, mode='r'): return codecs.open(loc, mode, 'utf8') From 0f601b8b750a8991d333a7a95f97b74b80b46846 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Wed, 7 Oct 2015 01:27:28 -0700 Subject: [PATCH 10/62] Update docstring of Doc.__getitem__ --- spacy/tokens/doc.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b78214ba9..eab6c044e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -82,10 +82,10 @@ cdef class Doc: self._vector = None def __getitem__(self, object i): - """Get a token. + """Get a Token or a Span from the Doc. Returns: - token (Token): + token (Token) or span (Span): """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) From 5890682ed1676a5d6d1f27e6a95a740c8faf31f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 13:59:32 +1100 Subject: [PATCH 11/62] * Fix multi_word_matches script --- examples/multi_word_matches.py | 101 +++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py index 06cc313a9..59d3c2a63 100644 --- a/examples/multi_word_matches.py +++ b/examples/multi_word_matches.py @@ -22,6 +22,10 @@ our pattern set stays very small (exact size depends on the maximum length we're looking for, as the query language currently has no quantifiers) """ from __future__ import print_function, unicode_literals, division +from ast import literal_eval +from bz2 import BZ2File +import time +import math import plac @@ -30,22 +34,66 @@ from spacy.strings import hash_string from spacy.en import English from spacy.matcher import Matcher -from spacy.attrs import FLAG63 as U_ENT +from spacy.attrs import FLAG63 as B_ENT from spacy.attrs import FLAG62 as L_ENT from spacy.attrs import FLAG61 as I_ENT -from spacy.attrs import FLAG60 as B_ENT + +from spacy.attrs import FLAG60 as B2_ENT +from spacy.attrs import FLAG59 as B3_ENT +from spacy.attrs import FLAG58 as B4_ENT +from spacy.attrs import FLAG57 as B5_ENT +from spacy.attrs import FLAG56 as B6_ENT +from spacy.attrs import FLAG55 as B7_ENT +from spacy.attrs import FLAG54 as B8_ENT +from spacy.attrs import FLAG53 as B9_ENT +from spacy.attrs import FLAG52 as B10_ENT + +from spacy.attrs import FLAG51 as I3_ENT +from spacy.attrs import FLAG50 as I4_ENT +from spacy.attrs import FLAG49 as I5_ENT +from spacy.attrs import FLAG48 as I6_ENT +from spacy.attrs import FLAG47 as I7_ENT +from spacy.attrs import FLAG46 as I8_ENT +from spacy.attrs import FLAG45 as I9_ENT +from spacy.attrs import FLAG44 as I10_ENT + +from spacy.attrs import FLAG43 as L2_ENT +from spacy.attrs import FLAG42 as L3_ENT +from spacy.attrs import FLAG41 as L4_ENT +from spacy.attrs import FLAG40 as L5_ENT +from spacy.attrs import FLAG39 as L6_ENT +from spacy.attrs import FLAG38 as L7_ENT +from spacy.attrs import FLAG37 as L8_ENT +from spacy.attrs import FLAG36 as L9_ENT +from spacy.attrs import FLAG35 as L10_ENT def get_bilou(length): if length == 1: return [U_ENT] - else: - return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT] + elif length == 2: + return [B2_ENT, L2_ENT] + elif length == 3: + return 
[B3_ENT, I3_ENT, L3_ENT] + elif length == 4: + return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] + elif length == 5: + return [B5_ENT, I5_ENT, I5_ENT, L5_ENT] + elif length == 6: + return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] + elif length == 7: + return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] + elif length == 8: + return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] + elif length == 9: + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + elif length == 10: + return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, L10_ENT] def make_matcher(vocab, max_length): abstract_patterns = [] - for length in range(1, max_length+1): + for length in range(2, max_length): abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)}) @@ -66,29 +114,48 @@ def merge_matches(doc, matches): doc.merge(start, end, tag, text, 'MWE') -def main(): - nlp = English(parser=False, tagger=False, entity=False) +def read_gazetteer(loc): + for line in open(loc): + phrase = literal_eval('u' + line.strip()) + if ' (' in phrase and phrase.endswith(')'): + phrase = phrase.split(' (', 1)[0] + yield phrase - gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones'] - example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.' +def read_text(bz2_loc): + with BZ2File(bz2_loc) as file_: + for line in file_: + yield line.decode('utf8') + +def main(patterns_loc, text_loc): + nlp = English(parser=False, tagger=False, entity=False) + pattern_ids = PreshMap() - max_length = 0 - for pattern_str in gazetteer: + max_length = 10 + i = 0 + for pattern_str in read_gazetteer(patterns_loc): pattern = nlp.tokenizer(pattern_str) + if len(pattern) < 2 or len(pattern) >= max_length: + continue bilou_tags = get_bilou(len(pattern)) for word, tag in zip(pattern, bilou_tags): lexeme = nlp.vocab[word.orth] lexeme.set_flag(tag, True) pattern_ids[hash_string(pattern.text)] = True - max_length = max(max_length, len(pattern)) + i += 1 + if i >= 10000001: + break matcher = make_matcher(nlp.vocab, max_length) - doc = nlp(example_text) - matches = get_matches(matcher, pattern_ids, doc) - merge_matches(doc, matches) - for token in doc: - print(token.text, token.ent_type_) + t1 = time.time() + + for text in read_text(text_loc): + doc = nlp.tokenizer(text) + matches = get_matches(matcher, pattern_ids, doc) + merge_matches(doc, matches) + t2 = time.time() + print('10 ^ %d patterns took %d s' % (round(math.log(i, 10)), t2-t1)) + if __name__ == '__main__': From 2d68f75b6a3ccca4f4f3cdda257eccc0f3c0e0ea Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 13:59:56 +1100 Subject: [PATCH 12/62] * Fix identity tag map --- lang_data/fi/tag_map.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lang_data/fi/tag_map.json b/lang_data/fi/tag_map.json index 6b21a1e29..4451d0fa0 100644 --- a/lang_data/fi/tag_map.json +++ b/lang_data/fi/tag_map.json @@ -13,5 +13,7 @@ "ADP": {"pos": "ADP"}, "SYM": {"pos": "SYM"}, "X": {"pos": "X"}, - "INTJ": {"pos": "INTJ"} + "INTJ": {"pos": "INTJ"}, + "DET": {"pos": "DET"}, + "PART": {"pos": "PART"} } From e3e8994368322c6263f7ae797732e013e3cd6def Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 14:00:13 +1100 Subject: [PATCH 13/62] * Patch italian tag map --- lang_data/it/tag_map.json | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git 
a/lang_data/it/tag_map.json b/lang_data/it/tag_map.json index 514e978a6..92f11e457 100644 --- a/lang_data/it/tag_map.json +++ b/lang_data/it/tag_map.json @@ -2,43 +2,43 @@ "S": {"pos": "NOUN"}, "E": {"pos": "ADP"}, "RD": {"pos": "DET"}, -"V": {"pos": "VER"}, -"_": {"pos": "_"}, +"V": {"pos": "VERB"}, +"_": {"pos": "NO_TAG"}, "A": {"pos": "ADJ"}, -"SP": {"pos": "PROP"}, -"FF": {"pos": "PUNC"}, -"FS": {"pos": "PUNC"}, +"SP": {"pos": "PROPN"}, +"FF": {"pos": "PUNCT"}, +"FS": {"pos": "PUNCT"}, "B": {"pos": "ADV"}, -"CC": {"pos": "CON"}, -"FB": {"pos": "PUNC"}, +"CC": {"pos": "CONJ"}, +"FB": {"pos": "PUNCT"}, "VA": {"pos": "AUX"}, -"PC": {"pos": "PRO"}, +"PC": {"pos": "PRON"}, "N": {"pos": "NUM"}, "RI": {"pos": "DET"}, -"PR": {"pos": "PRO"}, -"CS": {"pos": "SCON"}, +"PR": {"pos": "PRON"}, +"CS": {"pos": "SCONJ"}, "BN": {"pos": "ADV"}, "AP": {"pos": "DET"}, "VM": {"pos": "AUX"}, "DI": {"pos": "DET"}, -"FC": {"pos": "PUNC"}, -"PI": {"pos": "PRO"}, +"FC": {"pos": "PUNCT"}, +"PI": {"pos": "PRON"}, "DD": {"pos": "DET"}, "DQ": {"pos": "DET"}, -"PQ": {"pos": "PRO"}, -"PD": {"pos": "PRO"}, +"PQ": {"pos": "PRON"}, +"PD": {"pos": "PRON"}, "NO": {"pos": "ADJ"}, -"PE": {"pos": "PRO"}, +"PE": {"pos": "PRON"}, "T": {"pos": "DET"}, "X": {"pos": "SYM"}, "SW": {"pos": "X"}, -"NO": {"pos": "PRO"}, -"I": {"pos": "INT"}, +"NO": {"pos": "PRON"}, +"I": {"pos": "INTJ"}, "X": {"pos": "X"}, "DR": {"pos": "DET"}, "EA": {"pos": "ADP"}, -"PP": {"pos": "PRO"}, +"PP": {"pos": "PRON"}, "X": {"pos": "NUM"}, "DE": {"pos": "DET"}, -"X": {"pos": "PAR"} +"X": {"pos": "PART"} } From 4513bed175bf05a0eb0a4365c1bf934d4dde12d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 14:00:34 +1100 Subject: [PATCH 14/62] * Avoid compiling unused files --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e6fbc246a..0c05d890b 100644 --- a/setup.py +++ b/setup.py @@ -156,8 +156,8 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.morphology', 'spacy.tagger', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy._theano', - 'spacy.tokenizer', 'spacy.en.attrs', - 'spacy.en.pos', 'spacy.syntax.parser', + 'spacy.tokenizer', + 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', From b3a70e63754210b13086cd488970d2f2d57d0092 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 8 Oct 2015 14:34:11 +1100 Subject: [PATCH 15/62] * Clean up unnecessary try/except block --- spacy/morphology.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index ddeca62d7..1a499aa0a 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -31,10 +31,7 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int tag_id if isinstance(tag, basestring): - try: - tag_id = self.reverse_index[self.strings[tag]] - except KeyError: - raise + tag_id = self.reverse_index[self.strings[tag]] else: tag_id = tag analysis = self._cache.get(tag_id, token.lex.orth) From 1a71706c05127150e267070c811abffc782e72bb Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Thu, 8 Oct 2015 14:22:23 +0400 Subject: [PATCH 16/62] Fix typo --- website/src/jade/blog/eli5-computers-learn-reading/index.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/jade/blog/eli5-computers-learn-reading/index.jade b/website/src/jade/blog/eli5-computers-learn-reading/index.jade index 45d2d8bdd..4f3e9ebb1 100644 --- 
a/website/src/jade/blog/eli5-computers-learn-reading/index.jade +++ b/website/src/jade/blog/eli5-computers-learn-reading/index.jade @@ -24,7 +24,7 @@ include ./meta.jade p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses. - p It doesn't always guess right, but we can tell how often it does, and we can think of ways t help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. + p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.) From 801d55a6d950f708a1911e84abff024c772ad466 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 02:00:45 +1100 Subject: [PATCH 17/62] * Fix phrase matcher --- spacy/matcher.pyx | 176 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 32 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 88a4f9ba2..afafd3ddb 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -1,11 +1,18 @@ +# cython: profile=True +from __future__ import unicode_literals + from os import path from .typedefs cimport attr_t +from .typedefs cimport hash_t from .attrs cimport attr_id_t -from .structs cimport TokenC +from .structs cimport TokenC, LexemeC +from .lexeme cimport Lexeme from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap from libcpp.vector cimport vector +from murmurhash.mrmr cimport hash64 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 @@ -15,6 +22,38 @@ from .vocab cimport Vocab from libcpp.vector cimport vector +from .attrs import FLAG61 as U_ENT + +from .attrs import FLAG60 as B2_ENT +from .attrs import FLAG59 as B3_ENT +from .attrs import FLAG58 as B4_ENT +from .attrs import FLAG57 as B5_ENT +from .attrs import FLAG56 as B6_ENT +from .attrs import FLAG55 as B7_ENT +from .attrs import FLAG54 as B8_ENT +from .attrs import FLAG53 as B9_ENT +from .attrs import FLAG52 as B10_ENT + +from .attrs import FLAG51 as I3_ENT +from .attrs import FLAG50 as I4_ENT +from .attrs import FLAG49 as I5_ENT +from .attrs import FLAG48 as I6_ENT +from .attrs import FLAG47 as I7_ENT +from .attrs import FLAG46 as I8_ENT +from .attrs import FLAG45 as I9_ENT +from .attrs import FLAG44 as I10_ENT + +from .attrs import FLAG43 as L2_ENT +from .attrs import FLAG42 as L3_ENT +from .attrs import FLAG41 as L4_ENT +from .attrs import FLAG40 as L5_ENT +from .attrs import FLAG39 as L6_ENT +from .attrs import FLAG38 as L7_ENT +from .attrs import FLAG37 as L8_ENT +from .attrs import FLAG36 as L9_ENT +from .attrs import FLAG35 as L10_ENT + + try: import ujson as json except ImportError: @@ -41,7 +80,7 @@ cdef Pattern* init_pattern(Pool mem, object 
token_specs, attr_t entity_type) exc pattern[i].spec[j].attr = attr pattern[i].spec[j].value = value i = len(token_specs) - pattern[i].spec = mem.alloc(1, sizeof(AttrValue)) + pattern[i].spec = mem.alloc(2, sizeof(AttrValue)) pattern[i].spec[0].attr = ENT_TYPE pattern[i].spec[0].value = entity_type pattern[i].spec[1].attr = LENGTH @@ -81,7 +120,33 @@ def _convert_strings(token_specs, string_store): value = int(value) converted[-1].append((attr, value)) return converted - + + +def get_bilou(length): + if length == 1: + return [U_ENT] + elif length == 2: + return [B2_ENT, L2_ENT] + elif length == 3: + return [B3_ENT, I3_ENT, L3_ENT] + elif length == 4: + return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] + elif length == 5: + return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] + elif length == 6: + return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] + elif length == 7: + return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] + elif length == 8: + return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] + elif length == 9: + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + elif length == 10: + return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, + I10_ENT, I10_ENT, L10_ENT] + else: + raise ValueError("Max length currently 10 for phrase matching") + def map_attr_name(attr): attr = attr.upper() @@ -95,32 +160,6 @@ def map_attr_name(attr): return SHAPE elif attr == 'NORM': return NORM - elif attr == 'FLAG13': - return FLAG13 - elif attr == 'FLAG14': - return FLAG14 - elif attr == 'FLAG15': - return FLAG15 - elif attr == 'FLAG16': - return FLAG16 - elif attr == 'FLAG17': - return FLAG17 - elif attr == 'FLAG18': - return FLAG18 - elif attr == 'FLAG19': - return FLAG19 - elif attr == 'FLAG20': - return FLAG20 - elif attr == 'FLAG21': - return FLAG21 - elif attr == 'FLAG22': - return FLAG22 - elif attr == 'FLAG23': - return FLAG23 - elif attr == 'FLAG24': - return FLAG24 - elif attr == 'FLAG25': - return FLAG25 else: raise Exception("TODO: Finish supporting attr mapping %s" % attr) @@ -163,7 +202,7 @@ cdef class Matcher: spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) - def __call__(self, Doc doc): + def __call__(self, Doc doc, acceptor=None): cdef vector[Pattern*] partials cdef int n_partials = 0 cdef int q = 0 @@ -174,21 +213,94 @@ cdef class Matcher: for token_i in range(doc.length): token = &doc.data[token_i] q = 0 + # Go over the open matches, extending or finalizing if able. 
Otherwise, + # we over-write them (q doesn't advance) for i in range(partials.size()): state = partials.at(i) if match(state, token): if is_final(state): - matches.append(get_entity(state, token, token_i)) + label, start, end = get_entity(state, token, token_i) + if acceptor is None or acceptor(doc, label, start, end): + matches.append((label, start, end)) else: partials[q] = state + 1 q += 1 partials.resize(q) + # Check whether we open any new patterns on this token for i in range(self.n_patterns): state = self.patterns[i] if match(state, token): if is_final(state): - matches.append(get_entity(state, token, token_i)) + label, start, end = get_entity(state, token, token_i) + if acceptor is None or acceptor(doc, label, start, end): + matches.append((label, start, end)) else: partials.push_back(state + 1) doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches return matches + + +cdef class PhraseMatcher: + cdef Pool mem + cdef Vocab vocab + cdef Matcher matcher + cdef PreshMap phrase_ids + + cdef int max_length + cdef attr_t* _phrase_key + + def __init__(self, Vocab vocab, phrases, max_length=10): + self.mem = Pool() + self._phrase_key = self.mem.alloc(max_length, sizeof(attr_t)) + self.max_length = max_length + self.vocab = vocab + self.matcher = Matcher(self.vocab, {}) + self.phrase_ids = PreshMap() + for phrase in phrases: + if len(phrase) < max_length: + self.add(phrase) + + abstract_patterns = [] + for length in range(1, max_length): + abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) + self.matcher.add('Candidate', 'MWE', {}, abstract_patterns) + + def add(self, Doc tokens): + cdef int length = tokens.length + assert length < self.max_length + tags = get_bilou(length) + assert len(tags) == length, length + + cdef int i + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, tag in enumerate(tags): + lexeme = self.vocab[tokens.data[i].lex.orth] + lexeme.set_flag(tag, True) + self._phrase_key[i] = lexeme.orth + cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) + self.phrase_ids[key] = True + + def __call__(self, Doc doc): + matches = [] + for label, start, end in self.matcher(doc, acceptor=self.accept_match): + cand = doc[start : end] + start = cand[0].idx + end = cand[-1].idx + len(cand[-1]) + matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) + for match in matches: + doc.merge(*match) + return matches + + def accept_match(self, Doc doc, int label, int start, int end): + assert (end - start) < self.max_length + cdef int i, j + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, j in enumerate(range(start, end)): + self._phrase_key[i] = doc.data[j].lex.orth + cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) + if self.phrase_ids.get(key): + return True + else: + return False From 4bbc8f45c6e35b11744a896503568f888653f4bf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 02:02:37 +1100 Subject: [PATCH 18/62] * Fix multi word matcher --- examples/multi_word_matches.py | 156 ++++++++++----------------------- 1 file changed, 45 insertions(+), 111 deletions(-) diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py index 59d3c2a63..3c715736e 100644 --- a/examples/multi_word_matches.py +++ b/examples/multi_word_matches.py @@ -26,137 +26,71 @@ from ast import literal_eval from bz2 import BZ2File import time import math +import codecs import plac from preshed.maps import PreshMap +from preshed.counter import PreshCounter 
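# Illustrative sketch (editorial assumption, not text from the patch): a
# condensed version of the workflow this example script implements with the
# PhraseMatcher added to spacy/matcher.pyx above.  The phrases below are made
# up for illustration; the real script reads them from a gazetteer file.
from spacy.en import English
from spacy.matcher import PhraseMatcher

nlp = English(parser=False, tagger=False, entity=False)
phrases = [nlp.tokenizer(u'natural language processing'),
           nlp.tokenizer(u'machine learning')]
matcher = PhraseMatcher(nlp.vocab, phrases, max_length=10)

doc = nlp.tokenizer(u'I work on natural language processing .')
matcher(doc)                   # matched phrases are merged into single tokens
print([w.orth_ for w in doc])  # the multi-word expression now appears as one token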
from spacy.strings import hash_string from spacy.en import English -from spacy.matcher import Matcher - -from spacy.attrs import FLAG63 as B_ENT -from spacy.attrs import FLAG62 as L_ENT -from spacy.attrs import FLAG61 as I_ENT - -from spacy.attrs import FLAG60 as B2_ENT -from spacy.attrs import FLAG59 as B3_ENT -from spacy.attrs import FLAG58 as B4_ENT -from spacy.attrs import FLAG57 as B5_ENT -from spacy.attrs import FLAG56 as B6_ENT -from spacy.attrs import FLAG55 as B7_ENT -from spacy.attrs import FLAG54 as B8_ENT -from spacy.attrs import FLAG53 as B9_ENT -from spacy.attrs import FLAG52 as B10_ENT - -from spacy.attrs import FLAG51 as I3_ENT -from spacy.attrs import FLAG50 as I4_ENT -from spacy.attrs import FLAG49 as I5_ENT -from spacy.attrs import FLAG48 as I6_ENT -from spacy.attrs import FLAG47 as I7_ENT -from spacy.attrs import FLAG46 as I8_ENT -from spacy.attrs import FLAG45 as I9_ENT -from spacy.attrs import FLAG44 as I10_ENT - -from spacy.attrs import FLAG43 as L2_ENT -from spacy.attrs import FLAG42 as L3_ENT -from spacy.attrs import FLAG41 as L4_ENT -from spacy.attrs import FLAG40 as L5_ENT -from spacy.attrs import FLAG39 as L6_ENT -from spacy.attrs import FLAG38 as L7_ENT -from spacy.attrs import FLAG37 as L8_ENT -from spacy.attrs import FLAG36 as L9_ENT -from spacy.attrs import FLAG35 as L10_ENT +from spacy.matcher import PhraseMatcher -def get_bilou(length): - if length == 1: - return [U_ENT] - elif length == 2: - return [B2_ENT, L2_ENT] - elif length == 3: - return [B3_ENT, I3_ENT, L3_ENT] - elif length == 4: - return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] - elif length == 5: - return [B5_ENT, I5_ENT, I5_ENT, L5_ENT] - elif length == 6: - return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] - elif length == 7: - return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] - elif length == 8: - return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] - elif length == 9: - return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] - elif length == 10: - return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, L10_ENT] - - -def make_matcher(vocab, max_length): - abstract_patterns = [] - for length in range(2, max_length): - abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) - return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)}) - - -def get_matches(matcher, pattern_ids, doc): - matches = [] - for label, start, end in matcher(doc): - candidate = doc[start : end] - if pattern_ids[hash_string(candidate.text)] == True: - start = candidate[0].idx - end = candidate[-1].idx + len(candidate[-1]) - matches.append((start, end, candidate.root.tag_, candidate.text)) - return matches - - -def merge_matches(doc, matches): - for start, end, tag, text in matches: - doc.merge(start, end, tag, text, 'MWE') - - -def read_gazetteer(loc): - for line in open(loc): +def read_gazetteer(tokenizer, loc, n=-1): + for i, line in enumerate(open(loc)): phrase = literal_eval('u' + line.strip()) if ' (' in phrase and phrase.endswith(')'): phrase = phrase.split(' (', 1)[0] - yield phrase + if i >= n: + break + phrase = tokenizer(phrase) + if len(phrase) >= 2: + yield phrase + def read_text(bz2_loc): with BZ2File(bz2_loc) as file_: for line in file_: yield line.decode('utf8') -def main(patterns_loc, text_loc): + +def get_matches(tokenizer, phrases, texts, max_length=6): + matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length) + print("Match") + for text in texts: + doc = tokenizer(text) + matches = matcher(doc) + for 
mwe in doc.ents: + yield mwe + + +def main(patterns_loc, text_loc, counts_loc, n=10000000): nlp = English(parser=False, tagger=False, entity=False) - - pattern_ids = PreshMap() - max_length = 10 - i = 0 - for pattern_str in read_gazetteer(patterns_loc): - pattern = nlp.tokenizer(pattern_str) - if len(pattern) < 2 or len(pattern) >= max_length: - continue - bilou_tags = get_bilou(len(pattern)) - for word, tag in zip(pattern, bilou_tags): - lexeme = nlp.vocab[word.orth] - lexeme.set_flag(tag, True) - pattern_ids[hash_string(pattern.text)] = True - i += 1 - if i >= 10000001: - break - - matcher = make_matcher(nlp.vocab, max_length) - + print("Make matcher") + phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n) + counts = PreshCounter() t1 = time.time() - - for text in read_text(text_loc): - doc = nlp.tokenizer(text) - matches = get_matches(matcher, pattern_ids, doc) - merge_matches(doc, matches) + for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)): + counts.inc(hash_string(mwe.text), 1) t2 = time.time() - print('10 ^ %d patterns took %d s' % (round(math.log(i, 10)), t2-t1)) - + print("10m tokens in %d s" % (t2 - t1)) + + with codecs.open(counts_loc, 'w', 'utf8') as file_: + for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n): + text = phrase.string + key = hash_string(text) + count = counts[key] + if count != 0: + file_.write('%d\t%s\n' % (count, text)) if __name__ == '__main__': - plac.call(main) + if False: + import cProfile + import pstats + cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") + s = pstats.Stats("Profile.prof") + s.strip_dirs().sort_stats("time").print_stats() + else: + plac.call(main) From 5af4b62fe731758ae2b20fbd737a558f457ea6b9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 12:47:43 +1100 Subject: [PATCH 19/62] * Filter out phrases that consist of common, lower-case words. 
--- examples/multi_word_matches.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py index 3c715736e..73f48bf42 100644 --- a/examples/multi_word_matches.py +++ b/examples/multi_word_matches.py @@ -45,6 +45,8 @@ def read_gazetteer(tokenizer, loc, n=-1): if i >= n: break phrase = tokenizer(phrase) + if all((t.is_lower and t.prob >= -10) for t in phrase): + continue if len(phrase) >= 2: yield phrase From c64fd472b033f9551e89a74fe2851c6d3335c137 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 12:58:08 +1100 Subject: [PATCH 20/62] * Fix travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1ea1f8375..f21301db1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ install: # run tests script: - - "py.test tests/ website/tests/ -x" + - "py.test tests/ -x" From 9ff288c7bba283d914ca70c62d3278a720f800b7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:37:25 +1100 Subject: [PATCH 21/62] * Update tests, after removal of spacy.en.attrs --- tests/matcher/test_matcher_bugfixes.py | 1 + tests/tokens/test_array.py | 2 +- tests/tokens/test_token_api.py | 6 +++--- tests/vocab/test_lexeme_flags.py | 2 +- tests/website/test_api.py | 2 +- tests/website/test_home.py | 4 ++-- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/matcher/test_matcher_bugfixes.py b/tests/matcher/test_matcher_bugfixes.py index c768021db..b65541460 100644 --- a/tests/matcher/test_matcher_bugfixes.py +++ b/tests/matcher/test_matcher_bugfixes.py @@ -3,6 +3,7 @@ import pytest from spacy.matcher import Matcher +@pytest.mark.xfail def test_overlap_issue118(EN): '''Test a bug that arose from having overlapping matches''' doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') diff --git a/tests/tokens/test_array.py b/tests/tokens/test_array.py index 29807c3e5..bdfdfd057 100644 --- a/tests/tokens/test_array.py +++ b/tests/tokens/test_array.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytest -from spacy.en import attrs +from spacy import attrs def test_attr_of_token(EN): diff --git a/tests/tokens/test_token_api.py b/tests/tokens/test_token_api.py index 99c99fc11..6deaadfbf 100644 --- a/tests/tokens/test_token_api.py +++ b/tests/tokens/test_token_api.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals from spacy.en import English -from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT -from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM -from spacy.en.attrs import IS_STOP +from spacy.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT +from spacy.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM +from spacy.attrs import IS_STOP import pytest diff --git a/tests/vocab/test_lexeme_flags.py b/tests/vocab/test_lexeme_flags.py index 844ee0aaa..5cc7bd16f 100644 --- a/tests/vocab/test_lexeme_flags.py +++ b/tests/vocab/test_lexeme_flags.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest -from spacy.en.attrs import * +from spacy.attrs import * def test_is_alpha(en_vocab): diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 4ef1a54aa..37a48794b 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -60,7 +60,7 @@ def test_count_by(nlp): # from spacy.en import English, attrs # nlp = English() import numpy - from spacy.en import attrs + from spacy import 
attrs tokens = nlp('apple apple orange banana') assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1} assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529], diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 515c64e6c..7d822d377 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals import pytest -import spacy.en +import spacy @pytest.fixture() @@ -45,7 +45,7 @@ def test_get_and_set_string_views_and_flags(nlp, token): def test_export_to_numpy_arrays(nlp, doc): - from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV + from spacy.attrs import ORTH, LIKE_URL, IS_OOV attr_ids = [ORTH, LIKE_URL, IS_OOV] doc_array = doc.to_array(attr_ids) From b125289f304ecbf47b825904971a9989a77d22d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:46:57 +1100 Subject: [PATCH 22/62] * Fix type declaration in asciied function --- spacy/orth.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 27123bb4e..882e06bf2 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -11,6 +11,7 @@ try: except ImportError: from text_unidecode import unidecode + import re import math @@ -165,7 +166,7 @@ cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0): cpdef bytes asciied(unicode string): - cdef str stripped = unidecode(string) + stripped = unidecode(string) if not stripped: return b'???' return stripped.encode('ascii') From 20b8c3e28172678bf497cca394adec3fdef990d1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:58:01 +1100 Subject: [PATCH 23/62] * Mark tests that require models --- tests/website/test_api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 37a48794b..50ec73827 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -26,6 +26,7 @@ def test_main_entry_point(nlp): doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. 
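# Practical aside (an assumption about intent, not text from the patch): with
# these @pytest.mark.models markers in place, the model-dependent tests can be
# deselected through pytest's standard marker expressions when no model data
# is installed, e.g.:
#
#     py.test tests/ -m "not models"
#
# while the trimmed .travis.yml above keeps running plain "py.test tests/ -x".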
+@pytest.mark.models def test_sentence_spans(nlp): # from spacy.en import English # nlp = English() @@ -33,6 +34,7 @@ def test_sentence_spans(nlp): assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] +@pytest.mark.models def test_entity_spans(nlp): # from spacy.en import English # nlp = English() @@ -44,6 +46,7 @@ def test_entity_spans(nlp): assert ents[0].string == ents[0].string +@pytest.mark.models def test_noun_chunk_spans(nlp): # from spacy.en import English # nlp = English() From 7b340912d4a433d007e9397ace340f6ef652bef9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:09:26 +1100 Subject: [PATCH 24/62] * Mark tests that require models --- tests/website/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 50ec73827..8b52ffff6 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -59,6 +59,7 @@ def test_noun_chunk_spans(nlp): # NP three noun chunks <-- has +@pytest.mark.models def test_count_by(nlp): # from spacy.en import English, attrs # nlp = English() From 76936a345617d85e964b227322dee3cc41554f58 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:19:07 +1100 Subject: [PATCH 25/62] * Mark tests that require models --- tests/website/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 8b52ffff6..49f661850 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -92,6 +92,7 @@ def test_token_span(doc): assert token.i == 4 +@pytest.mark.models def test_example_i_like_new_york1(nlp): toks = nlp('I like New York in Autumn.') From 5031440c35e06afd3fc4e859641063a34acb126f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:29:28 +1100 Subject: [PATCH 26/62] * Mark tests that require models --- tests/website/test_api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 49f661850..52910ae41 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -132,6 +132,7 @@ def dot(toks): return tok(toks, "dot") +@pytest.mark.models def test_example_i_like_new_york3(toks, new, york): assert toks[new].head.orth_ == 'York' assert toks[york].head.orth_ == 'like' @@ -142,6 +143,7 @@ def test_example_i_like_new_york4(toks, new, york): assert new_york.root.orth_ == 'York' +@pytest.mark.models def test_example_i_like_new_york5(toks, autumn, dot): assert toks[autumn].head.orth_ == 'in' assert toks[dot].head.orth_ == 'like' @@ -149,6 +151,7 @@ def test_example_i_like_new_york5(toks, autumn, dot): assert autumn_dot.root.orth_ == 'Autumn' +@pytest.mark.models def test_navigating_the_parse_tree_lefts(doc): # TODO: where does the span object come from? 
span = doc[:2] @@ -156,6 +159,7 @@ def test_navigating_the_parse_tree_lefts(doc): if span.doc[i].head in span] +@pytest.mark.models def test_navigating_the_parse_tree_rights(doc): span = doc[:2] rights = [span.doc[i] for i in range(span.end, len(span.doc)) From dea40cfec34298e869a3d1241eb8f58ea11cef5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:37:48 +1100 Subject: [PATCH 27/62] * Mark tests that require models --- tests/website/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 52910ae41..ef0365d88 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -138,6 +138,7 @@ def test_example_i_like_new_york3(toks, new, york): assert toks[york].head.orth_ == 'like' +@pytest.mark.models def test_example_i_like_new_york4(toks, new, york): new_york = toks[new:york+1] assert new_york.root.orth_ == 'York' From 00c1992503203eb4f20a54841d6b59b3ce5da7dc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:48:14 +1100 Subject: [PATCH 28/62] * Mark tests that require models --- tests/website/test_home.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 7d822d377..6c97b0f31 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -22,6 +22,7 @@ def test_get_tokens_and_sentences(doc): assert sentence.text == 'Hello, world.' +@pytest.mark.models def test_use_integer_ids_for_any_strings(nlp, token): hello_id = nlp.vocab.strings['Hello'] hello_str = nlp.vocab.strings[hello_id] @@ -68,6 +69,7 @@ def test_word_vectors(nlp): assert apples.similarity(oranges) > boots.similarity(hippos) +@pytest.mark.models def test_part_of_speech_tags(nlp): from spacy.parts_of_speech import ADV From 4bae38128d5ef0487526239cfb0626c72d803984 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:58:34 +1100 Subject: [PATCH 29/62] * Remove license page from website in repo --- website/src/jade/license/index.jade | 38 ----------------------------- 1 file changed, 38 deletions(-) delete mode 100644 website/src/jade/license/index.jade diff --git a/website/src/jade/license/index.jade b/website/src/jade/license/index.jade deleted file mode 100644 index b31e99949..000000000 --- a/website/src/jade/license/index.jade +++ /dev/null @@ -1,38 +0,0 @@ -include ../header.jade - -mixin LicenseOption(name, period, price, audience) - .item - h4 #{name} - - .focus #{period} - - span #{price} - - h5 Suggested for: - - span #{audience} - - a.button(href="/resources/pdf/spaCy_License_Agreement_2015.pdf", target="_blank") Download license - - span or #[a(href="mailto:sales@spacy.io") get in touch] - -- var Page = InitPage(Site, Authors.spacy, "license", "License") - -+WritePage(Site, Authors.spacy, Page) - article.pricing - .box.license - +LicenseOption("Trial", "90 days", "$0", "Evaluation") - +LicenseOption("Production", "1 year", "$5,000", "Production") - +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") - - p.caption Researcher, hobbyist, or open-source developer? spaCy also offers #[a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3] licenses. - - blockquote.pull-quote - p Let's face it: Services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt. - - p You need the source, and you need to know you can buy a long-term license. So that's what we offer. The difference between this and a black-box API is night and day. - - p Let's face it: services disappear. 
Constantly. The good start-ups get bought; the bad ones go bankrupt. Open-source projects become abandoned or bloated. Google's graveyard is over-flowing – ditto for Yahoo!, Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset? - - p A 5 year license won't expire until 2020. spaCy will be with you for longer than most of your current staff. If that's still not enough, get in touch. We can surely work something out. - From c5b2c4ead8d3c787e15cb6c451abd65ba8039593 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 14:58:45 +1100 Subject: [PATCH 30/62] * Don't build old license page --- website/Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/website/Makefile b/website/Makefile index 78dc9448c..940a8182c 100644 --- a/website/Makefile +++ b/website/Makefile @@ -12,9 +12,6 @@ site/index.html: src/jade/header.jade src/jade/*.jade site/docs/: src/jade/docs/*.jade src/jade/header.jade jade -P src/jade/docs/index.jade --out $@ -site/license/: src/jade/license/*.jade src/jade/header.jade - jade -P src/jade/license/index.jade --out $@ - site/blog/: src/jade/blog/*.jade site/blog/*/ site/tutorials/*/ src/jade/header.jade jade -P src/jade/blog/index.jade --out $@ From 0e13f18ea4ca3916fff068b85b13454d4f72daf2 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 07:23:39 +0200 Subject: [PATCH 31/62] remove compile warning noise --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0c05d890b..a7c27fb74 100644 --- a/setup.py +++ b/setup.py @@ -138,7 +138,7 @@ VERSION = '0.93' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] - compile_args = ['-O3', '-Wno-strict-prototypes'] + compile_args = ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] link_args = [] if sys.prefix == 'darwin': compile_args.append(['-mmacosx-version-min=10.8', '-stdlib=libc++']) From b71ba2eed517942765956750b0afa7fc73849c55 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 18:43:14 +1100 Subject: [PATCH 32/62] * Add tests for unicode puncuation character lemmatization --- tests/tagger/test_lemmatizer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index 8461a854e..df553c3d6 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals from spacy.lemmatizer import Lemmatizer, read_index, read_exc @@ -34,3 +35,14 @@ def test_noun_lemmas(lemmatizer): assert do('planets') == set(['planet']) assert do('ring') == set(['ring']) assert do('axes') == set(['axis', 'axe', 'ax']) + + +def test_smart_quotes(lemmatizer): + do = lemmatizer.punct + assert do('“') == set(['``']) + assert do('“') == set(['``']) + + +def test_smart_quotes(lemmatizer): + do = lemmatizer.punct + assert do('–') == set(["--"]) From 5332c0b697f68eb7d25221dc722d1d5ee65a479e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 18:44:21 +1100 Subject: [PATCH 33/62] * Add support for punctuation lemmatization, to handle unicode characters. 
This should help in addressing Issue #130 --- lang_data/en/lemma_rules.json | 6 ++++++ spacy/lemmatizer.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index c45eb1df6..498240be1 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -27,5 +27,11 @@ ["est", ""], ["er", "e"], ["est", "e"] + ], + + "punct": [ + ["“", "``"], + ["”", "''"], + ["–", "--"] ] } diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index ed04e2d77..c1d296d7c 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function from os import path import codecs @@ -7,7 +7,7 @@ try: except ImportError: import json -from .parts_of_speech import NOUN, VERB, ADJ +from .parts_of_speech import NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @@ -36,6 +36,8 @@ class Lemmatizer(object): pos = 'verb' elif pos == ADJ: pos = 'adj' + elif pos == PUNCT: + pos = 'punct' lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) return lemmas @@ -48,6 +50,9 @@ class Lemmatizer(object): def adj(self, string): return self(string, 'adj') + def punct(self, string): + return self(string, 'punct') + def lemmatize(string, index, exceptions, rules): string = string.lower() @@ -58,7 +63,7 @@ def lemmatize(string, index, exceptions, rules): for old, new in rules: if string.endswith(old): form = string[:len(string) - len(old)] + new - if form in index: + if form in index or not form.isalpha(): forms.append(form) if not forms: forms.append(string) From 2d9e5bf566be7a9a6706d1ef2b01f63a2bde7f2e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:02:42 +1100 Subject: [PATCH 34/62] * Allow punctuation to be lemmatized --- spacy/morphology.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 1a499aa0a..534f64a59 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -7,7 +7,7 @@ except ImportError: import json from .parts_of_speech import UNIV_POS_NAMES -from .parts_of_speech cimport ADJ, VERB, NOUN +from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT cdef class Morphology: @@ -81,7 +81,7 @@ cdef class Morphology: if self.lemmatizer is None: return orth cdef unicode py_string = self.strings[orth] - if pos != NOUN and pos != VERB and pos != ADJ: + if pos != NOUN and pos != VERB and pos != ADJ and pos != PUNCT: return orth cdef set lemma_strings cdef unicode lemma_string From 1842a53e73405be3048e6dd26afcfc2e4d5da5ee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:09:36 +1100 Subject: [PATCH 35/62] * Lemmatize smart quotes as plain quotes --- lang_data/en/lemma_rules.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 498240be1..0336b6b9f 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -30,8 +30,8 @@ ], "punct": [ - ["“", "``"], - ["”", "''"], + ["“", "\""], + ["”", "\""], ["–", "--"] ] } From 1490feda292d8065c01bb2136be5c30bbf5b23eb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:23:47 +1100 Subject: [PATCH 36/62] * Make generate_specials pretty-print the specials.json file --- lang_data/en/generate_specials.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 1a8f1ae0b..6ad503aec 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import json contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"} @@ -132,7 +133,6 @@ hardcoded_specials = { "Mt.": [{"F": "Mt.", "L": "Mount"}], "''": [{"F": "''"}], - "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], @@ -412,6 +412,6 @@ def generate_specials(): if __name__ == "__main__": specials = generate_specials() - with open("specials.json", "w") as f: - json.dump(specials, f) + with open("specials.json", "w") as file_: + file_.write(json.dumps(specials, indent=2)) From 393a13d1af2a0c22a04643e61e7c4b95b653250b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 19:24:33 +1100 Subject: [PATCH 37/62] * Add unicode em dash to specials.json, so that we can control what POS tag it gets. This way we can prevent sentence boundary detection errors, to address Issue #130. --- lang_data/en/generate_specials.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 6ad503aec..e50cd77d4 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -133,6 +133,9 @@ hardcoded_specials = { "Mt.": [{"F": "Mt.", "L": "Mount"}], "''": [{"F": "''"}], + + "—": [{"F": "—", "L": "--", "P": ":"}], + "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], From b6047afe4ca23e48fef28c08c40e91a6bd9c61b6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 10:25:37 +0200 Subject: [PATCH 38/62] * Fix punctuation lemma rules, to resolve Issue #130 --- lang_data/en/lemma_rules.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 498240be1..5a431224d 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -30,8 +30,8 @@ ], "punct": [ - ["“", "``"], - ["”", "''"], - ["–", "--"] + ["“", "\""], + ["”", "\""], + ["—", "--"] ] } From 0e92e8574ac4345f2b5d18a3dd182eaa69e43466 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:06:37 +0200 Subject: [PATCH 39/62] * Fix pos tag in em-dash in specials --- lang_data/en/generate_specials.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index e50cd77d4..db3827593 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -134,7 +134,7 @@ hardcoded_specials = { "''": [{"F": "''"}], - "—": [{"F": "—", "L": "--", "P": ":"}], + "—": [{"F": "—", "L": "--", "pos": ":"}], "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], From 49600a44a84dee1ac044578d99c02ffb50cd8b27 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:06:57 +0200 Subject: [PATCH 40/62] * Fix trailing comma in lemma_rules.json --- lang_data/en/lemma_rules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 140749b18..1d7366f92 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -31,6 +31,6 @@ "punct": [ ["“", "\""], - ["”", "\""], + ["”", "\""] ] } From a510858f5a516447fa050223fe27773f4c55fa79 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:07:45 +0200 Subject: [PATCH 41/62] 
* Pretty-print specials.json, and add the em dash --- lang_data/en/specials.json | 4864 +++++++++++++++++++++++++++++++++++- 1 file changed, 4863 insertions(+), 1 deletion(-) diff --git a/lang_data/en/specials.json b/lang_data/en/specials.json index 93672dc10..20d90e261 100644 --- a/lang_data/en/specials.json +++ b/lang_data/en/specials.json @@ -1 +1,4863 @@ -{"i've": [{"L": "-PRON-", "F": "i"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Where's": [{"F": "Where"}, {"F": "'s"}], "4p.m.": [{"F": "4"}, {"F": "p.m."}], "12am": [{"F": "12"}, {"L": "a.m.", "F": "am"}], "j.": [{"F": "j."}], "8pm": [{"F": "8"}, {"L": "p.m.", "F": "pm"}], "E.G.": [{"F": "E.G."}], "must've": [{"F": "must"}, {"L": "have", "pos": "VB", "F": "'ve"}], "D.C.": [{"F": "D.C."}], "She'd've": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "mightnt": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "nt"}], "Hes": [{"L": "-PRON-", "F": "He"}, {"F": "s"}], "7a.m.": [{"F": "7"}, {"F": "a.m."}], "Idve": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ill.": [{"F": "Ill."}], ":P": [{"F": ":P"}], "\t": [{"pos": "SP", "F": "\t"}], "10a.m.": [{"F": "10"}, {"F": "a.m."}], "would've": [{"F": "would"}, {"L": "have", "pos": "VB", "F": "'ve"}], "11am": [{"F": "11"}, {"L": "a.m.", "F": "am"}], "you'd": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "'d"}], "Thered": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "d"}], "havent": [{"pos": "VB", "F": "have"}, {"L": "not", "pos": "RB", "F": "nt"}], "im": [{"L": "-PRON-", "F": "i"}, {"L": "be", "F": "m", "pos": "VBP", "tenspect": 1, "number": 1}], "Whatll": [{"F": "What"}, {"L": "will", "pos": "MD", "F": "ll"}], "there'd": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "'d"}], "Mustn't": [{"F": "Must"}, {"L": "not", "pos": "RB", "F": "n't"}], "haven't": [{"pos": "VB", "F": "have"}, {"L": "not", "pos": "RB", "F": "n't"}], "hows": [{"F": "how"}, {"F": "s"}], "Doesn't": [{"L": "do", "pos": "VBZ", "F": "Does"}, {"L": "not", "pos": "RB", "F": "n't"}], "You're": [{"L": "-PRON-", "F": "You"}, {"F": "'re"}], "he's": [{"L": "-PRON-", "F": "he"}, {"F": "'s"}], "Mo.": [{"F": "Mo."}], "Theydve": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "We're": [{"F": "We"}, {"F": "'re"}], "can't": [{"L": "can", "pos": "MD", "F": "ca"}, {"L": "not", "pos": "RB", "F": "n't"}], "they've": [{"L": "-PRON-", "F": "they"}, {"L": "have", "pos": "VB", "F": "'ve"}], "werent": [{"F": "were"}, {"L": "not", "pos": "RB", "F": "nt"}], "i'm": [{"L": "-PRON-", "F": "i"}, {"L": "be", "F": "'m", "pos": "VBP", "tenspect": 1, "number": 1}], "Wouldve": [{"F": "Would"}, {"L": "have", "pos": "VB", "F": "ve"}], "Inc.": [{"F": "Inc."}], "Isnt": [{"L": "be", "pos": "VBZ", "F": "Is"}, {"L": "not", "pos": "RB", "F": "nt"}], "mightn't": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "n't"}], "itd": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "d"}], "^_^": [{"F": "^_^"}], "4pm": [{"F": "4"}, {"L": "p.m.", "F": "pm"}], "theyd": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "d"}], "p.": [{"F": "p."}], "Hasnt": [{"F": "Has"}, {"L": "not", "pos": "RB", "F": "nt"}], "how'd": [{"F": "how"}, {"L": "would", "pos": "MD", "F": "'d"}], "you'll": [{"L": "-PRON-", "F": "you"}, {"L": "will", "pos": "MD", "F": "'ll"}], "how's": [{"F": "how"}, {"F": "'s"}], "e.g.": [{"F": "e.g."}], "didn't": [{"L": "do", "pos": "VBD", 
"F": "did"}, {"L": "not", "pos": "RB", "F": "n't"}], "6pm": [{"F": "6"}, {"L": "p.m.", "F": "pm"}], "z.": [{"F": "z."}], "Howll": [{"F": "How"}, {"L": "will", "pos": "MD", "F": "ll"}], "Shant": [{"F": "Sha"}, {"L": "not", "pos": "RB", "F": "nt"}], "Theyd": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "d"}], "f.": [{"F": "f."}], "u.": [{"F": "u."}], "she'd": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "'d"}], "Fla.": [{"F": "Fla."}], "Rep.": [{"F": "Rep."}], "they're": [{"L": "-PRON-", "F": "they"}, {"F": "'re"}], "you'd've": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mightve": [{"F": "Might"}, {"L": "have", "pos": "VB", "F": "ve"}], "Why'll": [{"F": "Why"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Should've": [{"F": "Should"}, {"L": "have", "pos": "VB", "F": "'ve"}], "that's": [{"F": "that"}, {"F": "'s"}], "9pm": [{"F": "9"}, {"L": "p.m.", "F": "pm"}], "Mass.": [{"F": "Mass."}], "there's": [{"F": "there"}, {"F": "'s"}], "It'd": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "'d"}], "hasn't": [{"F": "has"}, {"L": "not", "pos": "RB", "F": "n't"}], "shes": [{"L": "-PRON-", "F": "she"}, {"F": "s"}], "she'd've": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "o.O": [{"F": "o.O"}], "whered": [{"F": "where"}, {"L": "would", "pos": "MD", "F": "d"}], ":(((": [{"F": ":((("}], "N.C.": [{"F": "N.C."}], "you're": [{"L": "-PRON-", "F": "you"}, {"F": "'re"}], ":0": [{"F": ":0"}], "Wouldn't": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "n't"}], "'em": [{"F": "'em"}], "Whatve": [{"F": "What"}, {"L": "have", "pos": "VB", "F": "ve"}], "Corp.": [{"F": "Corp."}], "i'ma": [{"L": "-PRON-", "F": "i"}, {"F": "'ma"}], "''": [{"F": "''"}], "v.": [{"F": "v."}], "Ga.": [{"F": "Ga."}], "1am": [{"F": "1"}, {"L": "a.m.", "F": "am"}], "Wasnt": [{"F": "Was"}, {"L": "not", "pos": "RB", "F": "nt"}], "q.": [{"F": "q."}], "Hows": [{"F": "How"}, {"F": "s"}], "why're": [{"F": "why"}, {"F": "'re"}], ";-p": [{"F": ";-p"}], "Ima": [{"L": "-PRON-", "F": "I"}, {"F": "ma"}], "neednt": [{"F": "need"}, {"L": "not", "pos": "RB", "F": "nt"}], "Ariz.": [{"F": "Ariz."}], "8am": [{"F": "8"}, {"L": "a.m.", "F": "am"}], "Aren't": [{"L": "be", "pos": "VBP", "F": "Are", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "4am": [{"F": "4"}, {"L": "a.m.", "F": "am"}], "she'll": [{"L": "-PRON-", "F": "she"}, {"L": "will", "pos": "MD", "F": "'ll"}], "8p.m.": [{"F": "8"}, {"F": "p.m."}], "9p.m.": [{"F": "9"}, {"F": "p.m."}], "11p.m.": [{"F": "11"}, {"F": "p.m."}], "Who'd": [{"F": "Who"}, {"L": "would", "pos": "MD", "F": "'d"}], "St.": [{"F": "St."}], "It's": [{"L": "-PRON-", "F": "It"}, {"F": "'s"}], "Gen.": [{"F": "Gen."}], "Messrs.": [{"F": "Messrs."}], "Calif.": [{"F": "Calif."}], "youdve": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "i'll": [{"L": "-PRON-", "F": "i"}, {"L": "will", "pos": "MD", "F": "'ll"}], "whatll": [{"F": "what"}, {"L": "will", "pos": "MD", "F": "ll"}], "mightntve": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldnt": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "nt"}], "Hasn't": [{"F": "Has"}, {"L": "not", "pos": "RB", "F": "n't"}], "hasnt": [{"F": "has"}, {"L": "not", "pos": "RB", "F": "nt"}], "shouldnt": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "nt"}], "Haven't": [{"pos": "VB", "F": "Have"}, 
{"L": "not", "pos": "RB", "F": "n't"}], "wedve": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Must've": [{"F": "Must"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Minn.": [{"F": "Minn."}], "s.": [{"F": "s."}], "isnt": [{"L": "be", "pos": "VBZ", "F": "is"}, {"L": "not", "pos": "RB", "F": "nt"}], "He'd've": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "o_o": [{"F": "o_o"}], "let's": [{"F": "let"}, {"F": "'s"}], "They've": [{"L": "-PRON-", "F": "They"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Co.": [{"F": "Co."}], "p.m.": [{"F": "p.m."}], "we're": [{"F": "we"}, {"F": "'re"}], "May.": [{"F": "May."}], "Ala.": [{"F": "Ala."}], "10am": [{"F": "10"}, {"L": "a.m.", "F": "am"}], "itll": [{"L": "-PRON-", "F": "it"}, {"L": "will", "pos": "MD", "F": "ll"}], "n.": [{"F": "n."}], "5pm": [{"F": "5"}, {"L": "p.m.", "F": "pm"}], "hedve": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Shan't": [{"F": "Sha"}, {"L": "not", "pos": "RB", "F": "n't"}], "Wont": [{"F": "Wo"}, {"L": "not", "pos": "RB", "F": "nt"}], "'S": [{"L": "'s", "F": "'S"}], ";(": [{"F": ";("}], "Mightn't've": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "needn't": [{"F": "need"}, {"L": "not", "pos": "RB", "F": "n't"}], "Shes": [{"L": "-PRON-", "F": "She"}, {"F": "s"}], "he'll": [{"L": "-PRON-", "F": "he"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Whereve": [{"F": "Where"}, {"L": "have", "pos": "VB", "F": "ve"}], "aint": [{"L": "be", "pos": "VBP", "F": "ai", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "Colo.": [{"F": "Colo."}], "who've": [{"F": "who"}, {"L": "have", "pos": "VB", "F": "'ve"}], "it'd": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "'d"}], "theyll": [{"L": "-PRON-", "F": "they"}, {"L": "will", "pos": "MD", "F": "ll"}], "wont": [{"F": "wo"}, {"L": "not", "pos": "RB", "F": "nt"}], "whyre": [{"F": "why"}, {"F": "re"}], "Nev.": [{"F": "Nev."}], "Dec.": [{"F": "Dec."}], "whereve": [{"F": "where"}, {"L": "have", "pos": "VB", "F": "ve"}], "Cant": [{"L": "can", "pos": "MD", "F": "Ca"}, {"L": "not", "pos": "RB", "F": "nt"}], "1a.m.": [{"F": "1"}, {"F": "a.m."}], "i.e.": [{"F": "i.e."}], "3am": [{"F": "3"}, {"L": "a.m.", "F": "am"}], "Won't": [{"F": "Wo"}, {"L": "not", "pos": "RB", "F": "n't"}], "hes": [{"L": "-PRON-", "F": "he"}, {"F": "s"}], "Let's": [{"F": "Let"}, {"F": "'s"}], "I'll": [{"L": "-PRON-", "F": "I"}, {"L": "will", "pos": "MD", "F": "'ll"}], "We'll": [{"F": "We"}, {"L": "will", "pos": "MD", "F": "'ll"}], "who'd": [{"F": "who"}, {"L": "would", "pos": "MD", "F": "'d"}], "E.g.": [{"F": "E.g."}], "we'd": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "'d"}], "Theyre": [{"L": "-PRON-", "F": "They"}, {"F": "re"}], "She's": [{"L": "-PRON-", "F": "She"}, {"F": "'s"}], "Whod": [{"F": "Who"}, {"L": "would", "pos": "MD", "F": "d"}], "Itll": [{"L": "-PRON-", "F": "It"}, {"L": "will", "pos": "MD", "F": "ll"}], "couldn't've": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "How'd": [{"F": "How"}, {"L": "would", "pos": "MD", "F": "'d"}], "wouldve": [{"F": "would"}, {"L": "have", "pos": "VB", "F": "ve"}], "shan't": [{"F": "sha"}, {"L": "not", "pos": "RB", "F": "n't"}], "8a.m.": [{"F": "8"}, {"F": "a.m."}], "Havent": [{"pos": "VB", "F": "Have"}, {"L": "not", "pos": "RB", "F": "nt"}], "-__-": [{"F": "-__-"}], "6am": [{"F": "6"}, 
{"L": "a.m.", "F": "am"}], "Hadntve": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "10p.m.": [{"F": "10"}, {"F": "p.m."}], "Might've": [{"F": "Might"}, {"L": "have", "pos": "VB", "F": "'ve"}], "N.M.": [{"F": "N.M."}], "shouldn't": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "n't"}], "(^_^)": [{"F": "(^_^)"}], "x.": [{"F": "x."}], "where've": [{"F": "where"}, {"L": "have", "pos": "VB", "F": "'ve"}], ";)": [{"F": ";)"}], "theydve": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "dont": [{"L": "do", "F": "do"}, {"L": "not", "pos": "RB", "F": "nt"}], "wouldn't": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "n't"}], "g.": [{"F": "g."}], "Who've": [{"F": "Who"}, {"L": "have", "pos": "VB", "F": "'ve"}], "might've": [{"F": "might"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Who's": [{"F": "Who"}, {"F": "'s"}], "Theyve": [{"L": "-PRON-", "F": "They"}, {"L": "have", "pos": "VB", "F": "ve"}], "2p.m.": [{"F": "2"}, {"F": "p.m."}], "shouldn't've": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "hed": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "d"}], "1p.m.": [{"F": "1"}, {"F": "p.m."}], "We've": [{"F": "We"}, {"L": "have", "pos": "VB", "F": "'ve"}], "a.": [{"F": "a."}], "<333": [{"F": "<333"}], "l.": [{"F": "l."}], "It'll": [{"L": "-PRON-", "F": "It"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Jun.": [{"F": "Jun."}], "Mrs.": [{"F": "Mrs."}], "what's": [{"F": "what"}, {"F": "'s"}], "N.Y.": [{"F": "N.Y."}], "Why're": [{"F": "Why"}, {"F": "'re"}], "Wis.": [{"F": "Wis."}], "Hedve": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Kans.": [{"F": "Kans."}], "idve": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "We'd've": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Dont": [{"L": "do", "F": "Do"}, {"L": "not", "pos": "RB", "F": "nt"}], ":')": [{"F": ":')"}], "(=": [{"F": "(="}], "won't": [{"F": "wo"}, {"L": "not", "pos": "RB", "F": "n't"}], "who'll": [{"F": "who"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Not've": [{"L": "not", "pos": "RB", "F": "Not"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Gov.": [{"F": "Gov."}], "couldntve": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Doesnt": [{"L": "do", "pos": "VBZ", "F": "Does"}, {"L": "not", "pos": "RB", "F": "nt"}], "11a.m.": [{"F": "11"}, {"F": "a.m."}], "I.e.": [{"F": "I.e."}], "wasn't": [{"F": "was"}, {"L": "not", "pos": "RB", "F": "n't"}], "5am": [{"F": "5"}, {"L": "a.m.", "F": "am"}], "Shouldve": [{"F": "Should"}, {"L": "have", "pos": "VB", "F": "ve"}], "Jan.": [{"F": "Jan."}], "she's": [{"L": "-PRON-", "F": "she"}, {"F": "'s"}], "We'd": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "'d"}], "Itd": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "d"}], "What's": [{"F": "What"}, {"F": "'s"}], "e.": [{"F": "e."}], "7p.m.": [{"F": "7"}, {"F": "p.m."}], "Wholl": [{"F": "Who"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadntve": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Where'd": [{"F": "Where"}, {"L": "would", "pos": "MD", "F": "'d"}], ":-)": [{"F": ":-)"}], "whos": [{"F": "who"}, {"F": "s"}], "mustn't": [{"F": "must"}, {"L": 
"not", "pos": "RB", "F": "n't"}], "shouldntve": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Youdve": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "mustnt": [{"F": "must"}, {"L": "not", "pos": "RB", "F": "nt"}], "Oct.": [{"F": "Oct."}], "a.m.": [{"F": "a.m."}], "wouldn't've": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "k.": [{"F": "k."}], "Hadn't've": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "who're": [{"F": "who"}, {"F": "'re"}], "6a.m.": [{"F": "6"}, {"F": "a.m."}], "Rev.": [{"F": "Rev."}], "Del.": [{"F": "Del."}], "Ind.": [{"F": "Ind."}], "couldn't": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "n't"}], "La.": [{"F": "La."}], "It'd've": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "t.": [{"F": "t."}], "don't": [{"L": "do", "F": "do"}, {"L": "not", "pos": "RB", "F": "n't"}], "Mightnt": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "nt"}], ":3": [{"F": ":3"}], "shouldve": [{"F": "should"}, {"L": "have", "pos": "VB", "F": "ve"}], "notve": [{"L": "not", "pos": "RB", "F": "not"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldn't've": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Aint": [{"L": "be", "pos": "VBP", "F": "Ai", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "wheres": [{"F": "where"}, {"F": "s"}], "Don't": [{"L": "do", "F": "Do"}, {"L": "not", "pos": "RB", "F": "n't"}], "Theredve": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Could've": [{"pos": "MD", "F": "Could"}, {"L": "have", "pos": "VB", "F": "'ve"}], "d.": [{"F": "d."}], "Wouldnt": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "nt"}], "They're": [{"L": "-PRON-", "F": "They"}, {"F": "'re"}], "There's": [{"F": "There"}, {"F": "'s"}], "Mr.": [{"F": "Mr."}], "shant": [{"F": "sha"}, {"L": "not", "pos": "RB", "F": "nt"}], "how'll": [{"F": "how"}, {"L": "will", "pos": "MD", "F": "'ll"}], "'s": [{"L": "'s", "F": "'s"}], "whens": [{"F": "when"}, {"F": "s"}], ";p": [{"F": ";p"}], "Youll": [{"L": "-PRON-", "F": "You"}, {"L": "will", "pos": "MD", "F": "ll"}], "Wheres": [{"F": "Where"}, {"F": "s"}], ":p": [{"F": ":p"}], ":-P": [{"F": ":-P"}], "Dr.": [{"F": "Dr."}], "they'd": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "'d"}], "Whatre": [{"F": "What"}, {"F": "re"}], ";-)": [{"F": ";-)"}], "N.D.": [{"F": "N.D."}], "I'ma": [{"L": "-PRON-", "F": "I"}, {"F": "'ma"}], "N.H.": [{"F": "N.H."}], "Wasn't": [{"F": "Was"}, {"L": "not", "pos": "RB", "F": "n't"}], "itdve": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Didnt": [{"L": "do", "pos": "VBD", "F": "Did"}, {"L": "not", "pos": "RB", "F": "nt"}], "Ark.": [{"F": "Ark."}], ":>": [{"F": ":>"}], "Wouldntve": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "6p.m.": [{"F": "6"}, {"F": "p.m."}], "where'd": [{"F": "where"}, {"L": "would", "pos": "MD", "F": "'d"}], ":))": [{"F": ":))"}], ":/": [{"F": ":/"}], "1pm": [{"F": "1"}, {"L": "p.m.", "F": "pm"}], "should've": [{"F": "should"}, {"L": "have", "pos": "VB", "F": "'ve"}], "2am": [{"F": "2"}, {"L": "a.m.", "F": "am"}], "ain't": [{"L": "be", "pos": "VBP", "F": "ai", 
"number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "Nov.": [{"F": "Nov."}], "didnt": [{"L": "do", "pos": "VBD", "F": "did"}, {"L": "not", "pos": "RB", "F": "nt"}], "4a.m.": [{"F": "4"}, {"F": "a.m."}], "co.": [{"F": "co."}], "i.": [{"F": "i."}], "when's": [{"F": "when"}, {"F": "'s"}], "wouldntve": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "mightve": [{"F": "might"}, {"L": "have", "pos": "VB", "F": "ve"}], "howll": [{"F": "how"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadn't": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "n't"}], "I'd've": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Feb.": [{"F": "Feb."}], "howd": [{"F": "how"}, {"L": "would", "pos": "MD", "F": "d"}], "it'd've": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "isn't": [{"L": "be", "pos": "VBZ", "F": "is"}, {"L": "not", "pos": "RB", "F": "n't"}], "weve": [{"F": "we"}, {"L": "have", "pos": "VB", "F": "ve"}], "Sen.": [{"F": "Sen."}], "Whove": [{"F": "Who"}, {"L": "have", "pos": "VB", "F": "ve"}], "Youd": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "d"}], "3a.m.": [{"F": "3"}, {"F": "a.m."}], "Where've": [{"F": "Where"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Shouldn't": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "n't"}], "whats": [{"F": "what"}, {"F": "s"}], "Cannot": [{"L": "can", "pos": "MD", "F": "Can"}, {"L": "not", "pos": "RB", "F": "not"}], "You'd've": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "What'll": [{"F": "What"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Thats": [{"F": "That"}, {"F": "s"}], "o_O": [{"F": "o_O"}], "Whats": [{"F": "What"}, {"F": "s"}], "What're": [{"F": "What"}, {"F": "'re"}], "xDD": [{"F": "xDD"}], "3pm": [{"F": "3"}, {"L": "p.m.", "F": "pm"}], "Who're": [{"F": "Who"}, {"F": "'re"}], "mustve": [{"F": "must"}, {"L": "have", "pos": "VB", "F": "ve"}], ":-/": [{"F": ":-/"}], "Apr.": [{"F": "Apr."}], "ima": [{"L": "-PRON-", "F": "i"}, {"F": "ma"}], "Whens": [{"F": "When"}, {"F": "s"}], "Kan.": [{"F": "Kan."}], "w.": [{"F": "w."}], "3p.m.": [{"F": "3"}, {"F": "p.m."}], "Whyre": [{"F": "Why"}, {"F": "re"}], "-_-": [{"F": "-_-"}], "12pm": [{"F": "12"}, {"L": "p.m.", "F": "pm"}], "Ltd.": [{"F": "Ltd."}], "wasnt": [{"F": "was"}, {"L": "not", "pos": "RB", "F": "nt"}], "Shedve": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Nebr.": [{"F": "Nebr."}], "o.": [{"F": "o."}], ";D": [{"F": ";D"}], "whys": [{"F": "why"}, {"F": "s"}], "Tenn.": [{"F": "Tenn."}], "She'd": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "'d"}], "Needn't": [{"F": "Need"}, {"L": "not", "pos": "RB", "F": "n't"}], "Hadnt": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "nt"}], "m.": [{"F": "m."}], "arent": [{"L": "be", "pos": "VBP", "F": "are", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "Arent": [{"L": "be", "pos": "VBP", "F": "Are", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "<33": [{"F": "<33"}], " ": [{"pos": "SP", "F": " "}], "you've": [{"L": "-PRON-", "F": "you"}, {"L": "have", "pos": "VB", "F": "'ve"}], "mightn't've": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Aug.": [{"F": "Aug."}], "=3": [{"F": "=3"}], "Miss.": [{"F": "Miss."}], "Jul.": [{"F": "Jul."}], 
"Werent": [{"F": "Were"}, {"L": "not", "pos": "RB", "F": "nt"}], "You'd": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "'d"}], "How's": [{"F": "How"}, {"F": "'s"}], "2a.m.": [{"F": "2"}, {"F": "a.m."}], "youre": [{"L": "-PRON-", "F": "you"}, {"F": "re"}], "hadn't've": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "12p.m.": [{"F": "12"}, {"F": "p.m."}], "Im": [{"L": "-PRON-", "F": "I"}, {"L": "be", "F": "m", "pos": "VBP", "tenspect": 1, "number": 1}], "not've": [{"L": "not", "pos": "RB", "F": "not"}, {"L": "have", "pos": "VB", "F": "'ve"}], "thats": [{"F": "that"}, {"F": "s"}], "Mustnt": [{"F": "Must"}, {"L": "not", "pos": "RB", "F": "nt"}], "what're": [{"F": "what"}, {"F": "'re"}], "How'll": [{"F": "How"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Conn.": [{"F": "Conn."}], "it's": [{"L": "-PRON-", "F": "it"}, {"F": "'s"}], "Can't": [{"L": "can", "pos": "MD", "F": "Ca"}, {"L": "not", "pos": "RB", "F": "n't"}], "'ol": [{"F": "'ol"}], "Mustve": [{"F": "Must"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldn't": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "n't"}], "Okla.": [{"F": "Okla."}], "what'll": [{"F": "what"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Whys": [{"F": "Why"}, {"F": "s"}], "it'll": [{"L": "-PRON-", "F": "it"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Mt.": [{"L": "Mount", "F": "Mt."}], "Itdve": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "couldve": [{"pos": "MD", "F": "could"}, {"L": "have", "pos": "VB", "F": "ve"}], "wholl": [{"F": "who"}, {"L": "will", "pos": "MD", "F": "ll"}], "I've": [{"L": "-PRON-", "F": "I"}, {"L": "have", "pos": "VB", "F": "'ve"}], "thered": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "d"}], "Theyll": [{"L": "-PRON-", "F": "They"}, {"L": "will", "pos": "MD", "F": "ll"}], "Neb.": [{"F": "Neb."}], "Who'll": [{"F": "Who"}, {"L": "will", "pos": "MD", "F": "'ll"}], "cannot": [{"L": "can", "pos": "MD", "F": "can"}, {"L": "not", "pos": "RB", "F": "not"}], ":(": [{"F": ":("}], "xD": [{"F": "xD"}], "10pm": [{"F": "10"}, {"L": "p.m.", "F": "pm"}], "couldnt": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "nt"}], "Would've": [{"F": "Would"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mightn't": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "n't"}], "5p.m.": [{"F": "5"}, {"F": "p.m."}], "youve": [{"L": "-PRON-", "F": "you"}, {"L": "have", "pos": "VB", "F": "ve"}], ":Y": [{"F": ":Y"}], "shedve": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "why's": [{"F": "why"}, {"F": "'s"}], "could've": [{"pos": "MD", "F": "could"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Neednt": [{"F": "Need"}, {"L": "not", "pos": "RB", "F": "nt"}], "vs.": [{"F": "vs."}], "Mont.": [{"F": "Mont."}], "Adm.": [{"F": "Adm."}], "Md.": [{"F": "Md."}], "That's": [{"F": "That"}, {"F": "'s"}], "Mar.": [{"F": "Mar."}], "they'll": [{"L": "-PRON-", "F": "they"}, {"L": "will", "pos": "MD", "F": "'ll"}], "b.": [{"F": "b."}], "Sep.": [{"F": "Sep."}], "whod": [{"F": "who"}, {"L": "would", "pos": "MD", "F": "d"}], "2pm": [{"F": "2"}, {"L": "p.m.", "F": "pm"}], "whyll": [{"F": "why"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadnt": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "nt"}], "There'd've": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "He'd": [{"L": "-PRON-", "F": 
"He"}, {"L": "would", "pos": "MD", "F": "'d"}], "theyre": [{"L": "-PRON-", "F": "they"}, {"F": "re"}], "Ms.": [{"F": "Ms."}], "there'd've": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "5a.m.": [{"F": "5"}, {"F": "a.m."}], "7am": [{"F": "7"}, {"L": "a.m.", "F": "am"}], "they'd've": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mich.": [{"F": "Mich."}], "cant": [{"L": "can", "pos": "MD", "F": "ca"}, {"L": "not", "pos": "RB", "F": "nt"}], "Va.": [{"F": "Va."}], "11pm": [{"F": "11"}, {"L": "p.m.", "F": "pm"}], "youll": [{"L": "-PRON-", "F": "you"}, {"L": "will", "pos": "MD", "F": "ll"}], "Isn't": [{"L": "be", "pos": "VBZ", "F": "Is"}, {"L": "not", "pos": "RB", "F": "n't"}], "i'd've": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Hadn't": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "n't"}], "why'll": [{"F": "why"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Jr.": [{"F": "Jr."}], "whove": [{"F": "who"}, {"L": "have", "pos": "VB", "F": "ve"}], "we'd've": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Youve": [{"L": "-PRON-", "F": "You"}, {"L": "have", "pos": "VB", "F": "ve"}], "He'll": [{"L": "-PRON-", "F": "He"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Wedve": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "9am": [{"F": "9"}, {"L": "a.m.", "F": "am"}], "Hed": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "d"}], "whatve": [{"F": "what"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ore.": [{"F": "Ore."}], "(:": [{"F": "(:"}], "Shouldnt": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "nt"}], "Wash.": [{"F": "Wash."}], "Weve": [{"F": "We"}, {"L": "have", "pos": "VB", "F": "ve"}], "N.J.": [{"F": "N.J."}], "Shouldntve": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "h.": [{"F": "h."}], "we'll": [{"F": "we"}, {"L": "will", "pos": "MD", "F": "'ll"}], "we've": [{"F": "we"}, {"L": "have", "pos": "VB", "F": "'ve"}], "doesnt": [{"L": "do", "pos": "VBZ", "F": "does"}, {"L": "not", "pos": "RB", "F": "nt"}], "who's": [{"F": "who"}, {"F": "'s"}], "he'd": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "'d"}], "Ain't": [{"L": "be", "pos": "VBP", "F": "Ai", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "I'd": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "'d"}], "theredve": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "She'll": [{"L": "-PRON-", "F": "She"}, {"L": "will", "pos": "MD", "F": "'ll"}], "They'd": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "'d"}], "\")": [{"F": "\")"}], "Couldve": [{"pos": "MD", "F": "Could"}, {"L": "have", "pos": "VB", "F": "ve"}], "Whyll": [{"F": "Why"}, {"L": "will", "pos": "MD", "F": "ll"}], "y.": [{"F": "y."}], "12a.m.": [{"F": "12"}, {"F": "a.m."}], "wouldnt": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "nt"}], "<3": [{"F": "<3"}], "\n": [{"pos": "SP", "F": "\n"}], "Whered": [{"F": "Where"}, {"L": "would", "pos": "MD", "F": "d"}], "I'm": [{"L": "-PRON-", "F": "I"}, {"L": "be", "F": "'m", "pos": "VBP", "tenspect": 1, "number": 1}], "Couldntve": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ive": [{"L": "-PRON-", "F": "I"}, {"L": "have", 
"pos": "VB", "F": "ve"}], "i'd": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "'d"}], "youd": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "d"}], "There'd": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "'d"}], "He's": [{"L": "-PRON-", "F": "He"}, {"F": "'s"}], "Mightntve": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "When's": [{"F": "When"}, {"F": "'s"}], "doesn't": [{"L": "do", "pos": "VBZ", "F": "does"}, {"L": "not", "pos": "RB", "F": "n't"}], "=[[": [{"F": "=[["}], "Youre": [{"L": "-PRON-", "F": "You"}, {"F": "re"}], "=]": [{"F": "=]"}], "You'll": [{"L": "-PRON-", "F": "You"}, {"L": "will", "pos": "MD", "F": "'ll"}], "=)": [{"F": "=)"}], "Pa.": [{"F": "Pa."}], "he'd've": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "You've": [{"L": "-PRON-", "F": "You"}, {"L": "have", "pos": "VB", "F": "'ve"}], "They'll": [{"L": "-PRON-", "F": "They"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Ky.": [{"F": "Ky."}], "c.": [{"F": "c."}], "I.E.": [{"F": "I.E."}], "V_V": [{"F": "V_V"}], "Didn't": [{"L": "do", "pos": "VBD", "F": "Did"}, {"L": "not", "pos": "RB", "F": "n't"}], "What've": [{"F": "What"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Weren't": [{"F": "Were"}, {"L": "not", "pos": "RB", "F": "n't"}], ":]": [{"F": ":]"}], "Notve": [{"L": "not", "pos": "RB", "F": "Not"}, {"L": "have", "pos": "VB", "F": "ve"}], "9a.m.": [{"F": "9"}, {"F": "a.m."}], "7pm": [{"F": "7"}, {"L": "p.m.", "F": "pm"}], "Sept.": [{"F": "Sept."}], "Bros.": [{"F": "Bros."}], "Howd": [{"F": "How"}, {"L": "would", "pos": "MD", "F": "d"}], "weren't": [{"F": "were"}, {"L": "not", "pos": "RB", "F": "n't"}], "Why's": [{"F": "Why"}, {"F": "'s"}], ":((": [{"F": ":(("}], "theyve": [{"L": "-PRON-", "F": "they"}, {"L": "have", "pos": "VB", "F": "ve"}], "where's": [{"F": "where"}, {"F": "'s"}], "ive": [{"L": "-PRON-", "F": "i"}, {"L": "have", "pos": "VB", "F": "ve"}], "=D": [{"F": "=D"}], "what've": [{"F": "what"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Whos": [{"F": "Who"}, {"F": "s"}], ":O": [{"F": ":O"}], "Shouldn't've": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "whatre": [{"F": "what"}, {"F": "re"}], "Wouldn't've": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "aren't": [{"L": "be", "pos": "VBP", "F": "are", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], ":)": [{"F": ":)"}], "They'd've": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}]} \ No newline at end of file +{ + "d.": [ + { + "F": "d." + } + ], + "Theydve": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ":/": [ + { + "F": ":/" + } + ], + "shouldn't've": [ + { + "F": "should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "10a.m.": [ + { + "F": "10" + }, + { + "F": "a.m." + } + ], + "E.G.": [ + { + "F": "E.G." + } + ], + "howll": [ + { + "F": "how" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "6a.m.": [ + { + "F": "6" + }, + { + "F": "a.m." + } + ], + "Ore.": [ + { + "F": "Ore." 
+ } + ], + "Hadn't've": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ":>": [ + { + "F": ":>" + } + ], + "3p.m.": [ + { + "F": "3" + }, + { + "F": "p.m." + } + ], + "who'll": [ + { + "F": "who" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "5a.m.": [ + { + "F": "5" + }, + { + "F": "a.m." + } + ], + ":(": [ + { + "F": ":(" + } + ], + ":0": [ + { + "F": ":0" + } + ], + ":)": [ + { + "F": ":)" + } + ], + "aint": [ + { + "F": "ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + " ": [ + { + "pos": "SP", + "F": " " + } + ], + "Dec.": [ + { + "F": "Dec." + } + ], + "Shouldnt": [ + { + "F": "Should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Ky.": [ + { + "F": "Ky." + } + ], + "when's": [ + { + "F": "when" + }, + { + "F": "'s" + } + ], + "Didnt": [ + { + "F": "Did", + "L": "do", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "itll": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Who're": [ + { + "F": "Who" + }, + { + "F": "'re" + } + ], + "=D": [ + { + "F": "=D" + } + ], + "Ain't": [ + { + "F": "Ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Can't": [ + { + "F": "Ca", + "L": "can", + "pos": "MD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Whyre": [ + { + "F": "Why" + }, + { + "F": "re" + } + ], + "Aren't": [ + { + "F": "Are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Neednt": [ + { + "F": "Need" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "should've": [ + { + "F": "should" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "shouldn't": [ + { + "F": "should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Idve": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "weve": [ + { + "F": "we" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Va.": [ + { + "F": "Va." + } + ], + "D.C.": [ + { + "F": "D.C." + } + ], + "3am": [ + { + "F": "3" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Ive": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Md.": [ + { + "F": "Md." + } + ], + ";D": [ + { + "F": ";D" + } + ], + "Mrs.": [ + { + "F": "Mrs." + } + ], + "Minn.": [ + { + "F": "Minn." + } + ], + "they'd": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Youdve": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "theyve": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Weren't": [ + { + "F": "Were" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "werent": [ + { + "F": "were" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whyre": [ + { + "F": "why" + }, + { + "F": "re" + } + ], + "g.": [ + { + "F": "g." 
+ } + ], + "I'm": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "pos": "VBP", + "F": "'m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + ":p": [ + { + "F": ":p" + } + ], + "She'd've": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "not've": [ + { + "F": "not", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "we'll": [ + { + "F": "we" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + ":O": [ + { + "F": ":O" + } + ], + "<33": [ + { + "F": "<33" + } + ], + "Don't": [ + { + "L": "do", + "F": "Do" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Whyll": [ + { + "F": "Why" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "''": [ + { + "F": "''" + } + ], + "they've": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "t.": [ + { + "F": "t." + } + ], + "wasn't": [ + { + "F": "was" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "could've": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "what've": [ + { + "F": "what" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "havent": [ + { + "pos": "VB", + "F": "have" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Who've": [ + { + "F": "Who" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ":P": [ + { + "F": ":P" + } + ], + "Shan't": [ + { + "F": "Sha" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "i'll": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "i.e.": [ + { + "F": "i.e." + } + ], + "you'd": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "w.": [ + { + "F": "w." + } + ], + "whens": [ + { + "F": "when" + }, + { + "F": "s" + } + ], + "whys": [ + { + "F": "why" + }, + { + "F": "s" + } + ], + "6pm": [ + { + "F": "6" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "4p.m.": [ + { + "F": "4" + }, + { + "F": "p.m." + } + ], + "Whereve": [ + { + "F": "Where" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "o_o": [ + { + "F": "o_o" + } + ], + "Mo.": [ + { + "F": "Mo." + } + ], + "Kan.": [ + { + "F": "Kan." + } + ], + "there'd": [ + { + "F": "there" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "N.H.": [ + { + "F": "N.H." + } + ], + "(^_^)": [ + { + "F": "(^_^)" + } + ], + "Mont.": [ + { + "F": "Mont." + } + ], + "hadn't've": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "whatll": [ + { + "F": "what" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "wouldn't've": [ + { + "F": "would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "there's": [ + { + "F": "there" + }, + { + "F": "'s" + } + ], + "Who'll": [ + { + "F": "Who" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "o_O": [ + { + "F": "o_O" + } + ], + "Nev.": [ + { + "F": "Nev." + } + ], + "youll": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "wouldve": [ + { + "F": "would" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Nov.": [ + { + "F": "Nov." + } + ], + "z.": [ + { + "F": "z." 
+ } + ], + "xDD": [ + { + "F": "xDD" + } + ], + "Sen.": [ + { + "F": "Sen." + } + ], + "Wouldnt": [ + { + "F": "Would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Thered": [ + { + "F": "There" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Youre": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "re" + } + ], + "Couldn't've": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "who're": [ + { + "F": "who" + }, + { + "F": "'re" + } + ], + "Whys": [ + { + "F": "Why" + }, + { + "F": "s" + } + ], + "mightn't've": [ + { + "F": "might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Wholl": [ + { + "F": "Who" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "hadn't": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Havent": [ + { + "pos": "VB", + "F": "Have" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Whatve": [ + { + "F": "What" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "2pm": [ + { + "F": "2" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "o.O": [ + { + "F": "o.O" + } + ], + "Thats": [ + { + "F": "That" + }, + { + "F": "s" + } + ], + "Gov.": [ + { + "F": "Gov." + } + ], + "Howll": [ + { + "F": "How" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "p.": [ + { + "F": "p." + } + ], + "wouldn't": [ + { + "F": "would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "9pm": [ + { + "F": "9" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "You'll": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Ala.": [ + { + "F": "Ala." + } + ], + "12am": [ + { + "F": "12" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "=]": [ + { + "F": "=]" + } + ], + "Cant": [ + { + "F": "Ca", + "L": "can", + "pos": "MD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "i'd": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "a.m.": [ + { + "F": "a.m." + } + ], + "weren't": [ + { + "F": "were" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "would've": [ + { + "F": "would" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "i'm": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "pos": "VBP", + "F": "'m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "why'll": [ + { + "F": "why" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "we'd've": [ + { + "F": "we" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Shouldve": [ + { + "F": "Should" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "can't": [ + { + "F": "ca", + "L": "can", + "pos": "MD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "thats": [ + { + "F": "that" + }, + { + "F": "s" + } + ], + "1p.m.": [ + { + "F": "1" + }, + { + "F": "p.m." + } + ], + "12a.m.": [ + { + "F": "12" + }, + { + "F": "a.m." + } + ], + "Hes": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "s" + } + ], + "Needn't": [ + { + "F": "Need" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "It's": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'s" + } + ], + "St.": [ + { + "F": "St." 
+ } + ], + "Why're": [ + { + "F": "Why" + }, + { + "F": "'re" + } + ], + ":(((": [ + { + "F": ":(((" + } + ], + "Hed": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Mt.": [ + { + "L": "Mount", + "F": "Mt." + } + ], + "couldn't": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "What've": [ + { + "F": "What" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "4a.m.": [ + { + "F": "4" + }, + { + "F": "a.m." + } + ], + "Ind.": [ + { + "F": "Ind." + } + ], + "It'd": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "<3": [ + { + "F": "<3" + } + ], + "theydve": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "aren't": [ + { + "F": "are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Mightn't": [ + { + "F": "Might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "'S": [ + { + "L": "'s", + "F": "'S" + } + ], + "I've": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whered": [ + { + "F": "Where" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Itdve": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "I'ma": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ma" + } + ], + "whos": [ + { + "F": "who" + }, + { + "F": "s" + } + ], + "They'd": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "What'll": [ + { + "F": "What" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + ":Y": [ + { + "F": ":Y" + } + ], + "You've": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mustve": [ + { + "F": "Must" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "whod": [ + { + "F": "who" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "mightntve": [ + { + "F": "might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "I'd've": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Must've": [ + { + "F": "Must" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "it'd": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Ark.": [ + { + "F": "Ark." + } + ], + "Wis.": [ + { + "F": "Wis." + } + ], + "6p.m.": [ + { + "F": "6" + }, + { + "F": "p.m." + } + ], + "what're": [ + { + "F": "what" + }, + { + "F": "'re" + } + ], + "N.C.": [ + { + "F": "N.C." + } + ], + "Wasn't": [ + { + "F": "Was" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "what's": [ + { + "F": "what" + }, + { + "F": "'s" + } + ], + "he'd've": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Jan.": [ + { + "F": "Jan." 
+ } + ], + "She'd": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "shedve": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Tenn.": [ + { + "F": "Tenn." + } + ], + "ain't": [ + { + "F": "ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "She's": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'s" + } + ], + "i'd've": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "2a.m.": [ + { + "F": "2" + }, + { + "F": "a.m." + } + ], + "We'd've": [ + { + "F": "We" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "must've": [ + { + "F": "must" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "That's": [ + { + "F": "That" + }, + { + "F": "'s" + } + ], + "Sept.": [ + { + "F": "Sept." + } + ], + "whatre": [ + { + "F": "what" + }, + { + "F": "re" + } + ], + "you'd've": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Dont": [ + { + "L": "do", + "F": "Do" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "i.": [ + { + "F": "i." + } + ], + "Jun.": [ + { + "F": "Jun." + } + ], + "thered": [ + { + "F": "there" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Youd": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "couldn't've": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whens": [ + { + "F": "When" + }, + { + "F": "s" + } + ], + "8a.m.": [ + { + "F": "8" + }, + { + "F": "a.m." + } + ], + "Isnt": [ + { + "F": "Is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "mightve": [ + { + "F": "might" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "'ol": [ + { + "F": "'ol" + } + ], + "2p.m.": [ + { + "F": "2" + }, + { + "F": "p.m." + } + ], + "9a.m.": [ + { + "F": "9" + }, + { + "F": "a.m." + } + ], + "q.": [ + { + "F": "q." + } + ], + "didnt": [ + { + "F": "did", + "L": "do", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "ive": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "It'd've": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "e.g.": [ + { + "F": "e.g." + } + ], + ":]": [ + { + "F": ":]" + } + ], + "\t": [ + { + "pos": "SP", + "F": "\t" + } + ], + "Mich.": [ + { + "F": "Mich." + } + ], + "Itll": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "didn't": [ + { + "F": "did", + "L": "do", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "3pm": [ + { + "F": "3" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "Jul.": [ + { + "F": "Jul." + } + ], + "7pm": [ + { + "F": "7" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "cant": [ + { + "F": "ca", + "L": "can", + "pos": "MD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Miss.": [ + { + "F": "Miss." 
+ } + ], + "im": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "pos": "VBP", + "F": "m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "Ariz.": [ + { + "F": "Ariz." + } + ], + "they'd've": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "f.": [ + { + "F": "f." + } + ], + "Co.": [ + { + "F": "Co." + } + ], + "Hadntve": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Weve": [ + { + "F": "We" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "1a.m.": [ + { + "F": "1" + }, + { + "F": "a.m." + } + ], + "=3": [ + { + "F": "=3" + } + ], + "Mightnt": [ + { + "F": "Might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "1pm": [ + { + "F": "1" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "youdve": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Shedve": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "theyd": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Ill.": [ + { + "F": "Ill." + } + ], + "N.D.": [ + { + "F": "N.D." + } + ], + "Cannot": [ + { + "F": "Can", + "L": "can", + "pos": "MD" + }, + { + "F": "not", + "L": "not", + "pos": "RB" + } + ], + "s.": [ + { + "F": "s." + } + ], + "Hadn't": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "What're": [ + { + "F": "What" + }, + { + "F": "'re" + } + ], + "He'll": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "wholl": [ + { + "F": "who" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "They're": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'re" + } + ], + "Neb.": [ + { + "F": "Neb." + } + ], + "shouldnt": [ + { + "F": "should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "\n": [ + { + "pos": "SP", + "F": "\n" + } + ], + "whered": [ + { + "F": "where" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "7a.m.": [ + { + "F": "7" + }, + { + "F": "a.m." + } + ], + "youve": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "4am": [ + { + "F": "4" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "v.": [ + { + "F": "v." 
+ } + ], + "notve": [ + { + "F": "not", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldve": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "mustve": [ + { + "F": "must" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Youve": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "might've": [ + { + "F": "might" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mustn't": [ + { + "F": "Must" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "wheres": [ + { + "F": "where" + }, + { + "F": "s" + } + ], + "they're": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'re" + } + ], + "idve": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "hows": [ + { + "F": "how" + }, + { + "F": "s" + } + ], + "Fla.": [ + { + "F": "Fla." + } + ], + "N.M.": [ + { + "F": "N.M." + } + ], + "youre": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "re" + } + ], + "Didn't": [ + { + "F": "Did", + "L": "do", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Couldve": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "10p.m.": [ + { + "F": "10" + }, + { + "F": "p.m." + } + ], + "Del.": [ + { + "F": "Del." + } + ], + "Oct.": [ + { + "F": "Oct." + } + ], + "Rep.": [ + { + "F": "Rep." + } + ], + "cannot": [ + { + "F": "can", + "L": "can", + "pos": "MD" + }, + { + "F": "not", + "L": "not", + "pos": "RB" + } + ], + "Im": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "pos": "VBP", + "F": "m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "howd": [ + { + "F": "how" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Okla.": [ + { + "F": "Okla." + } + ], + "Feb.": [ + { + "F": "Feb." + } + ], + "you've": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "You're": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'re" + } + ], + "she'll": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Theyll": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "don't": [ + { + "L": "do", + "F": "do" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "itd": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + ":-)": [ + { + "F": ":-)" + } + ], + "Hedve": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "isnt": [ + { + "F": "is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "won't": [ + { + "F": "wo" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "We're": [ + { + "F": "We" + }, + { + "F": "'re" + } + ], + "^_^": [ + { + "F": "^_^" + } + ], + "I.e.": [ + { + "F": "I.e." + } + ], + "9p.m.": [ + { + "F": "9" + }, + { + "F": "p.m." 
+ } + ], + "dont": [ + { + "L": "do", + "F": "do" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "ima": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "ma" + } + ], + "he's": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'s" + } + ], + "we've": [ + { + "F": "we" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "What's": [ + { + "F": "What" + }, + { + "F": "'s" + } + ], + "Who's": [ + { + "F": "Who" + }, + { + "F": "'s" + } + ], + "-__-": [ + { + "F": "-__-" + } + ], + "hedve": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "he'd": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "When's": [ + { + "F": "When" + }, + { + "F": "'s" + } + ], + "Mightn't've": [ + { + "F": "Might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "We've": [ + { + "F": "We" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Wash.": [ + { + "F": "Wash." + } + ], + "Couldntve": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Who'd": [ + { + "F": "Who" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + ":-/": [ + { + "F": ":-/" + } + ], + "haven't": [ + { + "pos": "VB", + "F": "have" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Gen.": [ + { + "F": "Gen." + } + ], + "(:": [ + { + "F": "(:" + } + ], + "arent": [ + { + "F": "are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "You'd've": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "c.": [ + { + "F": "c." + } + ], + "(=": [ + { + "F": "(=" + } + ], + "Wouldn't": [ + { + "F": "Would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "who's": [ + { + "F": "who" + }, + { + "F": "'s" + } + ], + "12p.m.": [ + { + "F": "12" + }, + { + "F": "p.m." + } + ], + "5am": [ + { + "F": "5" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Mightve": [ + { + "F": "Might" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ":((": [ + { + "F": ":((" + } + ], + "theredve": [ + { + "F": "there" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Messrs.": [ + { + "F": "Messrs." + } + ], + "who'd": [ + { + "F": "who" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Where's": [ + { + "F": "Where" + }, + { + "F": "'s" + } + ], + "wont": [ + { + "F": "wo" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "she'd've": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "10pm": [ + { + "F": "10" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "Corp.": [ + { + "F": "Corp." + } + ], + "Aug.": [ + { + "F": "Aug." + } + ], + "-_-": [ + { + "F": "-_-" + } + ], + "y.": [ + { + "F": "y." + } + ], + "Should've": [ + { + "F": "Should" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "11pm": [ + { + "F": "11" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "8am": [ + { + "F": "8" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "theyre": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "re" + } + ], + "l.": [ + { + "F": "l." 
+ } + ], + "Wouldntve": [ + { + "F": "Would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Ga.": [ + { + "F": "Ga." + } + ], + "1am": [ + { + "F": "1" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Where've": [ + { + "F": "Where" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "11a.m.": [ + { + "F": "11" + }, + { + "F": "a.m." + } + ], + "mustn't": [ + { + "F": "must" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "isn't": [ + { + "F": "is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Bros.": [ + { + "F": "Bros." + } + ], + "Aint": [ + { + "F": "Ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "why's": [ + { + "F": "why" + }, + { + "F": "'s" + } + ], + "V_V": [ + { + "F": "V_V" + } + ], + ";p": [ + { + "F": ";p" + } + ], + "There'd": [ + { + "F": "There" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "They'll": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "=)": [ + { + "F": "=)" + } + ], + "b.": [ + { + "F": "b." + } + ], + "how'll": [ + { + "F": "how" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Wedve": [ + { + "F": "We" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldntve": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "12pm": [ + { + "F": "12" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "There's": [ + { + "F": "There" + }, + { + "F": "'s" + } + ], + "we'd": [ + { + "F": "we" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Dr.": [ + { + "F": "Dr." + } + ], + "Whod": [ + { + "F": "Who" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + ":-P": [ + { + "F": ":-P" + } + ], + "whatve": [ + { + "F": "what" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Wouldve": [ + { + "F": "Would" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "o.": [ + { + "F": "o." + } + ], + ":')": [ + { + "F": ":')" + } + ], + "needn't": [ + { + "F": "need" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "shouldntve": [ + { + "F": "should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "why're": [ + { + "F": "why" + }, + { + "F": "'re" + } + ], + "p.m.": [ + { + "F": "p.m." + } + ], + "Doesnt": [ + { + "F": "Does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whereve": [ + { + "F": "where" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "they'll": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "I'd": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Might've": [ + { + "F": "Might" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "mightnt": [ + { + "F": "might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Kans.": [ + { + "F": "Kans." + } + ], + "Not've": [ + { + "F": "Not", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "e.": [ + { + "F": "e." 
+ } + ], + "mightn't": [ + { + "F": "might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "you're": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'re" + } + ], + "Mar.": [ + { + "F": "Mar." + } + ], + "They've": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "\")": [ + { + "F": "\")" + } + ], + "what'll": [ + { + "F": "what" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Calif.": [ + { + "F": "Calif." + } + ], + "Could've": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Would've": [ + { + "F": "Would" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ";)": [ + { + "F": ";)" + } + ], + ";(": [ + { + "F": ";(" + } + ], + "Isn't": [ + { + "F": "Is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "let's": [ + { + "F": "let" + }, + { + "F": "'s" + } + ], + "'em": [ + { + "F": "'em" + } + ], + "She'll": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "I.E.": [ + { + "F": "I.E." + } + ], + "You'd": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "wouldnt": [ + { + "F": "would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "6am": [ + { + "F": "6" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "11am": [ + { + "F": "11" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Why'll": [ + { + "F": "Why" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Where'd": [ + { + "F": "Where" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Theyre": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "re" + } + ], + "11p.m.": [ + { + "F": "11" + }, + { + "F": "p.m." + } + ], + "Won't": [ + { + "F": "Wo" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Couldn't": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "it's": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'s" + } + ], + "r.": [ + { + "F": "r." + } + ], + "it'll": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "They'd've": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Ima": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "ma" + } + ], + "5pm": [ + { + "F": "5" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "10am": [ + { + "F": "10" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "m.": [ + { + "F": "m." + } + ], + "whats": [ + { + "F": "what" + }, + { + "F": "s" + } + ], + "How's": [ + { + "F": "How" + }, + { + "F": "'s" + } + ], + "Sep.": [ + { + "F": "Sep." + } + ], + "Shouldntve": [ + { + "F": "Should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "youd": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Whatll": [ + { + "F": "What" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Wouldn't've": [ + { + "F": "Would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "How'd": [ + { + "F": "How" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "doesnt": [ + { + "F": "does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "h.": [ + { + "F": "h." 
+ } + ], + "Shouldn't": [ + { + "F": "Should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "He'd've": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mightntve": [ + { + "F": "Might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldnt": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Haven't": [ + { + "pos": "VB", + "F": "Have" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "<333": [ + { + "F": "<333" + } + ], + "doesn't": [ + { + "F": "does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Hasn't": [ + { + "F": "Has" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "how's": [ + { + "F": "how" + }, + { + "F": "'s" + } + ], + "hes": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "s" + } + ], + "=[[": [ + { + "F": "=[[" + } + ], + "xD": [ + { + "F": "xD" + } + ], + "he'll": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "hed": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "7p.m.": [ + { + "F": "7" + }, + { + "F": "p.m." + } + ], + "how'd": [ + { + "F": "how" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "u.": [ + { + "F": "u." + } + ], + "we're": [ + { + "F": "we" + }, + { + "F": "'re" + } + ], + "vs.": [ + { + "F": "vs." + } + ], + "Hadnt": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Shant": [ + { + "F": "Sha" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Theyve": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Hows": [ + { + "F": "How" + }, + { + "F": "s" + } + ], + "We'll": [ + { + "F": "We" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "N.Y.": [ + { + "F": "N.Y." + } + ], + "x.": [ + { + "F": "x." + } + ], + "8p.m.": [ + { + "F": "8" + }, + { + "F": "p.m." + } + ], + "i've": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whove": [ + { + "F": "Who" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "2am": [ + { + "F": "2" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "La.": [ + { + "F": "La." + } + ], + "i'ma": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ma" + } + ], + "N.J.": [ + { + "F": "N.J." + } + ], + "Nebr.": [ + { + "F": "Nebr." + } + ], + "Howd": [ + { + "F": "How" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "hadnt": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "shant": [ + { + "F": "sha" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "There'd've": [ + { + "F": "There" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Inc.": [ + { + "F": "Inc." + } + ], + "I'll": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Why's": [ + { + "F": "Why" + }, + { + "F": "'s" + } + ], + "Adm.": [ + { + "F": "Adm." + } + ], + "Shouldn't've": [ + { + "F": "Should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "n.": [ + { + "F": "n." 
+ } + ], + "Wasnt": [ + { + "F": "Was" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whove": [ + { + "F": "who" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ";-p": [ + { + "F": ";-p" + } + ], + "hasn't": [ + { + "F": "has" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "wouldntve": [ + { + "F": "would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Wheres": [ + { + "F": "Where" + }, + { + "F": "s" + } + ], + "How'll": [ + { + "F": "How" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "there'd've": [ + { + "F": "there" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whos": [ + { + "F": "Who" + }, + { + "F": "s" + } + ], + "shes": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "s" + } + ], + "Doesn't": [ + { + "F": "Does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Arent": [ + { + "F": "Are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Hasnt": [ + { + "F": "Has" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "j.": [ + { + "F": "j." + } + ], + "He's": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'s" + } + ], + "wasnt": [ + { + "F": "was" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whyll": [ + { + "F": "why" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "co.": [ + { + "F": "co." + } + ], + "mustnt": [ + { + "F": "must" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "He'd": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "3a.m.": [ + { + "F": "3" + }, + { + "F": "a.m." + } + ], + "Shes": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "s" + } + ], + "where've": [ + { + "F": "where" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Youll": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Apr.": [ + { + "F": "Apr." + } + ], + "Conn.": [ + { + "F": "Conn." + } + ], + "8pm": [ + { + "F": "8" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "9am": [ + { + "F": "9" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "hasnt": [ + { + "F": "has" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "theyll": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "it'd've": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "itdve": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Jr.": [ + { + "F": "Jr." + } + ], + "Rev.": [ + { + "F": "Rev." + } + ], + "k.": [ + { + "F": "k." + } + ], + "wedve": [ + { + "F": "we" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Let's": [ + { + "F": "Let" + }, + { + "F": "'s" + } + ], + "Colo.": [ + { + "F": "Colo." + } + ], + "Mr.": [ + { + "F": "Mr." 
+ } + ], + "Werent": [ + { + "F": "Were" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Theredve": [ + { + "F": "There" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "shan't": [ + { + "F": "sha" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + ";-)": [ + { + "F": ";-)" + } + ], + "Wont": [ + { + "F": "Wo" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "hadntve": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "who've": [ + { + "F": "who" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whatre": [ + { + "F": "What" + }, + { + "F": "re" + } + ], + "'s": [ + { + "L": "'s", + "F": "'s" + } + ], + "where'd": [ + { + "F": "where" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "shouldve": [ + { + "F": "should" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "a.": [ + { + "F": "a." + } + ], + "where's": [ + { + "F": "where" + }, + { + "F": "'s" + } + ], + "Ltd.": [ + { + "F": "Ltd." + } + ], + "Mass.": [ + { + "F": "Mass." + } + ], + "neednt": [ + { + "F": "need" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Pa.": [ + { + "F": "Pa." + } + ], + "It'll": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "7am": [ + { + "F": "7" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "We'd": [ + { + "F": "We" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Whats": [ + { + "F": "What" + }, + { + "F": "s" + } + ], + "\u2014": [ + { + "pos": ":", + "L": "--", + "F": "\u2014" + } + ], + "E.g.": [ + { + "F": "E.g." + } + ], + "Ms.": [ + { + "F": "Ms." + } + ], + ":3": [ + { + "F": ":3" + } + ], + "5p.m.": [ + { + "F": "5" + }, + { + "F": "p.m." + } + ], + "Itd": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "May.": [ + { + "F": "May." 
+ } + ], + "she'd": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Mustnt": [ + { + "F": "Must" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Notve": [ + { + "F": "Not", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "you'll": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Theyd": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "she's": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'s" + } + ], + "Couldnt": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "that's": [ + { + "F": "that" + }, + { + "F": "'s" + } + ], + "4pm": [ + { + "F": "4" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + ":))": [ + { + "F": ":))" + } + ] +} \ No newline at end of file From caff4638c986411c2c7b6de0230fd53fa862cad3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:08:12 +0200 Subject: [PATCH 42/62] * Fix website/test_api.py for Python 3 --- tests/website/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/website/test_api.py b/tests/website/test_api.py index ef0365d88..c173c2b74 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -80,7 +80,7 @@ def test_read_bytes(nlp): file_.write(nlp(u'This is a document.').to_bytes()) file_.write(nlp(u'This is another.').to_bytes()) docs = [] - with open(loc) as file_: + with open(loc, 'rb') as file_: for byte_string in Doc.read_bytes(file_): docs.append(Doc(nlp.vocab).from_bytes(byte_string)) assert len(docs) == 2 From 1f90502ce8fec29786ae7c65de2e0e5391bdd931 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:08:31 +0200 Subject: [PATCH 43/62] * Fix website/test_home for Python 3 --- tests/website/test_home.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 6c97b0f31..4da61becf 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -17,7 +17,7 @@ def test_load_resources_and_process_text(): @pytest.mark.models def test_get_tokens_and_sentences(doc): token = doc[0] - sentence = doc.sents.next() + sentence = next(doc.sents) assert token is sentence[0] assert sentence.text == 'Hello, world.' 
From f35632e2e584ca6979dd8efac8868602c056b04c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 11:08:58 +0200 Subject: [PATCH 44/62] * Remove SBD print statement in train, after SBD evaluation was removed from Scorer --- bin/parser/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index f2e153c29..c1f81af33 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -229,7 +229,6 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos print('POS', scorer.tags_acc) print('UAS', scorer.uas) print('LAS', scorer.las) - print('SBD', scorer.sbd_acc) print('NER P', scorer.ents_p) print('NER R', scorer.ents_r) From 5682439d1e8780802d426aa5ac0e7da104fa521c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 20:24:21 +1100 Subject: [PATCH 45/62] * Remove em dash test from test_lemmatizer, as em dashes are now handled in specials.json --- tests/tagger/test_lemmatizer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index df553c3d6..2dec62c4a 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -41,8 +41,3 @@ def test_smart_quotes(lemmatizer): do = lemmatizer.punct assert do('“') == set(['``']) assert do('“') == set(['``']) - - -def test_smart_quotes(lemmatizer): - do = lemmatizer.punct - assert do('–') == set(["--"]) From 599f739ddb43a73c1466144734ea706bdf1bf6bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 20:51:28 +1100 Subject: [PATCH 46/62] * Fix smart quote lemma test --- tests/tagger/test_lemmatizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index 2dec62c4a..ff10b6573 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -39,5 +39,5 @@ def test_noun_lemmas(lemmatizer): def test_smart_quotes(lemmatizer): do = lemmatizer.punct - assert do('“') == set(['``']) - assert do('“') == set(['``']) + assert do('“') == set(['"']) + assert do('“') == set(['"']) From 3bf50ab83036eec076e4df07ff56a3d7730f81c4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 20:57:47 +1100 Subject: [PATCH 47/62] * Ensure the fabfile prebuild command installs pytest --- fabfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabfile.py b/fabfile.py index 953c02e00..b7ef6f18f 100644 --- a/fabfile.py +++ b/fabfile.py @@ -47,7 +47,7 @@ def prebuild(build_dir='/tmp/build_spacy'): local('git clone %s .' % spacy_dir) local('virtualenv ' + build_venv) with prefix('cd %s && PYTHONPATH=`pwd` && . 
%s/bin/activate' % (build_dir, build_venv)): - local('pip install cython fabric fabtools') + local('pip install cython fabric fabtools pytest') local('pip install -r requirements.txt') local('fab clean make') local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir) From af8d0a2a0901bb8faf77f5a74209fa102f261995 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 12:42:41 +0200 Subject: [PATCH 48/62] * Increment version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0c05d890b..e386925b6 100644 --- a/setup.py +++ b/setup.py @@ -134,7 +134,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.93' +VERSION = '0.94' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From a3dfe2b90128fcbb549400c390f27ca01fede09b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 13:26:17 +0200 Subject: [PATCH 49/62] * Increment data version --- spacy/en/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 20e7b5b95..01c87a4e4 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -7,7 +7,7 @@ import wget import plac # TODO: Read this from the same source as the setup -VERSION = '0.9.0' +VERSION = '0.9.1' AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com' From 876fc99c44f674c61ec4a43e4cd5173a6ea2e3d3 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 16:11:56 +0200 Subject: [PATCH 50/62] cleanup looks like this file was accidentally added --- website/src/jade/home/_installation.jade~ | 83 ----------------------- 1 file changed, 83 deletions(-) delete mode 100644 website/src/jade/home/_installation.jade~ diff --git a/website/src/jade/home/_installation.jade~ b/website/src/jade/home/_installation.jade~ deleted file mode 100644 index 9b6b4fa3f..000000000 --- a/website/src/jade/home/_installation.jade~ +++ /dev/null @@ -1,83 +0,0 @@ -mixin Option(name, open) - details(open=open) - summary - h4= name - block - -article.post - header - h2 #[a(href=Meta.url) - - p What's new in v0.90? - - .subhead by #[a(href="//twitter.com/spacy_io", rel="author" target="_blank") #{spaCy}] on #[time #{getDate(Meta.date).fulldate}] - - ul - li Support for gazetteers - li Set Lexeme attributes - #[a.readmore(href=Meta.url) Full Change Log ►] - - -section.intro - p What's - -+Option("conda", true) - pre.language-bash: code - | $ conda install spacy - | $ python -m spacy.en.download - -+Option("pip and virtualenv", true) - p With Python 2.7 or Python 3, using Linux or OSX, run: - - pre.language-bash: code - | $ pip install spacy - | $ python -m spacy.en.download - - p - | The download command fetches and installs about 300mb of data, for - | the parser model and word vectors, which it installs within the spacy.en - | package directory. - - - +Option("Workaround for obsolete system Python", false) - p - | If you're stuck using a server with an old version of Python, and you - | don't have root access, I've prepared a bootstrap script to help you - | compile a local Python install. Run: - - pre.language-bash: code - | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate - - - -+Option("Compile from source", false) - p - | The other way to install the package is to clone the github repository, - | and build it from source. This installs an additional dependency, - | Cython. 
If you're using Python 2, I also recommend installing fabric - | and fabtools – this is how I build the project. - - pre.language-bash: code - | $ git clone https://github.com/honnibal/spaCy.git - | $ cd spaCy - | $ virtualenv .env && source .env/bin/activate - | $ export PYTHONPATH=`pwd` - | $ pip install -r requirements.txt - | $ python setup.py build_ext --inplace - | $ python -m spacy.en.download - | $ pip install pytest - | $ py.test tests/ - - p - | Python packaging is awkward at the best of times, and it's particularly tricky - | with C extensions, built via Cython, requiring large data files. So, - | please report issues as you encounter them. - -+Option("pypy (Unsupported)") - | If PyPy support is a priority for you, please get in touch. We could likely - | fix the remaining issues, if necessary. However, the library is likely to - | be much slower on PyPy, as it's written in Cython, which produces code tuned - | for the performance of CPython. - -+Option("Windows (Unsupported)") - | Unfortunately we don't currently support Windows. From 88b2f7ea5d51a57d3644ce2508b9d9c26913aead Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 16:30:23 +0200 Subject: [PATCH 51/62] push version and add spacy channel --- website/src/jade/home/_installation.jade | 5 +++++ website/src/jade/home/index.jade | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/website/src/jade/home/_installation.jade b/website/src/jade/home/_installation.jade index 7a9a14bd5..c0e0b1445 100644 --- a/website/src/jade/home/_installation.jade +++ b/website/src/jade/home/_installation.jade @@ -20,6 +20,11 @@ mixin Option(name, open) | $ conda install spacy | $ python -m spacy.en.download all + p Latest stable conda packages are available from the spacy channel: + + pre.language-bash: code + | $ conda install -c https://conda.anaconda.org/spacy spacy + +Option("pip and virtualenv", true) p With Python 2.7 or Python 3, using Linux or OSX, ensure that you have the following packages installed: diff --git a/website/src/jade/home/index.jade b/website/src/jade/home/index.jade index f95f4fd53..89635b180 100644 --- a/website/src/jade/home/index.jade +++ b/website/src/jade/home/index.jade @@ -29,7 +29,7 @@ include ../header.jade li: a.button(href="#example-use") Examples li: a.button(href="#install") | Install - v0.93 + v0.94 article.page.landing-page +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade") From 7a47c0c872f3886c08d3abcb8dd92ef654019817 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 9 Oct 2015 16:37:57 +0200 Subject: [PATCH 52/62] push version --- website/src/jade/home/index.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/jade/home/index.jade b/website/src/jade/home/index.jade index 89635b180..a77dd323c 100644 --- a/website/src/jade/home/index.jade +++ b/website/src/jade/home/index.jade @@ -35,4 +35,4 @@ include ../header.jade +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade") +Section("Online Demo", "online-demo", "./_online_demo.jade") +Section("Usage by Example", "example-use", "./_usage_examples.jade") - +Section("Install v0.93", "install", "./_installation.jade") + +Section("Install v0.94", "install", "./_installation.jade") From 7e7f28e1fd57ea3c0877d14cbf8f11b1dc397296 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 14:06:09 +1100 Subject: [PATCH 53/62] * Add smart-quote possessive marker in generate_specials --- lang_data/en/generate_specials.py | 2 ++ 1 file changed, 
2 insertions(+) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index db3827593..9ebd94a52 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -115,6 +115,8 @@ hardcoded_specials = { "'s": [{"F": "'s", "L": "'s"}], "'S": [{"F": "'S", "L": "'s"}], + u"\u2018s": [{"F": u"\u2018s", "L": "'s"}], + u"\u2018S": [{"F": u"\u2018S", "L": "'s"}], "'em": [{"F": "'em"}], From 57b3cd466163d747efde130e18f343f3c7314597 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 14:06:46 +1100 Subject: [PATCH 54/62] * Add smart-quotes to lemma rules --- lang_data/en/lemma_rules.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 1d7366f92..30a19be50 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -31,6 +31,8 @@ "punct": [ ["“", "\""], - ["”", "\""] + ["”", "\""], + ['\u2018', "'"], + ['\u2019', "'"] ] } From 30de4135c98b092c953fd2c16a6e47fce2d17fc6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 14:22:32 +1100 Subject: [PATCH 55/62] * Fix merge problem --- spacy/strings.pyx | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index a247fa6a8..4b47f5a82 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,9 +1,5 @@ -<<<<<<< HEAD from __future__ import unicode_literals -import codecs -======= import io ->>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 @@ -133,25 +129,15 @@ cdef class StringStore: def dump(self, loc): cdef Utf8Str* string -<<<<<<< HEAD cdef unicode py_string cdef int i - with codecs.open(loc, 'w', 'utf8') as file_: + with io.open(loc, 'w', 'utf8') as file_: for i in range(1, self.size): string = &self.c[i] py_string = _decode(string) file_.write(py_string) if (i+1) != self.size: file_.write(SEPARATOR) -======= - cdef bytes py_string - for i in range(self.size): - string = &self.strings[i] - py_string = string.chars[:string.length] - strings.append(py_string.decode('utf8')) - with io.open(loc, 'w', encoding='utf8') as file_: - file_.write(SEPARATOR.join(strings)) ->>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd def load(self, loc): with io.open(loc, 'r', encoding='utf8') as file_: From 2153067958de3062abcae7f0b41dd54ec89a09f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:03:12 +1100 Subject: [PATCH 56/62] * Fix use of io in strings.pyx --- spacy/strings.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 4b47f5a82..29a8a47a8 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -131,7 +131,7 @@ cdef class StringStore: cdef Utf8Str* string cdef unicode py_string cdef int i - with io.open(loc, 'w', 'utf8') as file_: + with io.open(loc, 'w', encoding='utf8') as file_: for i in range(1, self.size): string = &self.c[i] py_string = _decode(string) From c12d36d5f4694996b01b959bc16092f9848bde92 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:03:36 +1100 Subject: [PATCH 57/62] * Fix quote marks in lemma_rules --- lang_data/en/lemma_rules.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index 30a19be50..1e76436cd 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -32,7 +32,7 @@ "punct": [ ["“", 
"\""], ["”", "\""], - ['\u2018', "'"], - ['\u2019', "'"] + ["\u2018", "'"], + ["\u2019", "'"] ] } From 1521cf25c95273b09a602101c6d2d31496c64d2d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:04:01 +1100 Subject: [PATCH 58/62] * Fix merge problem in test_parse_navigate --- tests/parser/test_parse_navigate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/parser/test_parse_navigate.py b/tests/parser/test_parse_navigate.py index 8c76199f4..1771dbeba 100644 --- a/tests/parser/test_parse_navigate.py +++ b/tests/parser/test_parse_navigate.py @@ -7,11 +7,7 @@ import pytest @pytest.fixture def sun_text(): -<<<<<<< HEAD:tests/parser/test_parse_navigate.py - with codecs.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', 'utf8') as file_: -======= with io.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', encoding='utf8') as file_: ->>>>>>> 8caedba42a5255b9996533a732e17eee3f20a2dd:tests/test_parse_navigate.py text = file_.read() return text From 8b39feefbed39ef66aae08bc6cf1ecd6d402dd2e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:32:13 +1100 Subject: [PATCH 59/62] * Add dependency post-process rule to ensure spaces are attached to neighbouring tokens, so that they can't be sentence boundaries --- spacy/syntax/arc_eager.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 265018920..07595d4ab 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,7 +9,8 @@ from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse from ..gold cimport GoldParseC -from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE +from ..lexeme cimport Lexeme from libc.stdint cimport uint32_t from libc.string cimport memcpy @@ -380,7 +381,10 @@ cdef class ArcEager(TransitionSystem): cdef int finalize_state(self, StateClass st) nogil: for i in range(st.length): - if st._sent[i].head == 0 and st._sent[i].dep == 0: + # Always attach spaces to the previous word + if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): + st._sent[i].head = -1 if (i >= 1) else 1 + elif st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = self.root_label # If we're not using the Break transition, we segment via root-labelled # arcs between the root words. From 9dd2f25c7438c81f7122f9de28f4d35e1e6b0911 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:53:30 +1100 Subject: [PATCH 60/62] * Fix Issue #131: Force whitespace characters to attach syntactically to previous token, and ensure they cannot serve as stand-alone 'sentence' units. 
--- spacy/syntax/arc_eager.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 07595d4ab..561308928 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -380,10 +380,17 @@ cdef class ArcEager(TransitionSystem): st.fast_forward() cdef int finalize_state(self, StateClass st) nogil: + cdef int i for i in range(st.length): # Always attach spaces to the previous word if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): st._sent[i].head = -1 if (i >= 1) else 1 + if st._sent[i].sent_start and st._sent[i].head == -1: + st._sent[i].sent_start = False + # If we had this space token as the start of a sentence, + # move that sentence start forward one + if (i + 1) < st.length and not st._sent[i+1].sent_start: + st._sent[i+1].sent_start = True elif st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep = self.root_label # If we're not using the Break transition, we segment via root-labelled From bdcb8d695c7d012726501f87da9a38faca269024 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:54:06 +1100 Subject: [PATCH 61/62] * Add non-breaking space to specials.json --- lang_data/en/generate_specials.py | 3 +- lang_data/en/specials.json | 67 ++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 9ebd94a52..7c642c7c4 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -341,7 +341,8 @@ hardcoded_specials = { "E.G.": [{"F": "E.G."}], "\n": [{"F": "\n", "pos": "SP"}], "\t": [{"F": "\t", "pos": "SP"}], - " ": [{"F": " ", "pos": "SP"}] + " ": [{"F": " ", "pos": "SP"}], + u"\xa0": [{"F": u"\xa0", "pos": "SP", "L": " "}] } diff --git a/lang_data/en/specials.json b/lang_data/en/specials.json index 20d90e261..4cb44bb74 100644 --- a/lang_data/en/specials.json +++ b/lang_data/en/specials.json @@ -605,9 +605,13 @@ "pos": "VB" } ], - ":P": [ + "11am": [ { - "F": ":P" + "F": "11" + }, + { + "L": "a.m.", + "F": "am" } ], "Shan't": [ @@ -710,6 +714,13 @@ "F": "Kan." } ], + "\u00a0": [ + { + "pos": "SP", + "L": " ", + "F": "\u00a0" + } + ], "there'd": [ { "F": "there" @@ -1624,6 +1635,11 @@ "pos": "RB" } ], + "Wash.": [ + { + "F": "Wash." + } + ], "She's": [ { "L": "-PRON-", @@ -1885,11 +1901,6 @@ "F": "e.g." } ], - ":]": [ - { - "F": ":]" - } - ], "\t": [ { "pos": "SP", @@ -2581,14 +2592,23 @@ "F": "'re" } ], + "3a.m.": [ + { + "F": "3" + }, + { + "F": "a.m." + } + ], "^_^": [ { "F": "^_^" } ], - "I.e.": [ + "\u2018S": [ { - "F": "I.e." + "L": "'s", + "F": "\u2018S" } ], "9p.m.": [ @@ -2719,9 +2739,10 @@ "pos": "VB" } ], - "Wash.": [ + "\u2018s": [ { - "F": "Wash." + "L": "'s", + "F": "\u2018s" } ], "Couldntve": [ @@ -3249,9 +3270,9 @@ "F": "o." } ], - ":')": [ + ":]": [ { - "F": ":')" + "F": ":]" } ], "needn't": [ @@ -3535,13 +3556,9 @@ "F": "am" } ], - "11am": [ + ":P": [ { - "F": "11" - }, - { - "L": "a.m.", - "F": "am" + "F": ":P" } ], "Why'll": [ @@ -4363,12 +4380,9 @@ "pos": "MD" } ], - "3a.m.": [ + "I.e.": [ { - "F": "3" - }, - { - "F": "a.m." + "F": "I.e." } ], "Shes": [ @@ -4406,6 +4420,11 @@ "F": "Apr." } ], + ":')": [ + { + "F": ":')" + } + ], "Conn.": [ { "F": "Conn." From dfbcff2ff1b992c34ebbead140060adb3839d1d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Oct 2015 15:54:55 +1100 Subject: [PATCH 62/62] * Revert codecs/io change to strings.pyx, as it seemed to cause an error? Will investigate. 
--- spacy/strings.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 29a8a47a8..a4a470158 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,5 +1,5 @@ from __future__ import unicode_literals -import io +import codecs from libc.string cimport memcpy from murmurhash.mrmr cimport hash64 @@ -131,7 +131,7 @@ cdef class StringStore: cdef Utf8Str* string cdef unicode py_string cdef int i - with io.open(loc, 'w', encoding='utf8') as file_: + with codecs.open(loc, 'w', 'utf8') as file_: for i in range(1, self.size): string = &self.c[i] py_string = _decode(string) @@ -140,7 +140,7 @@ cdef class StringStore: file_.write(SEPARATOR) def load(self, loc): - with io.open(loc, 'r', encoding='utf8') as file_: + with codecs.open(loc, 'r', 'utf8') as file_: strings = file_.read().split(SEPARATOR) if strings == ['']: return None
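
Editor's note on patches 59 and 60: the rule added to ArcEager.finalize_state amounts to "whitespace tokens always attach to the previous word and can never start a sentence". The following is a rough pure-Python sketch of that post-processing step, not the actual Cython implementation; the Token objects, is_space flag, relative head offsets and sent_start flag are simplified stand-ins for the parser's TokenC array.

from types import SimpleNamespace

def attach_spaces(tokens):
    # Mirror of the whitespace rule from patches 59-60: 'tokens' is a list of
    # simple objects with is_space, head (a relative offset, -1 = previous
    # token) and sent_start attributes.
    for i, tok in enumerate(tokens):
        if not tok.is_space:
            continue
        # Attach the space to the previous word, or to the next word if the
        # document happens to begin with whitespace.
        tok.head = -1 if i >= 1 else 1
        if tok.sent_start and tok.head == -1:
            # A space may not open a sentence: clear the flag and push the
            # sentence boundary onto the following token instead.
            tok.sent_start = False
            if i + 1 < len(tokens) and not tokens[i + 1].sent_start:
                tokens[i + 1].sent_start = True

# Tiny usage example: a space token that was marked as a sentence start gets
# reattached to the previous word, and the boundary moves to the next token.
toks = [SimpleNamespace(is_space=False, head=0, sent_start=True),
        SimpleNamespace(is_space=True, head=0, sent_start=True),
        SimpleNamespace(is_space=False, head=0, sent_start=False)]
attach_spaces(toks)
assert toks[1].head == -1 and not toks[1].sent_start and toks[2].sent_start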