diff --git a/.gitignore b/.gitignore index 70eddf717..bc24f41b7 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,4 @@ coverage.xml # Sphinx documentation docs/_build/ docs/_themes/ +setup.py diff --git a/.travis.yml b/.travis.yml index 1ea1f8375..f21301db1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ install: # run tests script: - - "py.test tests/ website/tests/ -x" + - "py.test tests/ -x" diff --git a/bin/init_model.py b/bin/init_model.py index 599cd3083..72d7a3aae 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -27,8 +27,8 @@ from pathlib import Path from shutil import copyfile from shutil import copytree -import codecs from collections import defaultdict +import io from spacy.vocab import Vocab from spacy.vocab import write_binary_vectors @@ -61,7 +61,7 @@ def _read_clusters(loc): print("Warning: Clusters file not found") return {} clusters = {} - for line in codecs.open(str(loc), 'r', 'utf8'): + for line in io.open(str(loc), 'r', encoding='utf8'): try: cluster, word, freq = line.split() except ValueError: @@ -88,7 +88,7 @@ def _read_probs(loc): print("Probabilities file not found. 
Trying freqs.") return {}, 0.0 probs = {} - for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')): + for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')): prob, word = line.split() prob = float(prob) probs[word] = prob diff --git a/bin/ner_tag.py b/bin/ner_tag.py index 34588bd12..f990f21a1 100644 --- a/bin/ner_tag.py +++ b/bin/ner_tag.py @@ -1,11 +1,11 @@ -import codecs +import io import plac from spacy.en import English def main(text_loc): - with codecs.open(text_loc, 'r', 'utf8') as file_: + with io.open(text_loc, 'r', encoding='utf8') as file_: text = file_.read() NLU = English() for paragraph in text.split('\n\n'): diff --git a/bin/parser/train.py b/bin/parser/train.py index f2e153c29..0a9d34ffc 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -6,7 +6,7 @@ from __future__ import print_function import os from os import path import shutil -import codecs +import io import random import plac @@ -177,7 +177,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc): nlp = Language(data_dir=model_dir) gold_tuples = read_json_file(dev_loc) scorer = Scorer() - out_file = codecs.open(out_loc, 'w', 'utf8') + out_file = io.open(out_loc, 'w', 'utf8') for raw_text, sents in gold_tuples: sents = _merge_sents(sents) for annot_tuples, brackets in sents: @@ -229,7 +229,6 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos print('POS', scorer.tags_acc) print('UAS', scorer.uas) print('LAS', scorer.las) - print('SBD', scorer.sbd_acc) print('NER P', scorer.ents_p) print('NER R', scorer.ents_r) diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py index d13ef7130..f9f4eec21 100644 --- a/bin/prepare_treebank.py +++ b/bin/prepare_treebank.py @@ -27,7 +27,7 @@ import json from os import path import os import re -import codecs +import io from collections import defaultdict from spacy.munge import read_ptb @@ -122,7 +122,7 @@ def read_file(*pieces): if not path.exists(loc): return None else: - return 
codecs.open(loc, 'r', 'utf8').read().strip() + return io.open(loc, 'r', encoding='utf8').read().strip() def get_file_names(section_dir, subsection): diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py index 06cc313a9..73f48bf42 100644 --- a/examples/multi_word_matches.py +++ b/examples/multi_word_matches.py @@ -22,74 +22,77 @@ our pattern set stays very small (exact size depends on the maximum length we're looking for, as the query language currently has no quantifiers) """ from __future__ import print_function, unicode_literals, division +from ast import literal_eval +from bz2 import BZ2File +import time +import math +import codecs import plac from preshed.maps import PreshMap +from preshed.counter import PreshCounter from spacy.strings import hash_string from spacy.en import English -from spacy.matcher import Matcher - -from spacy.attrs import FLAG63 as U_ENT -from spacy.attrs import FLAG62 as L_ENT -from spacy.attrs import FLAG61 as I_ENT -from spacy.attrs import FLAG60 as B_ENT +from spacy.matcher import PhraseMatcher -def get_bilou(length): - if length == 1: - return [U_ENT] - else: - return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT] +def read_gazetteer(tokenizer, loc, n=-1): + for i, line in enumerate(open(loc)): + phrase = literal_eval('u' + line.strip()) + if ' (' in phrase and phrase.endswith(')'): + phrase = phrase.split(' (', 1)[0] + if i >= n: + break + phrase = tokenizer(phrase) + if all((t.is_lower and t.prob >= -10) for t in phrase): + continue + if len(phrase) >= 2: + yield phrase -def make_matcher(vocab, max_length): - abstract_patterns = [] - for length in range(1, max_length+1): - abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) - return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)}) +def read_text(bz2_loc): + with BZ2File(bz2_loc) as file_: + for line in file_: + yield line.decode('utf8') -def get_matches(matcher, pattern_ids, doc): - matches = [] - for label, start, end in 
matcher(doc): - candidate = doc[start : end] - if pattern_ids[hash_string(candidate.text)] == True: - start = candidate[0].idx - end = candidate[-1].idx + len(candidate[-1]) - matches.append((start, end, candidate.root.tag_, candidate.text)) - return matches +def get_matches(tokenizer, phrases, texts, max_length=6): + matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length) + print("Match") + for text in texts: + doc = tokenizer(text) + matches = matcher(doc) + for mwe in doc.ents: + yield mwe -def merge_matches(doc, matches): - for start, end, tag, text in matches: - doc.merge(start, end, tag, text, 'MWE') - - -def main(): +def main(patterns_loc, text_loc, counts_loc, n=10000000): nlp = English(parser=False, tagger=False, entity=False) - - gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones'] - example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.' - pattern_ids = PreshMap() - max_length = 0 - for pattern_str in gazetteer: - pattern = nlp.tokenizer(pattern_str) - bilou_tags = get_bilou(len(pattern)) - for word, tag in zip(pattern, bilou_tags): - lexeme = nlp.vocab[word.orth] - lexeme.set_flag(tag, True) - pattern_ids[hash_string(pattern.text)] = True - max_length = max(max_length, len(pattern)) - - matcher = make_matcher(nlp.vocab, max_length) - - doc = nlp(example_text) - matches = get_matches(matcher, pattern_ids, doc) - merge_matches(doc, matches) - for token in doc: - print(token.text, token.ent_type_) + print("Make matcher") + phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n) + counts = PreshCounter() + t1 = time.time() + for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)): + counts.inc(hash_string(mwe.text), 1) + t2 = time.time() + print("10m tokens in %d s" % (t2 - t1)) + + with codecs.open(counts_loc, 'w', 'utf8') as file_: + for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n): + text = phrase.string + key = hash_string(text) + count = counts[key] + if 
count != 0: + file_.write('%d\t%s\n' % (count, text)) if __name__ == '__main__': - plac.call(main) + if False: + import cProfile + import pstats + cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") + s = pstats.Stats("Profile.prof") + s.strip_dirs().sort_stats("time").print_stats() + else: + plac.call(main) diff --git a/fabfile.py b/fabfile.py index 953c02e00..b7ef6f18f 100644 --- a/fabfile.py +++ b/fabfile.py @@ -47,7 +47,7 @@ def prebuild(build_dir='/tmp/build_spacy'): local('git clone %s .' % spacy_dir) local('virtualenv ' + build_venv) with prefix('cd %s && PYTHONPATH=`pwd` && . %s/bin/activate' % (build_dir, build_venv)): - local('pip install cython fabric fabtools') + local('pip install cython fabric fabtools pytest') local('pip install -r requirements.txt') local('fab clean make') local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir) diff --git a/lang_data/en/generate_specials.py b/lang_data/en/generate_specials.py index 1a8f1ae0b..7c642c7c4 100644 --- a/lang_data/en/generate_specials.py +++ b/lang_data/en/generate_specials.py @@ -1,3 +1,4 @@ +# -#- coding: utf-8 -*- import json contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"} @@ -114,6 +115,8 @@ hardcoded_specials = { "'s": [{"F": "'s", "L": "'s"}], "'S": [{"F": "'S", "L": "'s"}], + u"\u2018s": [{"F": u"\u2018s", "L": "'s"}], + u"\u2018S": [{"F": u"\u2018S", "L": "'s"}], "'em": [{"F": "'em"}], @@ -133,6 +136,8 @@ hardcoded_specials = { "''": [{"F": "''"}], + "—": [{"F": "—", "L": "--", "pos": ":"}], + "Corp.": [{"F": "Corp."}], "Inc.": [{"F": "Inc."}], "Co.": [{"F": "Co."}], @@ -336,7 +341,8 @@ hardcoded_specials = { "E.G.": [{"F": "E.G."}], "\n": [{"F": "\n", "pos": "SP"}], "\t": [{"F": "\t", "pos": "SP"}], - " ": [{"F": " ", "pos": "SP"}] + " ": [{"F": " ", "pos": "SP"}], + u"\xa0": [{"F": u"\xa0", "pos": "SP", "L": " "}] } @@ -412,6 +418,6 @@ def generate_specials(): if __name__ == "__main__": specials = generate_specials() - with 
open("specials.json", "w") as f: - json.dump(specials, f) + with open("specials.json", "w") as file_: + file_.write(json.dumps(specials, indent=2)) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index c45eb1df6..1e76436cd 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -27,5 +27,12 @@ ["est", ""], ["er", "e"], ["est", "e"] + ], + + "punct": [ + ["“", "\""], + ["”", "\""], + ["\u2018", "'"], + ["\u2019", "'"] ] } diff --git a/lang_data/en/specials.json b/lang_data/en/specials.json index 93672dc10..4cb44bb74 100644 --- a/lang_data/en/specials.json +++ b/lang_data/en/specials.json @@ -1 +1,4882 @@ -{"i've": [{"L": "-PRON-", "F": "i"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Where's": [{"F": "Where"}, {"F": "'s"}], "4p.m.": [{"F": "4"}, {"F": "p.m."}], "12am": [{"F": "12"}, {"L": "a.m.", "F": "am"}], "j.": [{"F": "j."}], "8pm": [{"F": "8"}, {"L": "p.m.", "F": "pm"}], "E.G.": [{"F": "E.G."}], "must've": [{"F": "must"}, {"L": "have", "pos": "VB", "F": "'ve"}], "D.C.": [{"F": "D.C."}], "She'd've": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "mightnt": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "nt"}], "Hes": [{"L": "-PRON-", "F": "He"}, {"F": "s"}], "7a.m.": [{"F": "7"}, {"F": "a.m."}], "Idve": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ill.": [{"F": "Ill."}], ":P": [{"F": ":P"}], "\t": [{"pos": "SP", "F": "\t"}], "10a.m.": [{"F": "10"}, {"F": "a.m."}], "would've": [{"F": "would"}, {"L": "have", "pos": "VB", "F": "'ve"}], "11am": [{"F": "11"}, {"L": "a.m.", "F": "am"}], "you'd": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "'d"}], "Thered": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "d"}], "havent": [{"pos": "VB", "F": "have"}, {"L": "not", "pos": "RB", "F": "nt"}], "im": [{"L": "-PRON-", "F": "i"}, {"L": "be", "F": "m", "pos": "VBP", "tenspect": 1, 
"number": 1}], "Whatll": [{"F": "What"}, {"L": "will", "pos": "MD", "F": "ll"}], "there'd": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "'d"}], "Mustn't": [{"F": "Must"}, {"L": "not", "pos": "RB", "F": "n't"}], "haven't": [{"pos": "VB", "F": "have"}, {"L": "not", "pos": "RB", "F": "n't"}], "hows": [{"F": "how"}, {"F": "s"}], "Doesn't": [{"L": "do", "pos": "VBZ", "F": "Does"}, {"L": "not", "pos": "RB", "F": "n't"}], "You're": [{"L": "-PRON-", "F": "You"}, {"F": "'re"}], "he's": [{"L": "-PRON-", "F": "he"}, {"F": "'s"}], "Mo.": [{"F": "Mo."}], "Theydve": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "We're": [{"F": "We"}, {"F": "'re"}], "can't": [{"L": "can", "pos": "MD", "F": "ca"}, {"L": "not", "pos": "RB", "F": "n't"}], "they've": [{"L": "-PRON-", "F": "they"}, {"L": "have", "pos": "VB", "F": "'ve"}], "werent": [{"F": "were"}, {"L": "not", "pos": "RB", "F": "nt"}], "i'm": [{"L": "-PRON-", "F": "i"}, {"L": "be", "F": "'m", "pos": "VBP", "tenspect": 1, "number": 1}], "Wouldve": [{"F": "Would"}, {"L": "have", "pos": "VB", "F": "ve"}], "Inc.": [{"F": "Inc."}], "Isnt": [{"L": "be", "pos": "VBZ", "F": "Is"}, {"L": "not", "pos": "RB", "F": "nt"}], "mightn't": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "n't"}], "itd": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "d"}], "^_^": [{"F": "^_^"}], "4pm": [{"F": "4"}, {"L": "p.m.", "F": "pm"}], "theyd": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "d"}], "p.": [{"F": "p."}], "Hasnt": [{"F": "Has"}, {"L": "not", "pos": "RB", "F": "nt"}], "how'd": [{"F": "how"}, {"L": "would", "pos": "MD", "F": "'d"}], "you'll": [{"L": "-PRON-", "F": "you"}, {"L": "will", "pos": "MD", "F": "'ll"}], "how's": [{"F": "how"}, {"F": "'s"}], "e.g.": [{"F": "e.g."}], "didn't": [{"L": "do", "pos": "VBD", "F": "did"}, {"L": "not", "pos": "RB", "F": "n't"}], "6pm": [{"F": "6"}, {"L": "p.m.", "F": "pm"}], "z.": [{"F": "z."}], "Howll": [{"F": "How"}, 
{"L": "will", "pos": "MD", "F": "ll"}], "Shant": [{"F": "Sha"}, {"L": "not", "pos": "RB", "F": "nt"}], "Theyd": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "d"}], "f.": [{"F": "f."}], "u.": [{"F": "u."}], "she'd": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "'d"}], "Fla.": [{"F": "Fla."}], "Rep.": [{"F": "Rep."}], "they're": [{"L": "-PRON-", "F": "they"}, {"F": "'re"}], "you'd've": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mightve": [{"F": "Might"}, {"L": "have", "pos": "VB", "F": "ve"}], "Why'll": [{"F": "Why"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Should've": [{"F": "Should"}, {"L": "have", "pos": "VB", "F": "'ve"}], "that's": [{"F": "that"}, {"F": "'s"}], "9pm": [{"F": "9"}, {"L": "p.m.", "F": "pm"}], "Mass.": [{"F": "Mass."}], "there's": [{"F": "there"}, {"F": "'s"}], "It'd": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "'d"}], "hasn't": [{"F": "has"}, {"L": "not", "pos": "RB", "F": "n't"}], "shes": [{"L": "-PRON-", "F": "she"}, {"F": "s"}], "she'd've": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "o.O": [{"F": "o.O"}], "whered": [{"F": "where"}, {"L": "would", "pos": "MD", "F": "d"}], ":(((": [{"F": ":((("}], "N.C.": [{"F": "N.C."}], "you're": [{"L": "-PRON-", "F": "you"}, {"F": "'re"}], ":0": [{"F": ":0"}], "Wouldn't": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "n't"}], "'em": [{"F": "'em"}], "Whatve": [{"F": "What"}, {"L": "have", "pos": "VB", "F": "ve"}], "Corp.": [{"F": "Corp."}], "i'ma": [{"L": "-PRON-", "F": "i"}, {"F": "'ma"}], "''": [{"F": "''"}], "v.": [{"F": "v."}], "Ga.": [{"F": "Ga."}], "1am": [{"F": "1"}, {"L": "a.m.", "F": "am"}], "Wasnt": [{"F": "Was"}, {"L": "not", "pos": "RB", "F": "nt"}], "q.": [{"F": "q."}], "Hows": [{"F": "How"}, {"F": "s"}], "why're": [{"F": "why"}, {"F": "'re"}], ";-p": [{"F": ";-p"}], "Ima": [{"L": "-PRON-", "F": "I"}, 
{"F": "ma"}], "neednt": [{"F": "need"}, {"L": "not", "pos": "RB", "F": "nt"}], "Ariz.": [{"F": "Ariz."}], "8am": [{"F": "8"}, {"L": "a.m.", "F": "am"}], "Aren't": [{"L": "be", "pos": "VBP", "F": "Are", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "4am": [{"F": "4"}, {"L": "a.m.", "F": "am"}], "she'll": [{"L": "-PRON-", "F": "she"}, {"L": "will", "pos": "MD", "F": "'ll"}], "8p.m.": [{"F": "8"}, {"F": "p.m."}], "9p.m.": [{"F": "9"}, {"F": "p.m."}], "11p.m.": [{"F": "11"}, {"F": "p.m."}], "Who'd": [{"F": "Who"}, {"L": "would", "pos": "MD", "F": "'d"}], "St.": [{"F": "St."}], "It's": [{"L": "-PRON-", "F": "It"}, {"F": "'s"}], "Gen.": [{"F": "Gen."}], "Messrs.": [{"F": "Messrs."}], "Calif.": [{"F": "Calif."}], "youdve": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "i'll": [{"L": "-PRON-", "F": "i"}, {"L": "will", "pos": "MD", "F": "'ll"}], "whatll": [{"F": "what"}, {"L": "will", "pos": "MD", "F": "ll"}], "mightntve": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldnt": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "nt"}], "Hasn't": [{"F": "Has"}, {"L": "not", "pos": "RB", "F": "n't"}], "hasnt": [{"F": "has"}, {"L": "not", "pos": "RB", "F": "nt"}], "shouldnt": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "nt"}], "Haven't": [{"pos": "VB", "F": "Have"}, {"L": "not", "pos": "RB", "F": "n't"}], "wedve": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Must've": [{"F": "Must"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Minn.": [{"F": "Minn."}], "s.": [{"F": "s."}], "isnt": [{"L": "be", "pos": "VBZ", "F": "is"}, {"L": "not", "pos": "RB", "F": "nt"}], "He'd've": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "o_o": [{"F": "o_o"}], "let's": [{"F": "let"}, {"F": "'s"}], "They've": [{"L": "-PRON-", "F": "They"}, {"L": "have", "pos": 
"VB", "F": "'ve"}], "Co.": [{"F": "Co."}], "p.m.": [{"F": "p.m."}], "we're": [{"F": "we"}, {"F": "'re"}], "May.": [{"F": "May."}], "Ala.": [{"F": "Ala."}], "10am": [{"F": "10"}, {"L": "a.m.", "F": "am"}], "itll": [{"L": "-PRON-", "F": "it"}, {"L": "will", "pos": "MD", "F": "ll"}], "n.": [{"F": "n."}], "5pm": [{"F": "5"}, {"L": "p.m.", "F": "pm"}], "hedve": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Shan't": [{"F": "Sha"}, {"L": "not", "pos": "RB", "F": "n't"}], "Wont": [{"F": "Wo"}, {"L": "not", "pos": "RB", "F": "nt"}], "'S": [{"L": "'s", "F": "'S"}], ";(": [{"F": ";("}], "Mightn't've": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "needn't": [{"F": "need"}, {"L": "not", "pos": "RB", "F": "n't"}], "Shes": [{"L": "-PRON-", "F": "She"}, {"F": "s"}], "he'll": [{"L": "-PRON-", "F": "he"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Whereve": [{"F": "Where"}, {"L": "have", "pos": "VB", "F": "ve"}], "aint": [{"L": "be", "pos": "VBP", "F": "ai", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "Colo.": [{"F": "Colo."}], "who've": [{"F": "who"}, {"L": "have", "pos": "VB", "F": "'ve"}], "it'd": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "'d"}], "theyll": [{"L": "-PRON-", "F": "they"}, {"L": "will", "pos": "MD", "F": "ll"}], "wont": [{"F": "wo"}, {"L": "not", "pos": "RB", "F": "nt"}], "whyre": [{"F": "why"}, {"F": "re"}], "Nev.": [{"F": "Nev."}], "Dec.": [{"F": "Dec."}], "whereve": [{"F": "where"}, {"L": "have", "pos": "VB", "F": "ve"}], "Cant": [{"L": "can", "pos": "MD", "F": "Ca"}, {"L": "not", "pos": "RB", "F": "nt"}], "1a.m.": [{"F": "1"}, {"F": "a.m."}], "i.e.": [{"F": "i.e."}], "3am": [{"F": "3"}, {"L": "a.m.", "F": "am"}], "Won't": [{"F": "Wo"}, {"L": "not", "pos": "RB", "F": "n't"}], "hes": [{"L": "-PRON-", "F": "he"}, {"F": "s"}], "Let's": [{"F": "Let"}, {"F": "'s"}], "I'll": [{"L": "-PRON-", "F": "I"}, {"L": "will", "pos": "MD", 
"F": "'ll"}], "We'll": [{"F": "We"}, {"L": "will", "pos": "MD", "F": "'ll"}], "who'd": [{"F": "who"}, {"L": "would", "pos": "MD", "F": "'d"}], "E.g.": [{"F": "E.g."}], "we'd": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "'d"}], "Theyre": [{"L": "-PRON-", "F": "They"}, {"F": "re"}], "She's": [{"L": "-PRON-", "F": "She"}, {"F": "'s"}], "Whod": [{"F": "Who"}, {"L": "would", "pos": "MD", "F": "d"}], "Itll": [{"L": "-PRON-", "F": "It"}, {"L": "will", "pos": "MD", "F": "ll"}], "couldn't've": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "How'd": [{"F": "How"}, {"L": "would", "pos": "MD", "F": "'d"}], "wouldve": [{"F": "would"}, {"L": "have", "pos": "VB", "F": "ve"}], "shan't": [{"F": "sha"}, {"L": "not", "pos": "RB", "F": "n't"}], "8a.m.": [{"F": "8"}, {"F": "a.m."}], "Havent": [{"pos": "VB", "F": "Have"}, {"L": "not", "pos": "RB", "F": "nt"}], "-__-": [{"F": "-__-"}], "6am": [{"F": "6"}, {"L": "a.m.", "F": "am"}], "Hadntve": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "10p.m.": [{"F": "10"}, {"F": "p.m."}], "Might've": [{"F": "Might"}, {"L": "have", "pos": "VB", "F": "'ve"}], "N.M.": [{"F": "N.M."}], "shouldn't": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "n't"}], "(^_^)": [{"F": "(^_^)"}], "x.": [{"F": "x."}], "where've": [{"F": "where"}, {"L": "have", "pos": "VB", "F": "'ve"}], ";)": [{"F": ";)"}], "theydve": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "dont": [{"L": "do", "F": "do"}, {"L": "not", "pos": "RB", "F": "nt"}], "wouldn't": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "n't"}], "g.": [{"F": "g."}], "Who've": [{"F": "Who"}, {"L": "have", "pos": "VB", "F": "'ve"}], "might've": [{"F": "might"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Who's": [{"F": "Who"}, {"F": "'s"}], "Theyve": [{"L": "-PRON-", "F": "They"}, {"L": "have", "pos": "VB", "F": 
"ve"}], "2p.m.": [{"F": "2"}, {"F": "p.m."}], "shouldn't've": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "hed": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "d"}], "1p.m.": [{"F": "1"}, {"F": "p.m."}], "We've": [{"F": "We"}, {"L": "have", "pos": "VB", "F": "'ve"}], "a.": [{"F": "a."}], "<333": [{"F": "<333"}], "l.": [{"F": "l."}], "It'll": [{"L": "-PRON-", "F": "It"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Jun.": [{"F": "Jun."}], "Mrs.": [{"F": "Mrs."}], "what's": [{"F": "what"}, {"F": "'s"}], "N.Y.": [{"F": "N.Y."}], "Why're": [{"F": "Why"}, {"F": "'re"}], "Wis.": [{"F": "Wis."}], "Hedve": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Kans.": [{"F": "Kans."}], "idve": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "We'd've": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Dont": [{"L": "do", "F": "Do"}, {"L": "not", "pos": "RB", "F": "nt"}], ":')": [{"F": ":')"}], "(=": [{"F": "(="}], "won't": [{"F": "wo"}, {"L": "not", "pos": "RB", "F": "n't"}], "who'll": [{"F": "who"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Not've": [{"L": "not", "pos": "RB", "F": "Not"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Gov.": [{"F": "Gov."}], "couldntve": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Doesnt": [{"L": "do", "pos": "VBZ", "F": "Does"}, {"L": "not", "pos": "RB", "F": "nt"}], "11a.m.": [{"F": "11"}, {"F": "a.m."}], "I.e.": [{"F": "I.e."}], "wasn't": [{"F": "was"}, {"L": "not", "pos": "RB", "F": "n't"}], "5am": [{"F": "5"}, {"L": "a.m.", "F": "am"}], "Shouldve": [{"F": "Should"}, {"L": "have", "pos": "VB", "F": "ve"}], "Jan.": [{"F": "Jan."}], "she's": [{"L": "-PRON-", "F": "she"}, {"F": "'s"}], "We'd": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "'d"}], "Itd": 
[{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "d"}], "What's": [{"F": "What"}, {"F": "'s"}], "e.": [{"F": "e."}], "7p.m.": [{"F": "7"}, {"F": "p.m."}], "Wholl": [{"F": "Who"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadntve": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Where'd": [{"F": "Where"}, {"L": "would", "pos": "MD", "F": "'d"}], ":-)": [{"F": ":-)"}], "whos": [{"F": "who"}, {"F": "s"}], "mustn't": [{"F": "must"}, {"L": "not", "pos": "RB", "F": "n't"}], "shouldntve": [{"F": "should"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Youdve": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "mustnt": [{"F": "must"}, {"L": "not", "pos": "RB", "F": "nt"}], "Oct.": [{"F": "Oct."}], "a.m.": [{"F": "a.m."}], "wouldn't've": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "k.": [{"F": "k."}], "Hadn't've": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "who're": [{"F": "who"}, {"F": "'re"}], "6a.m.": [{"F": "6"}, {"F": "a.m."}], "Rev.": [{"F": "Rev."}], "Del.": [{"F": "Del."}], "Ind.": [{"F": "Ind."}], "couldn't": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "n't"}], "La.": [{"F": "La."}], "It'd've": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "t.": [{"F": "t."}], "don't": [{"L": "do", "F": "do"}, {"L": "not", "pos": "RB", "F": "n't"}], "Mightnt": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "nt"}], ":3": [{"F": ":3"}], "shouldve": [{"F": "should"}, {"L": "have", "pos": "VB", "F": "ve"}], "notve": [{"L": "not", "pos": "RB", "F": "not"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldn't've": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": 
"VB", "F": "'ve"}], "Aint": [{"L": "be", "pos": "VBP", "F": "Ai", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "wheres": [{"F": "where"}, {"F": "s"}], "Don't": [{"L": "do", "F": "Do"}, {"L": "not", "pos": "RB", "F": "n't"}], "Theredve": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Could've": [{"pos": "MD", "F": "Could"}, {"L": "have", "pos": "VB", "F": "'ve"}], "d.": [{"F": "d."}], "Wouldnt": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "nt"}], "They're": [{"L": "-PRON-", "F": "They"}, {"F": "'re"}], "There's": [{"F": "There"}, {"F": "'s"}], "Mr.": [{"F": "Mr."}], "shant": [{"F": "sha"}, {"L": "not", "pos": "RB", "F": "nt"}], "how'll": [{"F": "how"}, {"L": "will", "pos": "MD", "F": "'ll"}], "'s": [{"L": "'s", "F": "'s"}], "whens": [{"F": "when"}, {"F": "s"}], ";p": [{"F": ";p"}], "Youll": [{"L": "-PRON-", "F": "You"}, {"L": "will", "pos": "MD", "F": "ll"}], "Wheres": [{"F": "Where"}, {"F": "s"}], ":p": [{"F": ":p"}], ":-P": [{"F": ":-P"}], "Dr.": [{"F": "Dr."}], "they'd": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "'d"}], "Whatre": [{"F": "What"}, {"F": "re"}], ";-)": [{"F": ";-)"}], "N.D.": [{"F": "N.D."}], "I'ma": [{"L": "-PRON-", "F": "I"}, {"F": "'ma"}], "N.H.": [{"F": "N.H."}], "Wasn't": [{"F": "Was"}, {"L": "not", "pos": "RB", "F": "n't"}], "itdve": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Didnt": [{"L": "do", "pos": "VBD", "F": "Did"}, {"L": "not", "pos": "RB", "F": "nt"}], "Ark.": [{"F": "Ark."}], ":>": [{"F": ":>"}], "Wouldntve": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "6p.m.": [{"F": "6"}, {"F": "p.m."}], "where'd": [{"F": "where"}, {"L": "would", "pos": "MD", "F": "'d"}], ":))": [{"F": ":))"}], ":/": [{"F": ":/"}], "1pm": [{"F": "1"}, {"L": "p.m.", "F": "pm"}], "should've": [{"F": "should"}, {"L": "have", "pos": "VB", "F": "'ve"}], "2am": [{"F": "2"}, 
{"L": "a.m.", "F": "am"}], "ain't": [{"L": "be", "pos": "VBP", "F": "ai", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "Nov.": [{"F": "Nov."}], "didnt": [{"L": "do", "pos": "VBD", "F": "did"}, {"L": "not", "pos": "RB", "F": "nt"}], "4a.m.": [{"F": "4"}, {"F": "a.m."}], "co.": [{"F": "co."}], "i.": [{"F": "i."}], "when's": [{"F": "when"}, {"F": "'s"}], "wouldntve": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "mightve": [{"F": "might"}, {"L": "have", "pos": "VB", "F": "ve"}], "howll": [{"F": "how"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadn't": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "n't"}], "I'd've": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Feb.": [{"F": "Feb."}], "howd": [{"F": "how"}, {"L": "would", "pos": "MD", "F": "d"}], "it'd've": [{"L": "-PRON-", "F": "it"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "isn't": [{"L": "be", "pos": "VBZ", "F": "is"}, {"L": "not", "pos": "RB", "F": "n't"}], "weve": [{"F": "we"}, {"L": "have", "pos": "VB", "F": "ve"}], "Sen.": [{"F": "Sen."}], "Whove": [{"F": "Who"}, {"L": "have", "pos": "VB", "F": "ve"}], "Youd": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "d"}], "3a.m.": [{"F": "3"}, {"F": "a.m."}], "Where've": [{"F": "Where"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Shouldn't": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "n't"}], "whats": [{"F": "what"}, {"F": "s"}], "Cannot": [{"L": "can", "pos": "MD", "F": "Can"}, {"L": "not", "pos": "RB", "F": "not"}], "You'd've": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "What'll": [{"F": "What"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Thats": [{"F": "That"}, {"F": "s"}], "o_O": [{"F": "o_O"}], "Whats": [{"F": "What"}, {"F": "s"}], "What're": [{"F": "What"}, {"F": "'re"}], "xDD": [{"F": 
"xDD"}], "3pm": [{"F": "3"}, {"L": "p.m.", "F": "pm"}], "Who're": [{"F": "Who"}, {"F": "'re"}], "mustve": [{"F": "must"}, {"L": "have", "pos": "VB", "F": "ve"}], ":-/": [{"F": ":-/"}], "Apr.": [{"F": "Apr."}], "ima": [{"L": "-PRON-", "F": "i"}, {"F": "ma"}], "Whens": [{"F": "When"}, {"F": "s"}], "Kan.": [{"F": "Kan."}], "w.": [{"F": "w."}], "3p.m.": [{"F": "3"}, {"F": "p.m."}], "Whyre": [{"F": "Why"}, {"F": "re"}], "-_-": [{"F": "-_-"}], "12pm": [{"F": "12"}, {"L": "p.m.", "F": "pm"}], "Ltd.": [{"F": "Ltd."}], "wasnt": [{"F": "was"}, {"L": "not", "pos": "RB", "F": "nt"}], "Shedve": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "Nebr.": [{"F": "Nebr."}], "o.": [{"F": "o."}], ";D": [{"F": ";D"}], "whys": [{"F": "why"}, {"F": "s"}], "Tenn.": [{"F": "Tenn."}], "She'd": [{"L": "-PRON-", "F": "She"}, {"L": "would", "pos": "MD", "F": "'d"}], "Needn't": [{"F": "Need"}, {"L": "not", "pos": "RB", "F": "n't"}], "Hadnt": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "nt"}], "m.": [{"F": "m."}], "arent": [{"L": "be", "pos": "VBP", "F": "are", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "Arent": [{"L": "be", "pos": "VBP", "F": "Are", "number": 2}, {"L": "not", "pos": "RB", "F": "nt"}], "<33": [{"F": "<33"}], " ": [{"pos": "SP", "F": " "}], "you've": [{"L": "-PRON-", "F": "you"}, {"L": "have", "pos": "VB", "F": "'ve"}], "mightn't've": [{"F": "might"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Aug.": [{"F": "Aug."}], "=3": [{"F": "=3"}], "Miss.": [{"F": "Miss."}], "Jul.": [{"F": "Jul."}], "Werent": [{"F": "Were"}, {"L": "not", "pos": "RB", "F": "nt"}], "You'd": [{"L": "-PRON-", "F": "You"}, {"L": "would", "pos": "MD", "F": "'d"}], "How's": [{"F": "How"}, {"F": "'s"}], "2a.m.": [{"F": "2"}, {"F": "a.m."}], "youre": [{"L": "-PRON-", "F": "you"}, {"F": "re"}], "hadn't've": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": 
"n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "12p.m.": [{"F": "12"}, {"F": "p.m."}], "Im": [{"L": "-PRON-", "F": "I"}, {"L": "be", "F": "m", "pos": "VBP", "tenspect": 1, "number": 1}], "not've": [{"L": "not", "pos": "RB", "F": "not"}, {"L": "have", "pos": "VB", "F": "'ve"}], "thats": [{"F": "that"}, {"F": "s"}], "Mustnt": [{"F": "Must"}, {"L": "not", "pos": "RB", "F": "nt"}], "what're": [{"F": "what"}, {"F": "'re"}], "How'll": [{"F": "How"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Conn.": [{"F": "Conn."}], "it's": [{"L": "-PRON-", "F": "it"}, {"F": "'s"}], "Can't": [{"L": "can", "pos": "MD", "F": "Ca"}, {"L": "not", "pos": "RB", "F": "n't"}], "'ol": [{"F": "'ol"}], "Mustve": [{"F": "Must"}, {"L": "have", "pos": "VB", "F": "ve"}], "Couldn't": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "n't"}], "Okla.": [{"F": "Okla."}], "what'll": [{"F": "what"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Whys": [{"F": "Why"}, {"F": "s"}], "it'll": [{"L": "-PRON-", "F": "it"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Mt.": [{"L": "Mount", "F": "Mt."}], "Itdve": [{"L": "-PRON-", "F": "It"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "couldve": [{"pos": "MD", "F": "could"}, {"L": "have", "pos": "VB", "F": "ve"}], "wholl": [{"F": "who"}, {"L": "will", "pos": "MD", "F": "ll"}], "I've": [{"L": "-PRON-", "F": "I"}, {"L": "have", "pos": "VB", "F": "'ve"}], "thered": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "d"}], "Theyll": [{"L": "-PRON-", "F": "They"}, {"L": "will", "pos": "MD", "F": "ll"}], "Neb.": [{"F": "Neb."}], "Who'll": [{"F": "Who"}, {"L": "will", "pos": "MD", "F": "'ll"}], "cannot": [{"L": "can", "pos": "MD", "F": "can"}, {"L": "not", "pos": "RB", "F": "not"}], ":(": [{"F": ":("}], "xD": [{"F": "xD"}], "10pm": [{"F": "10"}, {"L": "p.m.", "F": "pm"}], "couldnt": [{"pos": "MD", "F": "could"}, {"L": "not", "pos": "RB", "F": "nt"}], "Would've": [{"F": "Would"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mightn't": 
[{"F": "Might"}, {"L": "not", "pos": "RB", "F": "n't"}], "5p.m.": [{"F": "5"}, {"F": "p.m."}], "youve": [{"L": "-PRON-", "F": "you"}, {"L": "have", "pos": "VB", "F": "ve"}], ":Y": [{"F": ":Y"}], "shedve": [{"L": "-PRON-", "F": "she"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "why's": [{"F": "why"}, {"F": "'s"}], "could've": [{"pos": "MD", "F": "could"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Neednt": [{"F": "Need"}, {"L": "not", "pos": "RB", "F": "nt"}], "vs.": [{"F": "vs."}], "Mont.": [{"F": "Mont."}], "Adm.": [{"F": "Adm."}], "Md.": [{"F": "Md."}], "That's": [{"F": "That"}, {"F": "'s"}], "Mar.": [{"F": "Mar."}], "they'll": [{"L": "-PRON-", "F": "they"}, {"L": "will", "pos": "MD", "F": "'ll"}], "b.": [{"F": "b."}], "Sep.": [{"F": "Sep."}], "whod": [{"F": "who"}, {"L": "would", "pos": "MD", "F": "d"}], "2pm": [{"F": "2"}, {"L": "p.m.", "F": "pm"}], "whyll": [{"F": "why"}, {"L": "will", "pos": "MD", "F": "ll"}], "hadnt": [{"L": "have", "pos": "VBD", "F": "had"}, {"L": "not", "pos": "RB", "F": "nt"}], "There'd've": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "He'd": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "'d"}], "theyre": [{"L": "-PRON-", "F": "they"}, {"F": "re"}], "Ms.": [{"F": "Ms."}], "there'd've": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "5a.m.": [{"F": "5"}, {"F": "a.m."}], "7am": [{"F": "7"}, {"L": "a.m.", "F": "am"}], "they'd've": [{"L": "-PRON-", "F": "they"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Mich.": [{"F": "Mich."}], "cant": [{"L": "can", "pos": "MD", "F": "ca"}, {"L": "not", "pos": "RB", "F": "nt"}], "Va.": [{"F": "Va."}], "11pm": [{"F": "11"}, {"L": "p.m.", "F": "pm"}], "youll": [{"L": "-PRON-", "F": "you"}, {"L": "will", "pos": "MD", "F": "ll"}], "Isn't": [{"L": "be", "pos": "VBZ", "F": "Is"}, {"L": "not", "pos": "RB", "F": "n't"}], 
"i'd've": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Hadn't": [{"L": "have", "pos": "VBD", "F": "Had"}, {"L": "not", "pos": "RB", "F": "n't"}], "why'll": [{"F": "why"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Jr.": [{"F": "Jr."}], "whove": [{"F": "who"}, {"L": "have", "pos": "VB", "F": "ve"}], "we'd've": [{"F": "we"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Youve": [{"L": "-PRON-", "F": "You"}, {"L": "have", "pos": "VB", "F": "ve"}], "He'll": [{"L": "-PRON-", "F": "He"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Wedve": [{"F": "We"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "9am": [{"F": "9"}, {"L": "a.m.", "F": "am"}], "Hed": [{"L": "-PRON-", "F": "He"}, {"L": "would", "pos": "MD", "F": "d"}], "whatve": [{"F": "what"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ore.": [{"F": "Ore."}], "(:": [{"F": "(:"}], "Shouldnt": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "nt"}], "Wash.": [{"F": "Wash."}], "Weve": [{"F": "We"}, {"L": "have", "pos": "VB", "F": "ve"}], "N.J.": [{"F": "N.J."}], "Shouldntve": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "h.": [{"F": "h."}], "we'll": [{"F": "we"}, {"L": "will", "pos": "MD", "F": "'ll"}], "we've": [{"F": "we"}, {"L": "have", "pos": "VB", "F": "'ve"}], "doesnt": [{"L": "do", "pos": "VBZ", "F": "does"}, {"L": "not", "pos": "RB", "F": "nt"}], "who's": [{"F": "who"}, {"F": "'s"}], "he'd": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "'d"}], "Ain't": [{"L": "be", "pos": "VBP", "F": "Ai", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], "I'd": [{"L": "-PRON-", "F": "I"}, {"L": "would", "pos": "MD", "F": "'d"}], "theredve": [{"F": "there"}, {"L": "would", "pos": "MD", "F": "d"}, {"L": "have", "pos": "VB", "F": "ve"}], "She'll": [{"L": "-PRON-", "F": "She"}, {"L": "will", "pos": "MD", "F": "'ll"}], "They'd": [{"L": 
"-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "'d"}], "\")": [{"F": "\")"}], "Couldve": [{"pos": "MD", "F": "Could"}, {"L": "have", "pos": "VB", "F": "ve"}], "Whyll": [{"F": "Why"}, {"L": "will", "pos": "MD", "F": "ll"}], "y.": [{"F": "y."}], "12a.m.": [{"F": "12"}, {"F": "a.m."}], "wouldnt": [{"F": "would"}, {"L": "not", "pos": "RB", "F": "nt"}], "<3": [{"F": "<3"}], "\n": [{"pos": "SP", "F": "\n"}], "Whered": [{"F": "Where"}, {"L": "would", "pos": "MD", "F": "d"}], "I'm": [{"L": "-PRON-", "F": "I"}, {"L": "be", "F": "'m", "pos": "VBP", "tenspect": 1, "number": 1}], "Couldntve": [{"pos": "MD", "F": "Could"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "Ive": [{"L": "-PRON-", "F": "I"}, {"L": "have", "pos": "VB", "F": "ve"}], "i'd": [{"L": "-PRON-", "F": "i"}, {"L": "would", "pos": "MD", "F": "'d"}], "youd": [{"L": "-PRON-", "F": "you"}, {"L": "would", "pos": "MD", "F": "d"}], "There'd": [{"F": "There"}, {"L": "would", "pos": "MD", "F": "'d"}], "He's": [{"L": "-PRON-", "F": "He"}, {"F": "'s"}], "Mightntve": [{"F": "Might"}, {"L": "not", "pos": "RB", "F": "nt"}, {"L": "have", "pos": "VB", "F": "ve"}], "When's": [{"F": "When"}, {"F": "'s"}], "doesn't": [{"L": "do", "pos": "VBZ", "F": "does"}, {"L": "not", "pos": "RB", "F": "n't"}], "=[[": [{"F": "=[["}], "Youre": [{"L": "-PRON-", "F": "You"}, {"F": "re"}], "=]": [{"F": "=]"}], "You'll": [{"L": "-PRON-", "F": "You"}, {"L": "will", "pos": "MD", "F": "'ll"}], "=)": [{"F": "=)"}], "Pa.": [{"F": "Pa."}], "he'd've": [{"L": "-PRON-", "F": "he"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}], "You've": [{"L": "-PRON-", "F": "You"}, {"L": "have", "pos": "VB", "F": "'ve"}], "They'll": [{"L": "-PRON-", "F": "They"}, {"L": "will", "pos": "MD", "F": "'ll"}], "Ky.": [{"F": "Ky."}], "c.": [{"F": "c."}], "I.E.": [{"F": "I.E."}], "V_V": [{"F": "V_V"}], "Didn't": [{"L": "do", "pos": "VBD", "F": "Did"}, {"L": "not", "pos": "RB", "F": "n't"}], "What've": 
[{"F": "What"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Weren't": [{"F": "Were"}, {"L": "not", "pos": "RB", "F": "n't"}], ":]": [{"F": ":]"}], "Notve": [{"L": "not", "pos": "RB", "F": "Not"}, {"L": "have", "pos": "VB", "F": "ve"}], "9a.m.": [{"F": "9"}, {"F": "a.m."}], "7pm": [{"F": "7"}, {"L": "p.m.", "F": "pm"}], "Sept.": [{"F": "Sept."}], "Bros.": [{"F": "Bros."}], "Howd": [{"F": "How"}, {"L": "would", "pos": "MD", "F": "d"}], "weren't": [{"F": "were"}, {"L": "not", "pos": "RB", "F": "n't"}], "Why's": [{"F": "Why"}, {"F": "'s"}], ":((": [{"F": ":(("}], "theyve": [{"L": "-PRON-", "F": "they"}, {"L": "have", "pos": "VB", "F": "ve"}], "where's": [{"F": "where"}, {"F": "'s"}], "ive": [{"L": "-PRON-", "F": "i"}, {"L": "have", "pos": "VB", "F": "ve"}], "=D": [{"F": "=D"}], "what've": [{"F": "what"}, {"L": "have", "pos": "VB", "F": "'ve"}], "Whos": [{"F": "Who"}, {"F": "s"}], ":O": [{"F": ":O"}], "Shouldn't've": [{"F": "Should"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "whatre": [{"F": "what"}, {"F": "re"}], "Wouldn't've": [{"F": "Would"}, {"L": "not", "pos": "RB", "F": "n't"}, {"L": "have", "pos": "VB", "F": "'ve"}], "aren't": [{"L": "be", "pos": "VBP", "F": "are", "number": 2}, {"L": "not", "pos": "RB", "F": "n't"}], ":)": [{"F": ":)"}], "They'd've": [{"L": "-PRON-", "F": "They"}, {"L": "would", "pos": "MD", "F": "'d"}, {"L": "have", "pos": "VB", "F": "'ve"}]} \ No newline at end of file +{ + "d.": [ + { + "F": "d." + } + ], + "Theydve": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ":/": [ + { + "F": ":/" + } + ], + "shouldn't've": [ + { + "F": "should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "10a.m.": [ + { + "F": "10" + }, + { + "F": "a.m." + } + ], + "E.G.": [ + { + "F": "E.G." 
+ } + ], + "howll": [ + { + "F": "how" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "6a.m.": [ + { + "F": "6" + }, + { + "F": "a.m." + } + ], + "Ore.": [ + { + "F": "Ore." + } + ], + "Hadn't've": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ":>": [ + { + "F": ":>" + } + ], + "3p.m.": [ + { + "F": "3" + }, + { + "F": "p.m." + } + ], + "who'll": [ + { + "F": "who" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "5a.m.": [ + { + "F": "5" + }, + { + "F": "a.m." + } + ], + ":(": [ + { + "F": ":(" + } + ], + ":0": [ + { + "F": ":0" + } + ], + ":)": [ + { + "F": ":)" + } + ], + "aint": [ + { + "F": "ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + " ": [ + { + "pos": "SP", + "F": " " + } + ], + "Dec.": [ + { + "F": "Dec." + } + ], + "Shouldnt": [ + { + "F": "Should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Ky.": [ + { + "F": "Ky." 
+ } + ], + "when's": [ + { + "F": "when" + }, + { + "F": "'s" + } + ], + "Didnt": [ + { + "F": "Did", + "L": "do", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "itll": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Who're": [ + { + "F": "Who" + }, + { + "F": "'re" + } + ], + "=D": [ + { + "F": "=D" + } + ], + "Ain't": [ + { + "F": "Ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Can't": [ + { + "F": "Ca", + "L": "can", + "pos": "MD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Whyre": [ + { + "F": "Why" + }, + { + "F": "re" + } + ], + "Aren't": [ + { + "F": "Are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Neednt": [ + { + "F": "Need" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "should've": [ + { + "F": "should" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "shouldn't": [ + { + "F": "should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Idve": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "weve": [ + { + "F": "we" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Va.": [ + { + "F": "Va." + } + ], + "D.C.": [ + { + "F": "D.C." + } + ], + "3am": [ + { + "F": "3" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Ive": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Md.": [ + { + "F": "Md." + } + ], + ";D": [ + { + "F": ";D" + } + ], + "Mrs.": [ + { + "F": "Mrs." + } + ], + "Minn.": [ + { + "F": "Minn." 
+ } + ], + "they'd": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Youdve": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "theyve": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Weren't": [ + { + "F": "Were" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "werent": [ + { + "F": "were" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whyre": [ + { + "F": "why" + }, + { + "F": "re" + } + ], + "g.": [ + { + "F": "g." + } + ], + "I'm": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "pos": "VBP", + "F": "'m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + ":p": [ + { + "F": ":p" + } + ], + "She'd've": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "not've": [ + { + "F": "not", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "we'll": [ + { + "F": "we" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + ":O": [ + { + "F": ":O" + } + ], + "<33": [ + { + "F": "<33" + } + ], + "Don't": [ + { + "L": "do", + "F": "Do" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Whyll": [ + { + "F": "Why" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "''": [ + { + "F": "''" + } + ], + "they've": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "t.": [ + { + "F": "t." 
+ } + ], + "wasn't": [ + { + "F": "was" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "could've": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "what've": [ + { + "F": "what" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "havent": [ + { + "pos": "VB", + "F": "have" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Who've": [ + { + "F": "Who" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "11am": [ + { + "F": "11" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Shan't": [ + { + "F": "Sha" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "i'll": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "i.e.": [ + { + "F": "i.e." + } + ], + "you'd": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "w.": [ + { + "F": "w." + } + ], + "whens": [ + { + "F": "when" + }, + { + "F": "s" + } + ], + "whys": [ + { + "F": "why" + }, + { + "F": "s" + } + ], + "6pm": [ + { + "F": "6" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "4p.m.": [ + { + "F": "4" + }, + { + "F": "p.m." + } + ], + "Whereve": [ + { + "F": "Where" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "o_o": [ + { + "F": "o_o" + } + ], + "Mo.": [ + { + "F": "Mo." + } + ], + "Kan.": [ + { + "F": "Kan." + } + ], + "\u00a0": [ + { + "pos": "SP", + "L": " ", + "F": "\u00a0" + } + ], + "there'd": [ + { + "F": "there" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "N.H.": [ + { + "F": "N.H." + } + ], + "(^_^)": [ + { + "F": "(^_^)" + } + ], + "Mont.": [ + { + "F": "Mont." 
+ } + ], + "hadn't've": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "whatll": [ + { + "F": "what" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "wouldn't've": [ + { + "F": "would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "there's": [ + { + "F": "there" + }, + { + "F": "'s" + } + ], + "Who'll": [ + { + "F": "Who" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "o_O": [ + { + "F": "o_O" + } + ], + "Nev.": [ + { + "F": "Nev." + } + ], + "youll": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "wouldve": [ + { + "F": "would" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Nov.": [ + { + "F": "Nov." + } + ], + "z.": [ + { + "F": "z." + } + ], + "xDD": [ + { + "F": "xDD" + } + ], + "Sen.": [ + { + "F": "Sen." + } + ], + "Wouldnt": [ + { + "F": "Would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Thered": [ + { + "F": "There" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Youre": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "re" + } + ], + "Couldn't've": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "who're": [ + { + "F": "who" + }, + { + "F": "'re" + } + ], + "Whys": [ + { + "F": "Why" + }, + { + "F": "s" + } + ], + "mightn't've": [ + { + "F": "might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Wholl": [ + { + "F": "Who" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "hadn't": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Havent": [ + { + "pos": "VB", + "F": "Have" + }, + { + "F": "nt", + "L": 
"not", + "pos": "RB" + } + ], + "Whatve": [ + { + "F": "What" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "2pm": [ + { + "F": "2" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "o.O": [ + { + "F": "o.O" + } + ], + "Thats": [ + { + "F": "That" + }, + { + "F": "s" + } + ], + "Gov.": [ + { + "F": "Gov." + } + ], + "Howll": [ + { + "F": "How" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "p.": [ + { + "F": "p." + } + ], + "wouldn't": [ + { + "F": "would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "9pm": [ + { + "F": "9" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "You'll": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Ala.": [ + { + "F": "Ala." + } + ], + "12am": [ + { + "F": "12" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "=]": [ + { + "F": "=]" + } + ], + "Cant": [ + { + "F": "Ca", + "L": "can", + "pos": "MD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "i'd": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "a.m.": [ + { + "F": "a.m." 
+ } + ], + "weren't": [ + { + "F": "were" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "would've": [ + { + "F": "would" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "i'm": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "pos": "VBP", + "F": "'m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "why'll": [ + { + "F": "why" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "we'd've": [ + { + "F": "we" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Shouldve": [ + { + "F": "Should" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "can't": [ + { + "F": "ca", + "L": "can", + "pos": "MD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "thats": [ + { + "F": "that" + }, + { + "F": "s" + } + ], + "1p.m.": [ + { + "F": "1" + }, + { + "F": "p.m." + } + ], + "12a.m.": [ + { + "F": "12" + }, + { + "F": "a.m." + } + ], + "Hes": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "s" + } + ], + "Needn't": [ + { + "F": "Need" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "It's": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'s" + } + ], + "St.": [ + { + "F": "St." + } + ], + "Why're": [ + { + "F": "Why" + }, + { + "F": "'re" + } + ], + ":(((": [ + { + "F": ":(((" + } + ], + "Hed": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Mt.": [ + { + "L": "Mount", + "F": "Mt." + } + ], + "couldn't": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "What've": [ + { + "F": "What" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "4a.m.": [ + { + "F": "4" + }, + { + "F": "a.m." + } + ], + "Ind.": [ + { + "F": "Ind." 
+ } + ], + "It'd": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "<3": [ + { + "F": "<3" + } + ], + "theydve": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "aren't": [ + { + "F": "are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Mightn't": [ + { + "F": "Might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "'S": [ + { + "L": "'s", + "F": "'S" + } + ], + "I've": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whered": [ + { + "F": "Where" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Itdve": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "I'ma": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ma" + } + ], + "whos": [ + { + "F": "who" + }, + { + "F": "s" + } + ], + "They'd": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "What'll": [ + { + "F": "What" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + ":Y": [ + { + "F": ":Y" + } + ], + "You've": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mustve": [ + { + "F": "Must" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "whod": [ + { + "F": "who" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "mightntve": [ + { + "F": "might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "I'd've": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Must've": [ + { + "F": "Must" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } 
+ ], + "it'd": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Ark.": [ + { + "F": "Ark." + } + ], + "Wis.": [ + { + "F": "Wis." + } + ], + "6p.m.": [ + { + "F": "6" + }, + { + "F": "p.m." + } + ], + "what're": [ + { + "F": "what" + }, + { + "F": "'re" + } + ], + "N.C.": [ + { + "F": "N.C." + } + ], + "Wasn't": [ + { + "F": "Was" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "what's": [ + { + "F": "what" + }, + { + "F": "'s" + } + ], + "he'd've": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Jan.": [ + { + "F": "Jan." + } + ], + "She'd": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "shedve": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Tenn.": [ + { + "F": "Tenn." + } + ], + "ain't": [ + { + "F": "ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Wash.": [ + { + "F": "Wash." + } + ], + "She's": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'s" + } + ], + "i'd've": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "2a.m.": [ + { + "F": "2" + }, + { + "F": "a.m." + } + ], + "We'd've": [ + { + "F": "We" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "must've": [ + { + "F": "must" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "That's": [ + { + "F": "That" + }, + { + "F": "'s" + } + ], + "Sept.": [ + { + "F": "Sept." 
+ } + ], + "whatre": [ + { + "F": "what" + }, + { + "F": "re" + } + ], + "you'd've": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Dont": [ + { + "L": "do", + "F": "Do" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "i.": [ + { + "F": "i." + } + ], + "Jun.": [ + { + "F": "Jun." + } + ], + "thered": [ + { + "F": "there" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Youd": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "couldn't've": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whens": [ + { + "F": "When" + }, + { + "F": "s" + } + ], + "8a.m.": [ + { + "F": "8" + }, + { + "F": "a.m." + } + ], + "Isnt": [ + { + "F": "Is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "mightve": [ + { + "F": "might" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "'ol": [ + { + "F": "'ol" + } + ], + "2p.m.": [ + { + "F": "2" + }, + { + "F": "p.m." + } + ], + "9a.m.": [ + { + "F": "9" + }, + { + "F": "a.m." + } + ], + "q.": [ + { + "F": "q." + } + ], + "didnt": [ + { + "F": "did", + "L": "do", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "ive": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "It'd've": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "e.g.": [ + { + "F": "e.g." + } + ], + "\t": [ + { + "pos": "SP", + "F": "\t" + } + ], + "Mich.": [ + { + "F": "Mich." 
+ } + ], + "Itll": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "didn't": [ + { + "F": "did", + "L": "do", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "3pm": [ + { + "F": "3" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "Jul.": [ + { + "F": "Jul." + } + ], + "7pm": [ + { + "F": "7" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "cant": [ + { + "F": "ca", + "L": "can", + "pos": "MD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Miss.": [ + { + "F": "Miss." + } + ], + "im": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "pos": "VBP", + "F": "m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "Ariz.": [ + { + "F": "Ariz." + } + ], + "they'd've": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "f.": [ + { + "F": "f." + } + ], + "Co.": [ + { + "F": "Co." + } + ], + "Hadntve": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Weve": [ + { + "F": "We" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "1a.m.": [ + { + "F": "1" + }, + { + "F": "a.m." + } + ], + "=3": [ + { + "F": "=3" + } + ], + "Mightnt": [ + { + "F": "Might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "1pm": [ + { + "F": "1" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "youdve": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Shedve": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "theyd": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Ill.": [ + { + "F": "Ill." + } + ], + "N.D.": [ + { + "F": "N.D." 
+ } + ], + "Cannot": [ + { + "F": "Can", + "L": "can", + "pos": "MD" + }, + { + "F": "not", + "L": "not", + "pos": "RB" + } + ], + "s.": [ + { + "F": "s." + } + ], + "Hadn't": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "What're": [ + { + "F": "What" + }, + { + "F": "'re" + } + ], + "He'll": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "wholl": [ + { + "F": "who" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "They're": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'re" + } + ], + "Neb.": [ + { + "F": "Neb." + } + ], + "shouldnt": [ + { + "F": "should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "\n": [ + { + "pos": "SP", + "F": "\n" + } + ], + "whered": [ + { + "F": "where" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "7a.m.": [ + { + "F": "7" + }, + { + "F": "a.m." + } + ], + "youve": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "4am": [ + { + "F": "4" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "v.": [ + { + "F": "v." 
+ } + ], + "notve": [ + { + "F": "not", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldve": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "mustve": [ + { + "F": "must" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Youve": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "might've": [ + { + "F": "might" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mustn't": [ + { + "F": "Must" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "wheres": [ + { + "F": "where" + }, + { + "F": "s" + } + ], + "they're": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'re" + } + ], + "idve": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "hows": [ + { + "F": "how" + }, + { + "F": "s" + } + ], + "Fla.": [ + { + "F": "Fla." + } + ], + "N.M.": [ + { + "F": "N.M." + } + ], + "youre": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "re" + } + ], + "Didn't": [ + { + "F": "Did", + "L": "do", + "pos": "VBD" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Couldve": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "10p.m.": [ + { + "F": "10" + }, + { + "F": "p.m." + } + ], + "Del.": [ + { + "F": "Del." + } + ], + "Oct.": [ + { + "F": "Oct." + } + ], + "Rep.": [ + { + "F": "Rep." + } + ], + "cannot": [ + { + "F": "can", + "L": "can", + "pos": "MD" + }, + { + "F": "not", + "L": "not", + "pos": "RB" + } + ], + "Im": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "pos": "VBP", + "F": "m", + "tenspect": 1, + "number": 1, + "L": "be" + } + ], + "howd": [ + { + "F": "how" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Okla.": [ + { + "F": "Okla." + } + ], + "Feb.": [ + { + "F": "Feb." 
+ } + ], + "you've": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "You're": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'re" + } + ], + "she'll": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Theyll": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "don't": [ + { + "L": "do", + "F": "do" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "itd": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + ":-)": [ + { + "F": ":-)" + } + ], + "Hedve": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "isnt": [ + { + "F": "is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "won't": [ + { + "F": "wo" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "We're": [ + { + "F": "We" + }, + { + "F": "'re" + } + ], + "3a.m.": [ + { + "F": "3" + }, + { + "F": "a.m." + } + ], + "^_^": [ + { + "F": "^_^" + } + ], + "\u2018S": [ + { + "L": "'s", + "F": "\u2018S" + } + ], + "9p.m.": [ + { + "F": "9" + }, + { + "F": "p.m." 
+ } + ], + "dont": [ + { + "L": "do", + "F": "do" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "ima": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "ma" + } + ], + "he's": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'s" + } + ], + "we've": [ + { + "F": "we" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "What's": [ + { + "F": "What" + }, + { + "F": "'s" + } + ], + "Who's": [ + { + "F": "Who" + }, + { + "F": "'s" + } + ], + "-__-": [ + { + "F": "-__-" + } + ], + "hedve": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "he'd": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "When's": [ + { + "F": "When" + }, + { + "F": "'s" + } + ], + "Mightn't've": [ + { + "F": "Might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "We've": [ + { + "F": "We" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "\u2018s": [ + { + "L": "'s", + "F": "\u2018s" + } + ], + "Couldntve": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Who'd": [ + { + "F": "Who" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + ":-/": [ + { + "F": ":-/" + } + ], + "haven't": [ + { + "pos": "VB", + "F": "have" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Gen.": [ + { + "F": "Gen." + } + ], + "(:": [ + { + "F": "(:" + } + ], + "arent": [ + { + "F": "are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "You'd've": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "c.": [ + { + "F": "c." 
+ } + ], + "(=": [ + { + "F": "(=" + } + ], + "Wouldn't": [ + { + "F": "Would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "who's": [ + { + "F": "who" + }, + { + "F": "'s" + } + ], + "12p.m.": [ + { + "F": "12" + }, + { + "F": "p.m." + } + ], + "5am": [ + { + "F": "5" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Mightve": [ + { + "F": "Might" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ":((": [ + { + "F": ":((" + } + ], + "theredve": [ + { + "F": "there" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Messrs.": [ + { + "F": "Messrs." + } + ], + "who'd": [ + { + "F": "who" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Where's": [ + { + "F": "Where" + }, + { + "F": "'s" + } + ], + "wont": [ + { + "F": "wo" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "she'd've": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "10pm": [ + { + "F": "10" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "Corp.": [ + { + "F": "Corp." + } + ], + "Aug.": [ + { + "F": "Aug." + } + ], + "-_-": [ + { + "F": "-_-" + } + ], + "y.": [ + { + "F": "y." + } + ], + "Should've": [ + { + "F": "Should" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "11pm": [ + { + "F": "11" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "8am": [ + { + "F": "8" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "theyre": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "re" + } + ], + "l.": [ + { + "F": "l." + } + ], + "Wouldntve": [ + { + "F": "Would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Ga.": [ + { + "F": "Ga." 
+ } + ], + "1am": [ + { + "F": "1" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "Where've": [ + { + "F": "Where" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "11a.m.": [ + { + "F": "11" + }, + { + "F": "a.m." + } + ], + "mustn't": [ + { + "F": "must" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "isn't": [ + { + "F": "is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Bros.": [ + { + "F": "Bros." + } + ], + "Aint": [ + { + "F": "Ai", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "why's": [ + { + "F": "why" + }, + { + "F": "'s" + } + ], + "V_V": [ + { + "F": "V_V" + } + ], + ";p": [ + { + "F": ";p" + } + ], + "There'd": [ + { + "F": "There" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "They'll": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "=)": [ + { + "F": "=)" + } + ], + "b.": [ + { + "F": "b." + } + ], + "how'll": [ + { + "F": "how" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Wedve": [ + { + "F": "We" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldntve": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "12pm": [ + { + "F": "12" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "There's": [ + { + "F": "There" + }, + { + "F": "'s" + } + ], + "we'd": [ + { + "F": "we" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Dr.": [ + { + "F": "Dr." 
+ } + ], + "Whod": [ + { + "F": "Who" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + ":-P": [ + { + "F": ":-P" + } + ], + "whatve": [ + { + "F": "what" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Wouldve": [ + { + "F": "Would" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "o.": [ + { + "F": "o." + } + ], + ":]": [ + { + "F": ":]" + } + ], + "needn't": [ + { + "F": "need" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "shouldntve": [ + { + "F": "should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "why're": [ + { + "F": "why" + }, + { + "F": "'re" + } + ], + "p.m.": [ + { + "F": "p.m." + } + ], + "Doesnt": [ + { + "F": "Does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whereve": [ + { + "F": "where" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "they'll": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "I'd": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Might've": [ + { + "F": "Might" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "mightnt": [ + { + "F": "might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Kans.": [ + { + "F": "Kans." + } + ], + "Not've": [ + { + "F": "Not", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "e.": [ + { + "F": "e." + } + ], + "mightn't": [ + { + "F": "might" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "you're": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'re" + } + ], + "Mar.": [ + { + "F": "Mar." 
+ } + ], + "They've": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "\")": [ + { + "F": "\")" + } + ], + "what'll": [ + { + "F": "what" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Calif.": [ + { + "F": "Calif." + } + ], + "Could've": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Would've": [ + { + "F": "Would" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + ";)": [ + { + "F": ";)" + } + ], + ";(": [ + { + "F": ";(" + } + ], + "Isn't": [ + { + "F": "Is", + "L": "be", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "let's": [ + { + "F": "let" + }, + { + "F": "'s" + } + ], + "'em": [ + { + "F": "'em" + } + ], + "She'll": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "I.E.": [ + { + "F": "I.E." + } + ], + "You'd": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "wouldnt": [ + { + "F": "would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "6am": [ + { + "F": "6" + }, + { + "L": "a.m.", + "F": "am" + } + ], + ":P": [ + { + "F": ":P" + } + ], + "Why'll": [ + { + "F": "Why" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Where'd": [ + { + "F": "Where" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Theyre": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "re" + } + ], + "11p.m.": [ + { + "F": "11" + }, + { + "F": "p.m." + } + ], + "Won't": [ + { + "F": "Wo" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Couldn't": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "it's": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'s" + } + ], + "r.": [ + { + "F": "r." 
+ } + ], + "it'll": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "They'd've": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Ima": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "ma" + } + ], + "5pm": [ + { + "F": "5" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "10am": [ + { + "F": "10" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "m.": [ + { + "F": "m." + } + ], + "whats": [ + { + "F": "what" + }, + { + "F": "s" + } + ], + "How's": [ + { + "F": "How" + }, + { + "F": "'s" + } + ], + "Sep.": [ + { + "F": "Sep." + } + ], + "Shouldntve": [ + { + "F": "Should" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "youd": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "Whatll": [ + { + "F": "What" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Wouldn't've": [ + { + "F": "Would" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "How'd": [ + { + "F": "How" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "doesnt": [ + { + "F": "does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "h.": [ + { + "F": "h." 
+ } + ], + "Shouldn't": [ + { + "F": "Should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "He'd've": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Mightntve": [ + { + "F": "Might" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "couldnt": [ + { + "pos": "MD", + "F": "could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Haven't": [ + { + "pos": "VB", + "F": "Have" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "<333": [ + { + "F": "<333" + } + ], + "doesn't": [ + { + "F": "does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Hasn't": [ + { + "F": "Has" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "how's": [ + { + "F": "how" + }, + { + "F": "'s" + } + ], + "hes": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "s" + } + ], + "=[[": [ + { + "F": "=[[" + } + ], + "xD": [ + { + "F": "xD" + } + ], + "he'll": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "hed": [ + { + "L": "-PRON-", + "F": "he" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "7p.m.": [ + { + "F": "7" + }, + { + "F": "p.m." + } + ], + "how'd": [ + { + "F": "how" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "u.": [ + { + "F": "u." + } + ], + "we're": [ + { + "F": "we" + }, + { + "F": "'re" + } + ], + "vs.": [ + { + "F": "vs." 
+ } + ], + "Hadnt": [ + { + "F": "Had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Shant": [ + { + "F": "Sha" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Theyve": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Hows": [ + { + "F": "How" + }, + { + "F": "s" + } + ], + "We'll": [ + { + "F": "We" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "N.Y.": [ + { + "F": "N.Y." + } + ], + "x.": [ + { + "F": "x." + } + ], + "8p.m.": [ + { + "F": "8" + }, + { + "F": "p.m." + } + ], + "i've": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whove": [ + { + "F": "Who" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "2am": [ + { + "F": "2" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "La.": [ + { + "F": "La." + } + ], + "i'ma": [ + { + "L": "-PRON-", + "F": "i" + }, + { + "F": "'ma" + } + ], + "N.J.": [ + { + "F": "N.J." + } + ], + "Nebr.": [ + { + "F": "Nebr." + } + ], + "Howd": [ + { + "F": "How" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "hadnt": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "shant": [ + { + "F": "sha" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "There'd've": [ + { + "F": "There" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Inc.": [ + { + "F": "Inc." + } + ], + "I'll": [ + { + "L": "-PRON-", + "F": "I" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Why's": [ + { + "F": "Why" + }, + { + "F": "'s" + } + ], + "Adm.": [ + { + "F": "Adm." + } + ], + "Shouldn't've": [ + { + "F": "Should" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "n.": [ + { + "F": "n." 
+ } + ], + "Wasnt": [ + { + "F": "Was" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whove": [ + { + "F": "who" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + ";-p": [ + { + "F": ";-p" + } + ], + "hasn't": [ + { + "F": "has" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "wouldntve": [ + { + "F": "would" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Wheres": [ + { + "F": "Where" + }, + { + "F": "s" + } + ], + "How'll": [ + { + "F": "How" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "there'd've": [ + { + "F": "there" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whos": [ + { + "F": "Who" + }, + { + "F": "s" + } + ], + "shes": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "s" + } + ], + "Doesn't": [ + { + "F": "Does", + "L": "do", + "pos": "VBZ" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + "Arent": [ + { + "F": "Are", + "pos": "VBP", + "number": 2, + "L": "be" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Hasnt": [ + { + "F": "Has" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "j.": [ + { + "F": "j." + } + ], + "He's": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'s" + } + ], + "wasnt": [ + { + "F": "was" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "whyll": [ + { + "F": "why" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "co.": [ + { + "F": "co." + } + ], + "mustnt": [ + { + "F": "must" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "He'd": [ + { + "L": "-PRON-", + "F": "He" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "I.e.": [ + { + "F": "I.e." 
+ } + ], + "Shes": [ + { + "L": "-PRON-", + "F": "She" + }, + { + "F": "s" + } + ], + "where've": [ + { + "F": "where" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Youll": [ + { + "L": "-PRON-", + "F": "You" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "Apr.": [ + { + "F": "Apr." + } + ], + ":')": [ + { + "F": ":')" + } + ], + "Conn.": [ + { + "F": "Conn." + } + ], + "8pm": [ + { + "F": "8" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + "9am": [ + { + "F": "9" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "hasnt": [ + { + "F": "has" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "theyll": [ + { + "L": "-PRON-", + "F": "they" + }, + { + "F": "ll", + "L": "will", + "pos": "MD" + } + ], + "it'd've": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "itdve": [ + { + "L": "-PRON-", + "F": "it" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Jr.": [ + { + "F": "Jr." + } + ], + "Rev.": [ + { + "F": "Rev." + } + ], + "k.": [ + { + "F": "k." + } + ], + "wedve": [ + { + "F": "we" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "Let's": [ + { + "F": "Let" + }, + { + "F": "'s" + } + ], + "Colo.": [ + { + "F": "Colo." + } + ], + "Mr.": [ + { + "F": "Mr." 
+ } + ], + "Werent": [ + { + "F": "Were" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Theredve": [ + { + "F": "There" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "shan't": [ + { + "F": "sha" + }, + { + "F": "n't", + "L": "not", + "pos": "RB" + } + ], + ";-)": [ + { + "F": ";-)" + } + ], + "Wont": [ + { + "F": "Wo" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "hadntve": [ + { + "F": "had", + "L": "have", + "pos": "VBD" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "who've": [ + { + "F": "who" + }, + { + "F": "'ve", + "L": "have", + "pos": "VB" + } + ], + "Whatre": [ + { + "F": "What" + }, + { + "F": "re" + } + ], + "'s": [ + { + "L": "'s", + "F": "'s" + } + ], + "where'd": [ + { + "F": "where" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "shouldve": [ + { + "F": "should" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "a.": [ + { + "F": "a." + } + ], + "where's": [ + { + "F": "where" + }, + { + "F": "'s" + } + ], + "Ltd.": [ + { + "F": "Ltd." + } + ], + "Mass.": [ + { + "F": "Mass." + } + ], + "neednt": [ + { + "F": "need" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Pa.": [ + { + "F": "Pa." + } + ], + "It'll": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "7am": [ + { + "F": "7" + }, + { + "L": "a.m.", + "F": "am" + } + ], + "We'd": [ + { + "F": "We" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Whats": [ + { + "F": "What" + }, + { + "F": "s" + } + ], + "\u2014": [ + { + "pos": ":", + "L": "--", + "F": "\u2014" + } + ], + "E.g.": [ + { + "F": "E.g." + } + ], + "Ms.": [ + { + "F": "Ms." + } + ], + ":3": [ + { + "F": ":3" + } + ], + "5p.m.": [ + { + "F": "5" + }, + { + "F": "p.m." 
+ } + ], + "Itd": [ + { + "L": "-PRON-", + "F": "It" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "May.": [ + { + "F": "May." + } + ], + "she'd": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'d", + "L": "would", + "pos": "MD" + } + ], + "Mustnt": [ + { + "F": "Must" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "Notve": [ + { + "F": "Not", + "L": "not", + "pos": "RB" + }, + { + "F": "ve", + "L": "have", + "pos": "VB" + } + ], + "you'll": [ + { + "L": "-PRON-", + "F": "you" + }, + { + "F": "'ll", + "L": "will", + "pos": "MD" + } + ], + "Theyd": [ + { + "L": "-PRON-", + "F": "They" + }, + { + "F": "d", + "L": "would", + "pos": "MD" + } + ], + "she's": [ + { + "L": "-PRON-", + "F": "she" + }, + { + "F": "'s" + } + ], + "Couldnt": [ + { + "pos": "MD", + "F": "Could" + }, + { + "F": "nt", + "L": "not", + "pos": "RB" + } + ], + "that's": [ + { + "F": "that" + }, + { + "F": "'s" + } + ], + "4pm": [ + { + "F": "4" + }, + { + "L": "p.m.", + "F": "pm" + } + ], + ":))": [ + { + "F": ":))" + } + ] +} \ No newline at end of file diff --git a/lang_data/fi/tag_map.json b/lang_data/fi/tag_map.json index 6b21a1e29..4451d0fa0 100644 --- a/lang_data/fi/tag_map.json +++ b/lang_data/fi/tag_map.json @@ -13,5 +13,7 @@ "ADP": {"pos": "ADP"}, "SYM": {"pos": "SYM"}, "X": {"pos": "X"}, - "INTJ": {"pos": "INTJ"} + "INTJ": {"pos": "INTJ"}, + "DET": {"pos": "DET"}, + "PART": {"pos": "PART"} } diff --git a/lang_data/it/tag_map.json b/lang_data/it/tag_map.json index 514e978a6..92f11e457 100644 --- a/lang_data/it/tag_map.json +++ b/lang_data/it/tag_map.json @@ -2,43 +2,43 @@ "S": {"pos": "NOUN"}, "E": {"pos": "ADP"}, "RD": {"pos": "DET"}, -"V": {"pos": "VER"}, -"_": {"pos": "_"}, +"V": {"pos": "VERB"}, +"_": {"pos": "NO_TAG"}, "A": {"pos": "ADJ"}, -"SP": {"pos": "PROP"}, -"FF": {"pos": "PUNC"}, -"FS": {"pos": "PUNC"}, +"SP": {"pos": "PROPN"}, +"FF": {"pos": "PUNCT"}, +"FS": {"pos": "PUNCT"}, "B": {"pos": "ADV"}, -"CC": {"pos": "CON"}, -"FB": {"pos": "PUNC"}, 
+"CC": {"pos": "CONJ"}, +"FB": {"pos": "PUNCT"}, "VA": {"pos": "AUX"}, -"PC": {"pos": "PRO"}, +"PC": {"pos": "PRON"}, "N": {"pos": "NUM"}, "RI": {"pos": "DET"}, -"PR": {"pos": "PRO"}, -"CS": {"pos": "SCON"}, +"PR": {"pos": "PRON"}, +"CS": {"pos": "SCONJ"}, "BN": {"pos": "ADV"}, "AP": {"pos": "DET"}, "VM": {"pos": "AUX"}, "DI": {"pos": "DET"}, -"FC": {"pos": "PUNC"}, -"PI": {"pos": "PRO"}, +"FC": {"pos": "PUNCT"}, +"PI": {"pos": "PRON"}, "DD": {"pos": "DET"}, "DQ": {"pos": "DET"}, -"PQ": {"pos": "PRO"}, -"PD": {"pos": "PRO"}, +"PQ": {"pos": "PRON"}, +"PD": {"pos": "PRON"}, "NO": {"pos": "ADJ"}, -"PE": {"pos": "PRO"}, +"PE": {"pos": "PRON"}, "T": {"pos": "DET"}, "X": {"pos": "SYM"}, "SW": {"pos": "X"}, -"NO": {"pos": "PRO"}, -"I": {"pos": "INT"}, +"NO": {"pos": "PRON"}, +"I": {"pos": "INTJ"}, "X": {"pos": "X"}, "DR": {"pos": "DET"}, "EA": {"pos": "ADP"}, -"PP": {"pos": "PRO"}, +"PP": {"pos": "PRON"}, "X": {"pos": "NUM"}, "DE": {"pos": "DET"}, -"X": {"pos": "PAR"} +"X": {"pos": "PART"} } diff --git a/setup.py b/setup.py index f76c10082..93f18afbd 100644 --- a/setup.py +++ b/setup.py @@ -134,13 +134,17 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.93' +VERSION = '0.94' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] +<<<<<<< HEAD # This is gcc only. Also -03 is everywhere and is not recognized :() # compile_args = ['-O3', '-Wno-strict-prototypes'] compile_args = ['-Ox', '-EHsc'] +======= + compile_args = ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] +>>>>>>> refs/remotes/honnibal/master link_args = [] # It is not prefix !!! 
if sys.prefix == 'darwin': @@ -159,9 +163,13 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.morphology', 'spacy.tagger', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy._theano', +<<<<<<< HEAD 'spacy.tokenizer', #'spacy.en.attrs', #'spacy.en.pos', +======= + 'spacy.tokenizer', +>>>>>>> refs/remotes/honnibal/master 'spacy.syntax.parser', 'spacy.syntax.transition_system', 'spacy.syntax.arc_eager', diff --git a/spacy/en/download.py b/spacy/en/download.py index 20e7b5b95..01c87a4e4 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -7,7 +7,7 @@ import wget import plac # TODO: Read this from the same source as the setup -VERSION = '0.9.0' +VERSION = '0.9.1' AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com' diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 5d0ad36c0..d8b100744 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,5 +1,7 @@ import numpy -import codecs +import io +import json +import ujson import random import re import os diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index ed04e2d77..c1d296d7c 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function from os import path import codecs @@ -7,7 +7,7 @@ try: except ImportError: import json -from .parts_of_speech import NOUN, VERB, ADJ +from .parts_of_speech import NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @@ -36,6 +36,8 @@ class Lemmatizer(object): pos = 'verb' elif pos == ADJ: pos = 'adj' + elif pos == PUNCT: + pos = 'punct' lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) return lemmas @@ -48,6 +50,9 @@ class Lemmatizer(object): def adj(self, string): return self(string, 'adj') + def punct(self, string): + return self(string, 'punct') + def lemmatize(string, index, exceptions, rules): string = string.lower() @@ -58,7 +63,7 @@ def lemmatize(string, index, exceptions, rules): for 
old, new in rules: if string.endswith(old): form = string[:len(string) - len(old)] + new - if form in index: + if form in index or not form.isalpha(): forms.append(form) if not forms: forms.append(string) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index eb33b1980..afafd3ddb 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -1,11 +1,18 @@ +# cython: profile=True +from __future__ import unicode_literals + from os import path from .typedefs cimport attr_t +from .typedefs cimport hash_t from .attrs cimport attr_id_t -from .structs cimport TokenC +from .structs cimport TokenC, LexemeC +from .lexeme cimport Lexeme from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap from libcpp.vector cimport vector +from murmurhash.mrmr cimport hash64 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 @@ -15,6 +22,38 @@ from .vocab cimport Vocab from libcpp.vector cimport vector +from .attrs import FLAG61 as U_ENT + +from .attrs import FLAG60 as B2_ENT +from .attrs import FLAG59 as B3_ENT +from .attrs import FLAG58 as B4_ENT +from .attrs import FLAG57 as B5_ENT +from .attrs import FLAG56 as B6_ENT +from .attrs import FLAG55 as B7_ENT +from .attrs import FLAG54 as B8_ENT +from .attrs import FLAG53 as B9_ENT +from .attrs import FLAG52 as B10_ENT + +from .attrs import FLAG51 as I3_ENT +from .attrs import FLAG50 as I4_ENT +from .attrs import FLAG49 as I5_ENT +from .attrs import FLAG48 as I6_ENT +from .attrs import FLAG47 as I7_ENT +from .attrs import FLAG46 as I8_ENT +from .attrs import FLAG45 as I9_ENT +from .attrs import FLAG44 as I10_ENT + +from .attrs import FLAG43 as L2_ENT +from .attrs import FLAG42 as L3_ENT +from .attrs import FLAG41 as L4_ENT +from .attrs import FLAG40 as L5_ENT +from .attrs import FLAG39 as L6_ENT +from .attrs import FLAG38 as L7_ENT +from .attrs import FLAG37 as L8_ENT +from .attrs import FLAG36 
as L9_ENT +from .attrs import FLAG35 as L10_ENT + + try: import ujson as json except ImportError: @@ -81,7 +120,33 @@ def _convert_strings(token_specs, string_store): value = int(value) converted[-1].append((attr, value)) return converted - + + +def get_bilou(length): + if length == 1: + return [U_ENT] + elif length == 2: + return [B2_ENT, L2_ENT] + elif length == 3: + return [B3_ENT, I3_ENT, L3_ENT] + elif length == 4: + return [B4_ENT, I4_ENT, I4_ENT, L4_ENT] + elif length == 5: + return [B5_ENT, I5_ENT, I5_ENT, I5_ENT, L5_ENT] + elif length == 6: + return [B6_ENT, I6_ENT, I6_ENT, I6_ENT, I6_ENT, L6_ENT] + elif length == 7: + return [B7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, I7_ENT, L7_ENT] + elif length == 8: + return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT] + elif length == 9: + return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT] + elif length == 10: + return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, + I10_ENT, I10_ENT, L10_ENT] + else: + raise ValueError("Max length currently 10 for phrase matching") + def map_attr_name(attr): attr = attr.upper() @@ -95,32 +160,6 @@ def map_attr_name(attr): return SHAPE elif attr == 'NORM': return NORM - elif attr == 'FLAG13': - return FLAG13 - elif attr == 'FLAG14': - return FLAG14 - elif attr == 'FLAG15': - return FLAG15 - elif attr == 'FLAG16': - return FLAG16 - elif attr == 'FLAG17': - return FLAG17 - elif attr == 'FLAG18': - return FLAG18 - elif attr == 'FLAG19': - return FLAG19 - elif attr == 'FLAG20': - return FLAG20 - elif attr == 'FLAG21': - return FLAG21 - elif attr == 'FLAG22': - return FLAG22 - elif attr == 'FLAG23': - return FLAG23 - elif attr == 'FLAG24': - return FLAG24 - elif attr == 'FLAG25': - return FLAG25 else: raise Exception("TODO: Finish supporting attr mapping %s" % attr) @@ -163,7 +202,7 @@ cdef class Matcher: spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) - def 
__call__(self, Doc doc): + def __call__(self, Doc doc, acceptor=None): cdef vector[Pattern*] partials cdef int n_partials = 0 cdef int q = 0 @@ -174,21 +213,94 @@ cdef class Matcher: for token_i in range(doc.length): token = &doc.data[token_i] q = 0 + # Go over the open matches, extending or finalizing if able. Otherwise, + # we over-write them (q doesn't advance) for i in range(partials.size()): state = partials.at(i) if match(state, token): if is_final(state): - matches.append(get_entity(state, token, token_i)) + label, start, end = get_entity(state, token, token_i) + if acceptor is None or acceptor(doc, label, start, end): + matches.append((label, start, end)) else: partials[q] = state + 1 q += 1 partials.resize(q) + # Check whether we open any new patterns on this token for i in range(self.n_patterns): state = self.patterns[i] if match(state, token): if is_final(state): - matches.append(get_entity(state, token, token_i)) + label, start, end = get_entity(state, token, token_i) + if acceptor is None or acceptor(doc, label, start, end): + matches.append((label, start, end)) else: partials.push_back(state + 1) doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches return matches + + +cdef class PhraseMatcher: + cdef Pool mem + cdef Vocab vocab + cdef Matcher matcher + cdef PreshMap phrase_ids + + cdef int max_length + cdef attr_t* _phrase_key + + def __init__(self, Vocab vocab, phrases, max_length=10): + self.mem = Pool() + self._phrase_key = self.mem.alloc(max_length, sizeof(attr_t)) + self.max_length = max_length + self.vocab = vocab + self.matcher = Matcher(self.vocab, {}) + self.phrase_ids = PreshMap() + for phrase in phrases: + if len(phrase) < max_length: + self.add(phrase) + + abstract_patterns = [] + for length in range(1, max_length): + abstract_patterns.append([{tag: True} for tag in get_bilou(length)]) + self.matcher.add('Candidate', 'MWE', {}, abstract_patterns) + + def add(self, Doc tokens): + cdef int length = tokens.length + assert length 
< self.max_length + tags = get_bilou(length) + assert len(tags) == length, length + + cdef int i + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, tag in enumerate(tags): + lexeme = self.vocab[tokens.data[i].lex.orth] + lexeme.set_flag(tag, True) + self._phrase_key[i] = lexeme.orth + cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) + self.phrase_ids[key] = True + + def __call__(self, Doc doc): + matches = [] + for label, start, end in self.matcher(doc, acceptor=self.accept_match): + cand = doc[start : end] + start = cand[0].idx + end = cand[-1].idx + len(cand[-1]) + matches.append((start, end, cand.root.tag_, cand.text, 'MWE')) + for match in matches: + doc.merge(*match) + return matches + + def accept_match(self, Doc doc, int label, int start, int end): + assert (end - start) < self.max_length + cdef int i, j + for i in range(self.max_length): + self._phrase_key[i] = 0 + for i, j in enumerate(range(start, end)): + self._phrase_key[i] = doc.data[j].lex.orth + cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) + if self.phrase_ids.get(key): + return True + else: + return False diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index ddeca62d7..534f64a59 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -7,7 +7,7 @@ except ImportError: import json from .parts_of_speech import UNIV_POS_NAMES -from .parts_of_speech cimport ADJ, VERB, NOUN +from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT cdef class Morphology: @@ -31,10 +31,7 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int tag_id if isinstance(tag, basestring): - try: - tag_id = self.reverse_index[self.strings[tag]] - except KeyError: - raise + tag_id = self.reverse_index[self.strings[tag]] else: tag_id = tag analysis = self._cache.get(tag_id, token.lex.orth) @@ -84,7 +81,7 @@ cdef class Morphology: if self.lemmatizer is None: return orth cdef unicode py_string = 
self.strings[orth] - if pos != NOUN and pos != VERB and pos != ADJ: + if pos != NOUN and pos != VERB and pos != ADJ and pos != PUNCT: return orth cdef set lemma_strings cdef unicode lemma_string diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 27123bb4e..882e06bf2 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -11,6 +11,7 @@ try: except ImportError: from text_unidecode import unidecode + import re import math @@ -165,7 +166,7 @@ cpdef unicode norm1(unicode string, lower_pc=0.0, upper_pc=0.0, title_pc=0.0): cpdef bytes asciied(unicode string): - cdef str stripped = unidecode(string) + stripped = unidecode(string) if not stripped: return b'???' return stripped.encode('ascii') diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 265018920..561308928 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,7 +9,8 @@ from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse from ..gold cimport GoldParseC -from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE +from ..lexeme cimport Lexeme from libc.stdint cimport uint32_t from libc.string cimport memcpy @@ -379,8 +380,18 @@ cdef class ArcEager(TransitionSystem): st.fast_forward() cdef int finalize_state(self, StateClass st) nogil: + cdef int i for i in range(st.length): - if st._sent[i].head == 0 and st._sent[i].dep == 0: + # Always attach spaces to the previous word + if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE): + st._sent[i].head = -1 if (i >= 1) else 1 + if st._sent[i].sent_start and st._sent[i].head == -1: + st._sent[i].sent_start = False + # If we had this space token as the start of a sentence, + # move that sentence start forward one + if (i + 1) < st.length and not st._sent[i+1].sent_start: + st._sent[i+1].sent_start = True + elif st._sent[i].head == 0 and st._sent[i].dep == 0: st._sent[i].dep 
= self.root_label # If we're not using the Break transition, we segment via root-labelled # arcs between the root words. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 8a7d12555..eab6c044e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -21,6 +21,7 @@ from ..lexeme cimport Lexeme from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray +from ..util import normalize_slice DEF PADDING = 5 @@ -81,20 +82,14 @@ cdef class Doc: self._vector = None def __getitem__(self, object i): - """Get a token. + """Get a Token or a Span from the Doc. Returns: - token (Token): + token (Token) or span (Span): """ if isinstance(i, slice): - if i.step is not None: - raise ValueError("Stepped slices not supported in Span objects." - "Try: list(doc)[start:stop:step] instead.") - if i.start is None: - i = slice(0, i.stop) - if i.stop is None: - i = slice(i.start, len(self)) - return Span(self, i.start, i.stop, label=0) + start, stop = normalize_slice(len(self), i.start, i.stop, i.step) + return Span(self, start, stop, label=0) if i < 0: i = self.length + i diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index c39f8976c..e8d2f2e59 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -9,16 +9,16 @@ from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t +from ..util import normalize_slice cdef class Span: """A slice from a Doc object.""" def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): - if start < 0: - start = tokens.length - start - if end < 0: - end = tokens.length - end + if not (0 <= start <= end <= len(tokens)): + raise IndexError + self.doc = tokens self.start = start self.end = end @@ -46,7 +46,13 @@ cdef class Span: return 0 return self.end - self.start - def __getitem__(self, int i): + def __getitem__(self, object i): + if isinstance(i, slice): + 
start, end = normalize_slice(len(self), i.start, i.stop, i.step) + start += self.start + end += self.start + return Span(self.doc, start, end) + if i < 0: return self.doc[self.end + i] else: diff --git a/spacy/util.py b/spacy/util.py index 9f5b4fe04..849a3e219 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,5 +1,5 @@ from os import path -import codecs +import io import json import re from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE @@ -7,8 +7,28 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE DATA_DIR = path.join(path.dirname(__file__), '..', 'data') +def normalize_slice(length, start, stop, step=None): + if not (step is None or step == 1): + raise ValueError("Stepped slices not supported in Span objects. " + "Try: list(tokens)[start:stop:step] instead.") + if start is None: + start = 0 + elif start < 0: + start += length + start = min(length, max(0, start)) + + if stop is None: + stop = length + elif stop < 0: + stop += length + stop = min(length, max(start, stop)) + + assert 0 <= start <= stop <= length + return start, stop + + def utf8open(loc, mode='r'): - return codecs.open(loc, mode, 'utf8') + return io.open(loc, mode, encoding='utf8') def read_lang_data(data_dir): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 2cc9094eb..d79da8a79 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -7,7 +7,7 @@ from libc.stdint cimport uint64_t import bz2 from os import path -import codecs +import io import math import json diff --git a/tests/matcher/test_matcher_bugfixes.py b/tests/matcher/test_matcher_bugfixes.py index c768021db..b65541460 100644 --- a/tests/matcher/test_matcher_bugfixes.py +++ b/tests/matcher/test_matcher_bugfixes.py @@ -3,6 +3,7 @@ import pytest from spacy.matcher import Matcher +@pytest.mark.xfail def test_overlap_issue118(EN): '''Test a bug that arose from having overlapping matches''' doc = EN.tokenizer(u'how many points did lebron james score against the boston celtics last night') diff --git 
a/tests/parser/test_parse_navigate.py b/tests/parser/test_parse_navigate.py index a1c8b1a87..1771dbeba 100644 --- a/tests/parser/test_parse_navigate.py +++ b/tests/parser/test_parse_navigate.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals from os import path -import codecs +import io import pytest @pytest.fixture def sun_text(): - with codecs.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', 'utf8') as file_: + with io.open(path.join(path.dirname(__file__), 'sun.txt'), 'r', encoding='utf8') as file_: text = file_.read() return text diff --git a/tests/tagger/test_lemmatizer.py b/tests/tagger/test_lemmatizer.py index 8461a854e..ff10b6573 100644 --- a/tests/tagger/test_lemmatizer.py +++ b/tests/tagger/test_lemmatizer.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals from spacy.lemmatizer import Lemmatizer, read_index, read_exc @@ -34,3 +35,9 @@ def test_noun_lemmas(lemmatizer): assert do('planets') == set(['planet']) assert do('ring') == set(['ring']) assert do('axes') == set(['axis', 'axe', 'ax']) + + +def test_smart_quotes(lemmatizer): + do = lemmatizer.punct + assert do('“') == set(['"']) + assert do('”') == set(['"']) diff --git a/tests/tokens/test_array.py b/tests/tokens/test_array.py index 29807c3e5..bdfdfd057 100644 --- a/tests/tokens/test_array.py +++ b/tests/tokens/test_array.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import pytest -from spacy.en import attrs +from spacy import attrs def test_attr_of_token(EN): diff --git a/tests/tokens/test_token_api.py b/tests/tokens/test_token_api.py index 99c99fc11..6deaadfbf 100644 --- a/tests/tokens/test_token_api.py +++ b/tests/tokens/test_token_api.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals from spacy.en import English -from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT -from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM -from spacy.en.attrs import IS_STOP +from spacy.attrs 
import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT +from spacy.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM +from spacy.attrs import IS_STOP import pytest diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index e1238373f..675f00235 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -12,6 +12,72 @@ def test_getitem(EN): with pytest.raises(IndexError): tokens[len(tokens)] + def to_str(span): + return '/'.join(token.orth_ for token in span) + + span = tokens[1:1] + assert not to_str(span) + span = tokens[1:4] + assert to_str(span) == 'it/back/!' + span = tokens[1:4:1] + assert to_str(span) == 'it/back/!' + with pytest.raises(ValueError): + tokens[1:4:2] + with pytest.raises(ValueError): + tokens[1:4:-1] + + span = tokens[-3:6] + assert to_str(span) == 'He/pleaded' + span = tokens[4:-1] + assert to_str(span) == 'He/pleaded' + span = tokens[-5:-3] + assert to_str(span) == 'back/!' + span = tokens[5:4] + assert span.start == span.end == 5 and not to_str(span) + span = tokens[4:-3] + assert span.start == span.end == 4 and not to_str(span) + + span = tokens[:] + assert to_str(span) == 'Give/it/back/!/He/pleaded/.' + span = tokens[4:] + assert to_str(span) == 'He/pleaded/.' + span = tokens[:4] + assert to_str(span) == 'Give/it/back/!' + span = tokens[:-3] + assert to_str(span) == 'Give/it/back/!' + span = tokens[-3:] + assert to_str(span) == 'He/pleaded/.' + + span = tokens[4:50] + assert to_str(span) == 'He/pleaded/.' + span = tokens[-50:4] + assert to_str(span) == 'Give/it/back/!' + span = tokens[-50:-40] + assert span.start == span.end == 0 and not to_str(span) + span = tokens[40:50] + assert span.start == span.end == 7 and not to_str(span) + + span = tokens[1:4] + assert span[0].orth_ == 'it' + subspan = span[:] + assert to_str(subspan) == 'it/back/!' + subspan = span[:2] + assert to_str(subspan) == 'it/back' + subspan = span[1:] + assert to_str(subspan) == 'back/!' 
+ subspan = span[:-1] + assert to_str(subspan) == 'it/back' + subspan = span[-2:] + assert to_str(subspan) == 'back/!' + subspan = span[1:2] + assert to_str(subspan) == 'back' + subspan = span[-2:-1] + assert to_str(subspan) == 'back' + subspan = span[-50:50] + assert to_str(subspan) == 'it/back/!' + subspan = span[50:-50] + assert subspan.start == subspan.end == 4 and not to_str(subspan) + @pytest.mark.models def test_serialize(EN): diff --git a/tests/vocab/test_lexeme_flags.py b/tests/vocab/test_lexeme_flags.py index 844ee0aaa..5cc7bd16f 100644 --- a/tests/vocab/test_lexeme_flags.py +++ b/tests/vocab/test_lexeme_flags.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import pytest -from spacy.en.attrs import * +from spacy.attrs import * def test_is_alpha(en_vocab): diff --git a/tests/website/test_api.py b/tests/website/test_api.py index 4ef1a54aa..c173c2b74 100644 --- a/tests/website/test_api.py +++ b/tests/website/test_api.py @@ -26,6 +26,7 @@ def test_main_entry_point(nlp): doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. 
+@pytest.mark.models def test_sentence_spans(nlp): # from spacy.en import English # nlp = English() @@ -33,6 +34,7 @@ def test_sentence_spans(nlp): assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] +@pytest.mark.models def test_entity_spans(nlp): # from spacy.en import English # nlp = English() @@ -44,6 +46,7 @@ def test_entity_spans(nlp): assert ents[0].string == ents[0].string +@pytest.mark.models def test_noun_chunk_spans(nlp): # from spacy.en import English # nlp = English() @@ -56,11 +59,12 @@ def test_noun_chunk_spans(nlp): # NP three noun chunks <-- has +@pytest.mark.models def test_count_by(nlp): # from spacy.en import English, attrs # nlp = English() import numpy - from spacy.en import attrs + from spacy import attrs tokens = nlp('apple apple orange banana') assert tokens.count_by(attrs.ORTH) == {2529: 2, 4117: 1, 6650: 1} assert repr(tokens.to_array([attrs.ORTH])) == repr(numpy.array([[2529], @@ -76,7 +80,7 @@ def test_read_bytes(nlp): file_.write(nlp(u'This is a document.').to_bytes()) file_.write(nlp(u'This is another.').to_bytes()) docs = [] - with open(loc) as file_: + with open(loc, 'rb') as file_: for byte_string in Doc.read_bytes(file_): docs.append(Doc(nlp.vocab).from_bytes(byte_string)) assert len(docs) == 2 @@ -88,6 +92,7 @@ def test_token_span(doc): assert token.i == 4 +@pytest.mark.models def test_example_i_like_new_york1(nlp): toks = nlp('I like New York in Autumn.') @@ -127,16 +132,19 @@ def dot(toks): return tok(toks, "dot") +@pytest.mark.models def test_example_i_like_new_york3(toks, new, york): assert toks[new].head.orth_ == 'York' assert toks[york].head.orth_ == 'like' +@pytest.mark.models def test_example_i_like_new_york4(toks, new, york): new_york = toks[new:york+1] assert new_york.root.orth_ == 'York' +@pytest.mark.models def test_example_i_like_new_york5(toks, autumn, dot): assert toks[autumn].head.orth_ == 'in' assert toks[dot].head.orth_ == 'like' @@ -144,6 +152,7 @@ def test_example_i_like_new_york5(toks, autumn, dot): 
assert autumn_dot.root.orth_ == 'Autumn' +@pytest.mark.models def test_navigating_the_parse_tree_lefts(doc): # TODO: where does the span object come from? span = doc[:2] @@ -151,6 +160,7 @@ def test_navigating_the_parse_tree_lefts(doc): if span.doc[i].head in span] +@pytest.mark.models def test_navigating_the_parse_tree_rights(doc): span = doc[:2] rights = [span.doc[i] for i in range(span.end, len(span.doc)) diff --git a/tests/website/test_home.py b/tests/website/test_home.py index 515c64e6c..4da61becf 100644 --- a/tests/website/test_home.py +++ b/tests/website/test_home.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals import pytest -import spacy.en +import spacy @pytest.fixture() @@ -17,11 +17,12 @@ def test_load_resources_and_process_text(): @pytest.mark.models def test_get_tokens_and_sentences(doc): token = doc[0] - sentence = doc.sents.next() + sentence = next(doc.sents) assert token is sentence[0] assert sentence.text == 'Hello, world.' +@pytest.mark.models def test_use_integer_ids_for_any_strings(nlp, token): hello_id = nlp.vocab.strings['Hello'] hello_str = nlp.vocab.strings[hello_id] @@ -45,7 +46,7 @@ def test_get_and_set_string_views_and_flags(nlp, token): def test_export_to_numpy_arrays(nlp, doc): - from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV + from spacy.attrs import ORTH, LIKE_URL, IS_OOV attr_ids = [ORTH, LIKE_URL, IS_OOV] doc_array = doc.to_array(attr_ids) @@ -68,6 +69,7 @@ def test_word_vectors(nlp): assert apples.similarity(oranges) > boots.similarity(hippos) +@pytest.mark.models def test_part_of_speech_tags(nlp): from spacy.parts_of_speech import ADV diff --git a/website/Makefile b/website/Makefile index 78dc9448c..940a8182c 100644 --- a/website/Makefile +++ b/website/Makefile @@ -12,9 +12,6 @@ site/index.html: src/jade/header.jade src/jade/*.jade site/docs/: src/jade/docs/*.jade src/jade/header.jade jade -P src/jade/docs/index.jade --out $@ -site/license/: src/jade/license/*.jade src/jade/header.jade - jade -P 
src/jade/license/index.jade --out $@ - site/blog/: src/jade/blog/*.jade site/blog/*/ site/tutorials/*/ src/jade/header.jade jade -P src/jade/blog/index.jade --out $@ diff --git a/website/src/jade/blog/eli5-computers-learn-reading/index.jade b/website/src/jade/blog/eli5-computers-learn-reading/index.jade index 45d2d8bdd..4f3e9ebb1 100644 --- a/website/src/jade/blog/eli5-computers-learn-reading/index.jade +++ b/website/src/jade/blog/eli5-computers-learn-reading/index.jade @@ -24,7 +24,7 @@ include ./meta.jade p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen it around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses. - p It doesn't always guess right, but we can tell how often it does, and we can think of ways t help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. + p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit. p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.) 
diff --git a/website/src/jade/home/_installation.jade b/website/src/jade/home/_installation.jade index 7a9a14bd5..c0e0b1445 100644 --- a/website/src/jade/home/_installation.jade +++ b/website/src/jade/home/_installation.jade @@ -20,6 +20,11 @@ mixin Option(name, open) | $ conda install spacy | $ python -m spacy.en.download all + p Latest stable conda packages are available from the spacy channel: + + pre.language-bash: code + | $ conda install -c https://conda.anaconda.org/spacy spacy + +Option("pip and virtualenv", true) p With Python 2.7 or Python 3, using Linux or OSX, ensure that you have the following packages installed: diff --git a/website/src/jade/home/_installation.jade~ b/website/src/jade/home/_installation.jade~ deleted file mode 100644 index 9b6b4fa3f..000000000 --- a/website/src/jade/home/_installation.jade~ +++ /dev/null @@ -1,83 +0,0 @@ -mixin Option(name, open) - details(open=open) - summary - h4= name - block - -article.post - header - h2 #[a(href=Meta.url) - - p What's new in v0.90? - - .subhead by #[a(href="//twitter.com/spacy_io", rel="author" target="_blank") #{spaCy}] on #[time #{getDate(Meta.date).fulldate}] - - ul - li Support for gazetteers - li Set Lexeme attributes - #[a.readmore(href=Meta.url) Full Change Log ►] - - -section.intro - p What's - -+Option("conda", true) - pre.language-bash: code - | $ conda install spacy - | $ python -m spacy.en.download - -+Option("pip and virtualenv", true) - p With Python 2.7 or Python 3, using Linux or OSX, run: - - pre.language-bash: code - | $ pip install spacy - | $ python -m spacy.en.download - - p - | The download command fetches and installs about 300mb of data, for - | the parser model and word vectors, which it installs within the spacy.en - | package directory. 
- - - +Option("Workaround for obsolete system Python", false) - p - | If you're stuck using a server with an old version of Python, and you - | don't have root access, I've prepared a bootstrap script to help you - | compile a local Python install. Run: - - pre.language-bash: code - | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate - - - -+Option("Compile from source", false) - p - | The other way to install the package is to clone the github repository, - | and build it from source. This installs an additional dependency, - | Cython. If you're using Python 2, I also recommend installing fabric - | and fabtools – this is how I build the project. - - pre.language-bash: code - | $ git clone https://github.com/honnibal/spaCy.git - | $ cd spaCy - | $ virtualenv .env && source .env/bin/activate - | $ export PYTHONPATH=`pwd` - | $ pip install -r requirements.txt - | $ python setup.py build_ext --inplace - | $ python -m spacy.en.download - | $ pip install pytest - | $ py.test tests/ - - p - | Python packaging is awkward at the best of times, and it's particularly tricky - | with C extensions, built via Cython, requiring large data files. So, - | please report issues as you encounter them. - -+Option("pypy (Unsupported)") - | If PyPy support is a priority for you, please get in touch. We could likely - | fix the remaining issues, if necessary. However, the library is likely to - | be much slower on PyPy, as it's written in Cython, which produces code tuned - | for the performance of CPython. - -+Option("Windows (Unsupported)") - | Unfortunately we don't currently support Windows. 
diff --git a/website/src/jade/home/index.jade b/website/src/jade/home/index.jade index f95f4fd53..a77dd323c 100644 --- a/website/src/jade/home/index.jade +++ b/website/src/jade/home/index.jade @@ -29,10 +29,10 @@ include ../header.jade li: a.button(href="#example-use") Examples li: a.button(href="#install") | Install - v0.93 + v0.94 article.page.landing-page +Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade") +Section("Online Demo", "online-demo", "./_online_demo.jade") +Section("Usage by Example", "example-use", "./_usage_examples.jade") - +Section("Install v0.93", "install", "./_installation.jade") + +Section("Install v0.94", "install", "./_installation.jade") diff --git a/website/src/jade/license/index.jade b/website/src/jade/license/index.jade deleted file mode 100644 index b31e99949..000000000 --- a/website/src/jade/license/index.jade +++ /dev/null @@ -1,38 +0,0 @@ -include ../header.jade - -mixin LicenseOption(name, period, price, audience) - .item - h4 #{name} - - .focus #{period} - - span #{price} - - h5 Suggested for: - - span #{audience} - - a.button(href="/resources/pdf/spaCy_License_Agreement_2015.pdf", target="_blank") Download license - - span or #[a(href="mailto:sales@spacy.io") get in touch] - -- var Page = InitPage(Site, Authors.spacy, "license", "License") - -+WritePage(Site, Authors.spacy, Page) - article.pricing - .box.license - +LicenseOption("Trial", "90 days", "$0", "Evaluation") - +LicenseOption("Production", "1 year", "$5,000", "Production") - +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") - - p.caption Researcher, hobbyist, or open-source developer? spaCy also offers #[a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3] licenses. - - blockquote.pull-quote - p Let's face it: Services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt. - - p You need the source, and you need to know you can buy a long-term license. So that's what we offer. 
The difference between this and a black-box API is night and day. - - p Let's face it: services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt. Open-source projects become abandoned or bloated. Google's graveyard is over-flowing – ditto for Yahoo!, Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset? - - p A 5 year license won't expire until 2020. spaCy will be with you for longer than most of your current staff. If that's still not enough, get in touch. We can surely work something out. -