spaCy/lang_data/en/generate_specials.py

# -*- coding: utf-8 -*-
import json

# Contraction suffixes that can be peeled off the end of a compound ending
# (e.g. "n't've") when it is split back into its individual parts.
contractions = {
    "n't",
    "'nt",
    "not",
    "'ve",
    "'d",
    "'ll",
    "'s",
    "'m",
    "'ma",
    "'re",
}

# Attributes attached to each token produced when a contraction is split:
# "L" is the lemma, "pos" the part-of-speech tag, and "number" / "tenspect"
# are given where known.  A missing key means no value is supplied for that
# attribute.  The original author flagged every such gap as an open
# question; they are summarised here instead of on each line:
#   - no lemma:        could, have
#   - no POS:          do, he, i, it, she, they, you
#   - no lemma or POS: every token mapped to {}
token_properties = {
    "ai": {"L": "be", "pos": "VBP", "number": 2},
    "are": {"L": "be", "pos": "VBP", "number": 2},
    "ca": {"L": "can", "pos": "MD"},
    "can": {"L": "can", "pos": "MD"},
    "could": {"pos": "MD"},
    "'d": {"L": "would", "pos": "MD"},
    "did": {"L": "do", "pos": "VBD"},
    "do": {"L": "do"},
    "does": {"L": "do", "pos": "VBZ"},
    "had": {"L": "have", "pos": "VBD"},
    "has": {},
    "have": {"pos": "VB"},
    "he": {"L": "-PRON-"},
    "how": {},
    "i": {"L": "-PRON-"},
    "is": {"L": "be", "pos": "VBZ"},
    "it": {"L": "-PRON-"},
    "let": {},
    "'ll": {"L": "will", "pos": "MD"},
    "'m": {"L": "be", "pos": "VBP", "number": 1, "tenspect": 1},
    "'ma": {},
    "might": {},
    "must": {},
    "need": {},
    "not": {"L": "not", "pos": "RB"},
    "'nt": {"L": "not", "pos": "RB"},
    "n't": {"L": "not", "pos": "RB"},
    "'re": {},
    "'s": {},
    "sha": {},
    "she": {"L": "-PRON-"},
    "should": {},
    "that": {},
    "there": {},
    "they": {"L": "-PRON-"},
    "was": {},
    "we": {},
    "were": {},
    "what": {},
    "when": {},
    "where": {},
    "who": {},
    "why": {},
    "wo": {},
    "would": {},
    "you": {"L": "-PRON-"},
    "'ve": {"L": "have", "pos": "VB"},
}

# Base tokens mapped to the endings that may be attached to them.  Every
# ending carries a list of flags naming the surface forms that must NOT be
# generated for that token + ending pair:
#   "lower"      - the lowercase form
#   "upper"      - the capitalized form
#   "contrLower" - the lowercase form with the apostrophe (') removed
#   "contrUpper" - the capitalized form with the apostrophe (') removed
# Example: "he" + "'ll" lists both contr* flags, because dropping the
# apostrophe would otherwise create the unrelated word "hell" / "Hell".
starting_tokens = {
    "ai": {"n't": []},
    "are": {"n't": []},
    "ca": {"n't": []},
    "can": {"not": []},
    "could": {
        "'ve": [],
        "n't": [],
        "n't've": [],
    },
    "did": {"n't": []},
    "does": {"n't": []},
    "do": {"n't": []},
    "had": {
        "n't": [],
        "n't've": [],
    },
    "has": {"n't": []},
    "have": {"n't": []},
    "he": {
        "'d": [],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'s": [],
    },
    "how": {
        "'d": [],
        "'ll": [],
        "'s": [],
    },
    "i": {
        "'d": ["contrLower", "contrUpper"],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'m": [],
        "'ma": [],
        "'ve": [],
    },
    "is": {"n't": []},
    "it": {
        "'d": [],
        "'d've": [],
        "'ll": [],
        "'s": ["contrLower", "contrUpper"],
    },
    "let": {"'s": ["contrLower", "contrUpper"]},
    "might": {
        "n't": [],
        "n't've": [],
        "'ve": [],
    },
    "must": {
        "n't": [],
        "'ve": [],
    },
    "need": {"n't": []},
    "not": {"'ve": []},
    "sha": {"n't": []},
    "she": {
        "'d": ["contrLower", "contrUpper"],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'s": [],
    },
    "should": {
        "'ve": [],
        "n't": [],
        "n't've": [],
    },
    "that": {"'s": []},
    "there": {
        "'d": [],
        "'d've": [],
        "'s": ["contrLower", "contrUpper"],
    },
    "they": {
        "'d": [],
        "'d've": [],
        "'ll": [],
        "'re": [],
        "'ve": [],
    },
    "was": {"n't": []},
    "we": {
        "'d": ["contrLower", "contrUpper"],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'re": ["contrLower", "contrUpper"],
        "'ve": [],
    },
    "were": {"n't": []},
    "what": {
        "'ll": [],
        "'re": [],
        "'s": [],
        "'ve": [],
    },
    "when": {"'s": []},
    "where": {
        "'d": [],
        "'s": [],
        "'ve": [],
    },
    "who": {
        "'d": [],
        "'ll": [],
        "'re": ["contrLower", "contrUpper"],
        "'s": [],
        "'ve": [],
    },
    "why": {
        "'ll": [],
        "'re": [],
        "'s": [],
    },
    "wo": {"n't": []},
    "would": {
        "'ve": [],
        "n't": [],
        "n't've": [],
    },
    "you": {
        "'d": [],
        "'d've": [],
        "'ll": [],
        "'re": [],
        "'ve": [],
    },
}

# Special cases that are not produced by attaching a contraction to a base
# token, so they are listed by hand.  Each key is the exact text to match and
# each value is the list of tokens it becomes: "F" is the token's surface
# form, "L" an optional lemma and "pos" an optional part-of-speech tag.
hardcoded_specials = {
    # Possessive marker, with a straight or a curly quote.  U+2019 (right
    # single quotation mark) is the character used for a typographic
    # apostrophe; the U+2018 (left single quotation mark) forms are kept so
    # that previously generated entries do not disappear.
    "'s": [{"F": "'s", "L": "'s"}],
    "'S": [{"F": "'S", "L": "'s"}],
    u"\u2018s": [{"F": u"\u2018s", "L": "'s"}],
    u"\u2018S": [{"F": u"\u2018S", "L": "'s"}],
    u"\u2019s": [{"F": u"\u2019s", "L": "'s"}],
    u"\u2019S": [{"F": u"\u2019S", "L": "'s"}],

    "'em": [{"F": "'em"}],
    "'ol": [{"F": "'ol"}],
    "vs.": [{"F": "vs."}],

    # Titles.
    "Ms.": [{"F": "Ms."}],
    "Mr.": [{"F": "Mr."}],
    "Dr.": [{"F": "Dr."}],
    "Mrs.": [{"F": "Mrs."}],
    "Messrs.": [{"F": "Messrs."}],
    "Gov.": [{"F": "Gov."}],
    "Gen.": [{"F": "Gen."}],

    "Mt.": [{"F": "Mt.", "L": "Mount"}],

    "''": [{"F": "''"}],

    # Em dash, given an explicit POS tag.
    "—": [{"F": "—", "L": "--", "pos": ":"}],

    # Company suffixes.
    "Corp.": [{"F": "Corp."}],
    "Inc.": [{"F": "Inc."}],
    "Co.": [{"F": "Co."}],
    "co.": [{"F": "co."}],
    "Ltd.": [{"F": "Ltd."}],
    "Bros.": [{"F": "Bros."}],

    "Rep.": [{"F": "Rep."}],
    "Sen.": [{"F": "Sen."}],
    "Jr.": [{"F": "Jr."}],
    "Rev.": [{"F": "Rev."}],
    "Adm.": [{"F": "Adm."}],
    "St.": [{"F": "St."}],

    # Times of day: "<hour>a.m." / "<hour>am" split into the hour and the
    # meridiem marker; the undotted marker is lemmatised to the dotted one.
    "a.m.": [{"F": "a.m."}],
    "p.m.": [{"F": "p.m."}],

    "1a.m.": [{"F": "1"}, {"F": "a.m."}],
    "2a.m.": [{"F": "2"}, {"F": "a.m."}],
    "3a.m.": [{"F": "3"}, {"F": "a.m."}],
    "4a.m.": [{"F": "4"}, {"F": "a.m."}],
    "5a.m.": [{"F": "5"}, {"F": "a.m."}],
    "6a.m.": [{"F": "6"}, {"F": "a.m."}],
    "7a.m.": [{"F": "7"}, {"F": "a.m."}],
    "8a.m.": [{"F": "8"}, {"F": "a.m."}],
    "9a.m.": [{"F": "9"}, {"F": "a.m."}],
    "10a.m.": [{"F": "10"}, {"F": "a.m."}],
    "11a.m.": [{"F": "11"}, {"F": "a.m."}],
    "12a.m.": [{"F": "12"}, {"F": "a.m."}],
    "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
    "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
    "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
    "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
    "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
    "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
    "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
    "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
    "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
    "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
    "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
    "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],

    # ("p.m." itself is defined once, next to "a.m." above.)
    "1p.m.": [{"F": "1"}, {"F": "p.m."}],
    "2p.m.": [{"F": "2"}, {"F": "p.m."}],
    "3p.m.": [{"F": "3"}, {"F": "p.m."}],
    "4p.m.": [{"F": "4"}, {"F": "p.m."}],
    "5p.m.": [{"F": "5"}, {"F": "p.m."}],
    "6p.m.": [{"F": "6"}, {"F": "p.m."}],
    "7p.m.": [{"F": "7"}, {"F": "p.m."}],
    "8p.m.": [{"F": "8"}, {"F": "p.m."}],
    "9p.m.": [{"F": "9"}, {"F": "p.m."}],
    "10p.m.": [{"F": "10"}, {"F": "p.m."}],
    "11p.m.": [{"F": "11"}, {"F": "p.m."}],
    "12p.m.": [{"F": "12"}, {"F": "p.m."}],
    "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
    "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
    "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
    "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
    "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
    "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
    "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
    "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
    "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
    "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
    "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
    "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],

    # Month abbreviations.
    "Jan.": [{"F": "Jan."}],
    "Feb.": [{"F": "Feb."}],
    "Mar.": [{"F": "Mar."}],
    "Apr.": [{"F": "Apr."}],
    "May.": [{"F": "May."}],
    "Jun.": [{"F": "Jun."}],
    "Jul.": [{"F": "Jul."}],
    "Aug.": [{"F": "Aug."}],
    "Sep.": [{"F": "Sep."}],
    "Sept.": [{"F": "Sept."}],
    "Oct.": [{"F": "Oct."}],
    "Nov.": [{"F": "Nov."}],
    "Dec.": [{"F": "Dec."}],

    # US state abbreviations.
    "Ala.": [{"F": "Ala."}],
    "Ariz.": [{"F": "Ariz."}],
    "Ark.": [{"F": "Ark."}],
    "Calif.": [{"F": "Calif."}],
    "Colo.": [{"F": "Colo."}],
    "Conn.": [{"F": "Conn."}],
    "Del.": [{"F": "Del."}],
    "D.C.": [{"F": "D.C."}],
    "Fla.": [{"F": "Fla."}],
    "Ga.": [{"F": "Ga."}],
    "Ill.": [{"F": "Ill."}],
    "Ind.": [{"F": "Ind."}],
    "Kans.": [{"F": "Kans."}],
    "Kan.": [{"F": "Kan."}],
    "Ky.": [{"F": "Ky."}],
    "La.": [{"F": "La."}],
    "Md.": [{"F": "Md."}],
    "Mass.": [{"F": "Mass."}],
    "Mich.": [{"F": "Mich."}],
    "Minn.": [{"F": "Minn."}],
    "Miss.": [{"F": "Miss."}],
    "Mo.": [{"F": "Mo."}],
    "Mont.": [{"F": "Mont."}],
    "Nebr.": [{"F": "Nebr."}],
    "Neb.": [{"F": "Neb."}],
    "Nev.": [{"F": "Nev."}],
    "N.H.": [{"F": "N.H."}],
    "N.J.": [{"F": "N.J."}],
    "N.M.": [{"F": "N.M."}],
    "N.Y.": [{"F": "N.Y."}],
    "N.C.": [{"F": "N.C."}],
    "N.D.": [{"F": "N.D."}],
    "Okla.": [{"F": "Okla."}],
    "Ore.": [{"F": "Ore."}],
    "Pa.": [{"F": "Pa."}],
    "Tenn.": [{"F": "Tenn."}],
    "Va.": [{"F": "Va."}],
    "Wash.": [{"F": "Wash."}],
    "Wis.": [{"F": "Wis."}],

    # Emoticons, kept as single tokens.
    ":)": [{"F": ":)"}],
    "<3": [{"F": "<3"}],
    ";)": [{"F": ";)"}],
    "(:": [{"F": "(:"}],
    ":(": [{"F": ":("}],
    "-_-": [{"F": "-_-"}],
    "=)": [{"F": "=)"}],
    ":/": [{"F": ":/"}],
    ":>": [{"F": ":>"}],
    ";-)": [{"F": ";-)"}],
    ":Y": [{"F": ":Y"}],
    ":P": [{"F": ":P"}],
    ":-P": [{"F": ":-P"}],
    ":3": [{"F": ":3"}],
    "=3": [{"F": "=3"}],
    "xD": [{"F": "xD"}],
    "^_^": [{"F": "^_^"}],
    "=]": [{"F": "=]"}],
    "=D": [{"F": "=D"}],
    "<333": [{"F": "<333"}],
    ":))": [{"F": ":))"}],
    ":0": [{"F": ":0"}],
    "-__-": [{"F": "-__-"}],
    "xDD": [{"F": "xDD"}],
    "o_o": [{"F": "o_o"}],
    "o_O": [{"F": "o_O"}],
    "V_V": [{"F": "V_V"}],
    "=[[": [{"F": "=[["}],
    "<33": [{"F": "<33"}],
    ";p": [{"F": ";p"}],
    ";D": [{"F": ";D"}],
    ";-p": [{"F": ";-p"}],
    ";(": [{"F": ";("}],
    ":p": [{"F": ":p"}],
    ":]": [{"F": ":]"}],
    ":O": [{"F": ":O"}],
    ":-/": [{"F": ":-/"}],
    ":-)": [{"F": ":-)"}],
    ":(((": [{"F": ":((("}],
    ":((": [{"F": ":(("}],
    ":')": [{"F": ":')"}],
    "(^_^)": [{"F": "(^_^)"}],
    "(=": [{"F": "(="}],
    "o.O": [{"F": "o.O"}],
    "\")": [{"F": "\")"}],

    # Single lowercase letter followed by a period.
    "a.": [{"F": "a."}],
    "b.": [{"F": "b."}],
    "c.": [{"F": "c."}],
    "d.": [{"F": "d."}],
    "e.": [{"F": "e."}],
    "f.": [{"F": "f."}],
    "g.": [{"F": "g."}],
    "h.": [{"F": "h."}],
    "i.": [{"F": "i."}],
    "j.": [{"F": "j."}],
    "k.": [{"F": "k."}],
    "l.": [{"F": "l."}],
    "m.": [{"F": "m."}],
    "n.": [{"F": "n."}],
    "o.": [{"F": "o."}],
    "p.": [{"F": "p."}],
    "q.": [{"F": "q."}],
    "r.": [{"F": "r."}],
    "s.": [{"F": "s."}],
    "t.": [{"F": "t."}],
    "u.": [{"F": "u."}],
    "v.": [{"F": "v."}],
    "w.": [{"F": "w."}],
    "x.": [{"F": "x."}],
    "y.": [{"F": "y."}],
    "z.": [{"F": "z."}],

    "i.e.": [{"F": "i.e."}],
    "I.e.": [{"F": "I.e."}],
    "I.E.": [{"F": "I.E."}],
    "e.g.": [{"F": "e.g."}],
    "E.g.": [{"F": "E.g."}],
    "E.G.": [{"F": "E.G."}],

    # Whitespace characters, tagged "SP".
    "\n": [{"F": "\n", "pos": "SP"}],
    "\t": [{"F": "\t", "pos": "SP"}],
    " ": [{"F": " ", "pos": "SP"}],
    u"\xa0": [{"F": u"\xa0", "pos": "SP", "L": "  "}],
}

def get_double_contractions(ending):
    """Split a compound ending into the contractions it is made of.

    Known contractions (the module-level `contractions` set) are peeled off
    the right-hand end of `ending` one at a time until none matches, e.g.
    "n't've" -> ["n't", "'ve"].

    ending: the compound ending to split.

    Returns the matched contractions in left-to-right order.  Any leading
    text that is not itself a known contraction is dropped, so the result may
    be empty.
    """
    endings = []

    while ending:
        # Skip empty strings: every string "ends with" "", and removing it
        # would never shorten `ending`.
        matches = [c for c in contractions if c and ending.endswith(c)]
        if not matches:
            break

        # Prefer the longest suffix so the result does not depend on the
        # iteration order of the set.
        suffix = max(matches, key=len)
        endings.append(suffix)

        # Remove exactly the matched suffix.  str.rstrip() must not be used
        # here: its argument is a set of characters, so it can also strip
        # characters belonging to the preceding contraction (for example
        # "'re've".rstrip("'ve") leaves "'r" and "'re" is lost).
        ending = ending[:-len(suffix)]

    endings.reverse()  # matches were collected right-to-left
    return endings

def get_token_properties(token, capitalize=False, remove_contractions=False):
    """Build the property dict for one token of a special-case entry.

    Looks `token` up in `token_properties` and returns a copy of its
    attributes with the token's surface form stored under "F".

    token: key into `token_properties` (e.g. "do" or "n't").
    capitalize: if true, the surface form is `token.capitalize()`.
    remove_contractions: if true, every apostrophe (') is removed from the
        surface form.

    Returns a new dict, so adding "F" never mutates the shared
    `token_properties` table.  A token with no entry in the table yields a
    dict containing only "F".
    """
    # A bare .get(token) returns None for an unknown token and dict(None)
    # fails with an opaque TypeError.  Fall back to an empty mapping, which is
    # what the table already uses for tokens without a known lemma or POS.
    props = dict(token_properties.get(token, {}))

    form = token
    if capitalize:
        form = form.capitalize()
    if remove_contractions:
        form = form.replace("'", "")

    props["F"] = form
    return props

def create_entry(token, endings, capitalize=False, remove_contractions=False):
    """Return the list of token property dicts for one special case.

    The first element describes the base `token`; one further element follows
    for each item of `endings`, in order.

    token: the base token.
    endings: the contraction(s) attached to the base token.
    capitalize: forwarded for the base token only; endings are never
        capitalized.
    remove_contractions: forwarded for the base token and for every ending.
    """
    head = get_token_properties(
        token,
        capitalize=capitalize,
        remove_contractions=remove_contractions,
    )
    tail = [
        get_token_properties(suffix, remove_contractions=remove_contractions)
        for suffix in endings
    ]
    return [head] + tail

def generate_specials():
    """Build the complete special-case table.

    For every base token in `starting_tokens` and each ending listed for it,
    up to four surface forms are generated: lowercase, capitalized, and both
    of those again with the apostrophes removed.  A form is skipped when its
    flag ("lower", "upper", "contrLower" or "contrUpper") appears in that
    ending's exception list.  Each surface form maps to the list of token
    property dicts built by `create_entry`.

    `hardcoded_specials` is merged in last, so a hard-coded entry replaces a
    generated one that has the same key.

    Returns a new dict mapping surface form -> list of token property dicts.
    """
    # (exception flag, capitalize the base token?, drop the apostrophes?)
    variants = (
        ("lower", False, False),
        ("upper", True, False),
        ("contrLower", False, True),
        ("contrUpper", True, True),
    )

    specials = {}

    for token, possible_endings in starting_tokens.items():
        for ending, exceptions in possible_endings.items():
            # An ending with more than one apostrophe (e.g. "n't've") is a
            # stack of contractions and becomes one token per contraction.
            if ending.count("'") > 1:
                endings = get_double_contractions(ending)
            else:
                endings = [ending]

            for flag, capitalize, strip_apostrophes in variants:
                if flag in exceptions:
                    continue
                head = token.capitalize() if capitalize else token
                tail = ending.replace("'", "") if strip_apostrophes else ending
                specials[head + tail] = create_entry(
                    token,
                    endings,
                    capitalize=capitalize,
                    remove_contractions=strip_apostrophes,
                )

    # Merge with update() rather than dict(specials, **hardcoded_specials):
    # the keys are arbitrary text (":)", "\n", ...), not identifiers, and
    # should not be passed through keyword-argument unpacking.
    specials.update(hardcoded_specials)

    return specials

if __name__ == "__main__":
    # Script entry point: build the table and write it, pretty-printed with a
    # two-space indent, to specials.json in the current working directory.
    with open("specials.json", "w") as out:
        json.dump(generate_specials(), out, indent=2)
* Make generate_specials pretty-print the specials.json file 2015-10-09 08:23:47 +00:00			`# -#- coding: utf-8 -*-`
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00			`import json`

			`contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"}`

			`# contains the lemmas, parts of speech, number, and tenspect of`
			`# potential tokens generated after splitting contractions off`
			`token_properties = {`

			`"ai": {"L": "be", "pos": "VBP", "number": 2},`
			`"are": {"L": "be", "pos": "VBP", "number": 2},`
			`"ca": {"L": "can", "pos": "MD"},`
			`"can": {"L": "can", "pos": "MD"},`
			`"could": {"pos": "MD"}, # no lemma for could?`
			`"'d": {"L": "would", "pos": "MD"},`
			`"did": {"L": "do", "pos": "VBD"},`
			`"do": {"L": "do"}, # no POS for do?`
			`"does": {"L": "do", "pos": "VBZ"},`
			`"had": {"L": "have", "pos": "VBD"},`
			`"has": {}, # no POS or lemma for has?`
			`"have": {"pos": "VB"}, # no lemma for have?`
			`"he": {"L": "-PRON-"}, # no POS for he?`
			`"how": {}, # no POS or lemma for how?`
			`"i": {"L": "-PRON-"}, # no POS for i?`
			`"is": {"L": "be", "pos": "VBZ"},`
			`"it": {"L": "-PRON-"}, # no POS for it?`
			`"let": {}, # no POS or lemma for let?`
			`"'ll": {"L": "will", "pos": "MD"},`
			`"'m": {"L": "be", "pos": "VBP", "number": 1, "tenspect": 1},`
			`"'ma": {}, # no POS or lemma for ma?`
			`"might": {}, # no POS or lemma for might?`
			`"must": {}, # no POS or lemma for must?`
			`"need": {}, # no POS or lemma for need?`
			`"not": {"L": "not", "pos": "RB"},`
			`"'nt": {"L": "not", "pos": "RB"},`
			`"n't": {"L": "not", "pos": "RB"},`
			`"'re": {}, # no POS or lemma for re?`
			`"'s": {}, # no POS or lemma for s?`
			`"sha": {}, # no POS or lemma for sha?`
			`"she": {"L": "-PRON-"}, # no POS for she?`
			`"should": {}, # no POS or lemma for should?`
			`"that": {}, # no POS or lemma for that?`
			`"there": {}, # no POS or lemma for there?`
			`"they": {"L": "-PRON-"}, # no POS for they?`
			`"was": {}, # no POS or lemma for was?`
			`"we": {}, # no POS or lemma for we?`
			`"were": {}, # no POS or lemma for were?`
			`"what": {}, # no POS or lemma for what?`
			`"when": {}, # no POS or lemma for when?`
			`"where": {}, # no POS or lemma for where?`
			`"who": {}, # no POS or lemma for who?`
			`"why": {}, # no POS or lemma for why?`
			`"wo": {}, # no POS or lemma for wo?`
			`"would": {}, # no POS or lemma for would?`
			`"you": {"L": "-PRON-"}, # no POS or lemma for you?`
			`"'ve": {"L": "have", "pos": "VB"}`

			`}`

			`# contains starting tokens with their potential contractions`
			`# each potential contraction has a list of exceptions`
			`# lower - don't generate the lowercase version`
			`# upper - don't generate the uppercase version`
			`# contrLower - don't generate the lowercase version with apostrophe (') removed`
			`# contrUpper - dont' generate the uppercase version with apostrophe (') removed`
			`# for example, we don't want to create the word "hell" or "Hell" from "he" + "'ll" so`
			`# we add "contrLower" and "contrUpper" to the exceptions list`
			`starting_tokens = {`

			`"ai": {"n't": []},`
			`"are": {"n't": []},`
			`"ca": {"n't": []},`
			`"can": {"not": []},`
			`"could": {"'ve": [], "n't": [], "n't've": []},`
			`"did": {"n't": []},`
			`"does": {"n't": []},`
			`"do": {"n't": []},`
			`"had": {"n't": [], "n't've": []},`
			`"has": {"n't": []},`
			`"have": {"n't": []},`
			`"he": {"'d": [], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},`
			`"how": {"'d": [], "'ll": [], "'s": []},`
			`"i": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'m": [], "'ma": [], "'ve": []},`
			`"is": {"n't": []},`
Add contributor agreement. Add exception to 'it' so that 'its' and 'Its' isn't generated (its =/= it's) 2015-08-24 22:20:06 +00:00			`"it": {"'d": [], "'d've": [], "'ll": [], "'s": ["contrLower", "contrUpper"]},`
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00			`"let": {"'s": ["contrLower", "contrUpper"]},`
			`"might": {"n't": [], "n't've": [], "'ve": []},`
			`"must": {"n't": [], "'ve": []},`
			`"need": {"n't": []},`
			`"not": {"'ve": []},`
			`"sha": {"n't": []},`
			`"she": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},`
			`"should": {"'ve": [], "n't": [], "n't've": []},`
			`"that": {"'s": []},`
			`"there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"]},`
			`"they": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},`
			`"was": {"n't": []},`
Add missing we've and hardcoded 's and 'S 2015-08-22 02:57:47 +00:00			`"we": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'re": ["contrLower", "contrUpper"], "'ve": []},`
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00			`"were": {"n't": []},`
			`"what": {"'ll": [], "'re": [], "'s": [], "'ve": []},`
			`"when": {"'s": []},`
			`"where": {"'d": [], "'s": [], "'ve": []},`
			`"who": {"'d": [], "'ll": [], "'re": ["contrLower", "contrUpper"], "'s": [], "'ve": []},`
			`"why": {"'ll": [], "'re": [], "'s": []},`
			`"wo": {"n't": []},`
			`"would": {"'ve": [], "n't": [], "n't've": []},`
			`"you": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []}`

			`}`

			`# other specials that don't really have contractions`
			`# so they are hardcoded`
			`hardcoded_specials = {`
Add missing we've and hardcoded 's and 'S 2015-08-22 02:57:47 +00:00
			`"'s": [{"F": "'s", "L": "'s"}],`

			`"'S": [{"F": "'S", "L": "'s"}],`
* Add smart-quote possessive marker in generate_specials 2015-10-10 03:06:09 +00:00			`u"\u2018s": [{"F": u"\u2018s", "L": "'s"}],`
			`u"\u2018S": [{"F": u"\u2018S", "L": "'s"}],`
Add missing we've and hardcoded 's and 'S 2015-08-22 02:57:47 +00:00
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00			`"'em": [{"F": "'em"}],`

			`"'ol": [{"F": "'ol"}],`

			`"vs.": [{"F": "vs."}],`

			`"Ms.": [{"F": "Ms."}],`
			`"Mr.": [{"F": "Mr."}],`
			`"Dr.": [{"F": "Dr."}],`
			`"Mrs.": [{"F": "Mrs."}],`
			`"Messrs.": [{"F": "Messrs."}],`
			`"Gov.": [{"F": "Gov."}],`
			`"Gen.": [{"F": "Gen."}],`

			`"Mt.": [{"F": "Mt.", "L": "Mount"}],`

			`"''": [{"F": "''"}],`
* Add unicode em dash to specials.json, so that we can control what POS tag it gets. This way we can prevent sentence boundary detection errors, to address Issue #130. 2015-10-09 08:24:33 +00:00
* Fix pos tag in em-dash in specials 2015-10-09 09:06:37 +00:00			`"—": [{"F": "—", "L": "--", "pos": ":"}],`
* Add unicode em dash to specials.json, so that we can control what POS tag it gets. This way we can prevent sentence boundary detection errors, to address Issue #130. 2015-10-09 08:24:33 +00:00
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00			`"Corp.": [{"F": "Corp."}],`
			`"Inc.": [{"F": "Inc."}],`
			`"Co.": [{"F": "Co."}],`
			`"co.": [{"F": "co."}],`
			`"Ltd.": [{"F": "Ltd."}],`
			`"Bros.": [{"F": "Bros."}],`

			`"Rep.": [{"F": "Rep."}],`
			`"Sen.": [{"F": "Sen."}],`
			`"Jr.": [{"F": "Jr."}],`
			`"Rev.": [{"F": "Rev."}],`
			`"Adm.": [{"F": "Adm."}],`
			`"St.": [{"F": "St."}],`

			`"a.m.": [{"F": "a.m."}],`
			`"p.m.": [{"F": "p.m."}],`

			`"1a.m.": [{"F": "1"}, {"F": "a.m."}],`
			`"2a.m.": [{"F": "2"}, {"F": "a.m."}],`
			`"3a.m.": [{"F": "3"}, {"F": "a.m."}],`
			`"4a.m.": [{"F": "4"}, {"F": "a.m."}],`
			`"5a.m.": [{"F": "5"}, {"F": "a.m."}],`
			`"6a.m.": [{"F": "6"}, {"F": "a.m."}],`
			`"7a.m.": [{"F": "7"}, {"F": "a.m."}],`
			`"8a.m.": [{"F": "8"}, {"F": "a.m."}],`
			`"9a.m.": [{"F": "9"}, {"F": "a.m."}],`
			`"10a.m.": [{"F": "10"}, {"F": "a.m."}],`
			`"11a.m.": [{"F": "11"}, {"F": "a.m."}],`
			`"12a.m.": [{"F": "12"}, {"F": "a.m."}],`
			`"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],`
			`"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],`
			`"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],`
			`"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],`
			`"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],`
			`"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],`
			`"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],`
			`"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],`
			`"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],`
			`"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],`
			`"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],`
			`"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],`


			`"p.m.": [{"F": "p.m."}],`
			`"1p.m.": [{"F": "1"}, {"F": "p.m."}],`
			`"2p.m.": [{"F": "2"}, {"F": "p.m."}],`
			`"3p.m.": [{"F": "3"}, {"F": "p.m."}],`
			`"4p.m.": [{"F": "4"}, {"F": "p.m."}],`
			`"5p.m.": [{"F": "5"}, {"F": "p.m."}],`
			`"6p.m.": [{"F": "6"}, {"F": "p.m."}],`
			`"7p.m.": [{"F": "7"}, {"F": "p.m."}],`
			`"8p.m.": [{"F": "8"}, {"F": "p.m."}],`
			`"9p.m.": [{"F": "9"}, {"F": "p.m."}],`
			`"10p.m.": [{"F": "10"}, {"F": "p.m."}],`
			`"11p.m.": [{"F": "11"}, {"F": "p.m."}],`
			`"12p.m.": [{"F": "12"}, {"F": "p.m."}],`
			`"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],`
			`"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],`
			`"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],`
			`"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],`
			`"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],`
			`"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],`
			`"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],`
			`"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],`
			`"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],`
			`"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],`
			`"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],`
			`"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],`

			`"Jan.": [{"F": "Jan."}],`
			`"Feb.": [{"F": "Feb."}],`
			`"Mar.": [{"F": "Mar."}],`
			`"Apr.": [{"F": "Apr."}],`
			`"May.": [{"F": "May."}],`
			`"Jun.": [{"F": "Jun."}],`
			`"Jul.": [{"F": "Jul."}],`
			`"Aug.": [{"F": "Aug."}],`
			`"Sep.": [{"F": "Sep."}],`
			`"Sept.": [{"F": "Sept."}],`
			`"Oct.": [{"F": "Oct."}],`
			`"Nov.": [{"F": "Nov."}],`
			`"Dec.": [{"F": "Dec."}],`

			`"Ala.": [{"F": "Ala."}],`
			`"Ariz.": [{"F": "Ariz."}],`
			`"Ark.": [{"F": "Ark."}],`
			`"Calif.": [{"F": "Calif."}],`
			`"Colo.": [{"F": "Colo."}],`
			`"Conn.": [{"F": "Conn."}],`
			`"Del.": [{"F": "Del."}],`
			`"D.C.": [{"F": "D.C."}],`
			`"Fla.": [{"F": "Fla."}],`
			`"Ga.": [{"F": "Ga."}],`
			`"Ill.": [{"F": "Ill."}],`
			`"Ind.": [{"F": "Ind."}],`
			`"Kans.": [{"F": "Kans."}],`
			`"Kan.": [{"F": "Kan."}],`
			`"Ky.": [{"F": "Ky."}],`
			`"La.": [{"F": "La."}],`
			`"Md.": [{"F": "Md."}],`
			`"Mass.": [{"F": "Mass."}],`
			`"Mich.": [{"F": "Mich."}],`
			`"Minn.": [{"F": "Minn."}],`
			`"Miss.": [{"F": "Miss."}],`
			`"Mo.": [{"F": "Mo."}],`
			`"Mont.": [{"F": "Mont."}],`
			`"Nebr.": [{"F": "Nebr."}],`
			`"Neb.": [{"F": "Neb."}],`
			`"Nev.": [{"F": "Nev."}],`
			`"N.H.": [{"F": "N.H."}],`
			`"N.J.": [{"F": "N.J."}],`
			`"N.M.": [{"F": "N.M."}],`
			`"N.Y.": [{"F": "N.Y."}],`
			`"N.C.": [{"F": "N.C."}],`
			`"N.D.": [{"F": "N.D."}],`
			`"Okla.": [{"F": "Okla."}],`
			`"Ore.": [{"F": "Ore."}],`
			`"Pa.": [{"F": "Pa."}],`
			`"Tenn.": [{"F": "Tenn."}],`
			`"Va.": [{"F": "Va."}],`
			`"Wash.": [{"F": "Wash."}],`
			`"Wis.": [{"F": "Wis."}],`

			`":)": [{"F": ":)"}],`
			`"<3": [{"F": "<3"}],`
			`";)": [{"F": ";)"}],`
			`"(:": [{"F": "(:"}],`
			`":(": [{"F": ":("}],`
			`"-_-": [{"F": "-_-"}],`
			`"=)": [{"F": "=)"}],`
			`":/": [{"F": ":/"}],`
			`":>": [{"F": ":>"}],`
			`";-)": [{"F": ";-)"}],`
			`":Y": [{"F": ":Y"}],`
			`":P": [{"F": ":P"}],`
			`":-P": [{"F": ":-P"}],`
			`":3": [{"F": ":3"}],`
			`"=3": [{"F": "=3"}],`
			`"xD": [{"F": "xD"}],`
			`"^_^": [{"F": "^_^"}],`
			`"=]": [{"F": "=]"}],`
			`"=D": [{"F": "=D"}],`
			`"<333": [{"F": "<333"}],`
			`":))": [{"F": ":))"}],`
			`":0": [{"F": ":0"}],`
			`"-__-": [{"F": "-__-"}],`
			`"xDD": [{"F": "xDD"}],`
			`"o_o": [{"F": "o_o"}],`
			`"o_O": [{"F": "o_O"}],`
			`"V_V": [{"F": "V_V"}],`
			`"=[[": [{"F": "=[["}],`
			`"<33": [{"F": "<33"}],`
			`";p": [{"F": ";p"}],`
			`";D": [{"F": ";D"}],`
			`";-p": [{"F": ";-p"}],`
			`";(": [{"F": ";("}],`
			`":p": [{"F": ":p"}],`
			`":]": [{"F": ":]"}],`
			`":O": [{"F": ":O"}],`
			`":-/": [{"F": ":-/"}],`
			`":-)": [{"F": ":-)"}],`
			`":(((": [{"F": ":((("}],`
			`":((": [{"F": ":(("}],`
			`":')": [{"F": ":')"}],`
			`"(^_^)": [{"F": "(^_^)"}],`
			`"(=": [{"F": "(="}],`
			`"o.O": [{"F": "o.O"}],`
			`"\")": [{"F": "\")"}],`
			`"a.": [{"F": "a."}],`
			`"b.": [{"F": "b."}],`
			`"c.": [{"F": "c."}],`
			`"d.": [{"F": "d."}],`
			`"e.": [{"F": "e."}],`
			`"f.": [{"F": "f."}],`
			`"g.": [{"F": "g."}],`
			`"h.": [{"F": "h."}],`
			`"i.": [{"F": "i."}],`
			`"j.": [{"F": "j."}],`
			`"k.": [{"F": "k."}],`
			`"l.": [{"F": "l."}],`
			`"m.": [{"F": "m."}],`
			`"n.": [{"F": "n."}],`
			`"o.": [{"F": "o."}],`
			`"p.": [{"F": "p."}],`
			`"q.": [{"F": "q."}],`
add overseen (?) char 2015-09-22 10:29:47 +00:00			`"r.": [{"F": "r."}],`
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00			`"s.": [{"F": "s."}],`
			`"t.": [{"F": "t."}],`
			`"u.": [{"F": "u."}],`
			`"v.": [{"F": "v."}],`
			`"w.": [{"F": "w."}],`
			`"x.": [{"F": "x."}],`
			`"y.": [{"F": "y."}],`
			`"z.": [{"F": "z."}],`

			`"i.e.": [{"F": "i.e."}],`
			`"I.e.": [{"F": "I.e."}],`
			`"I.E.": [{"F": "I.E."}],`
			`"e.g.": [{"F": "e.g."}],`
			`"E.g.": [{"F": "E.g."}],`
			`"E.G.": [{"F": "E.G."}],`
			`"\n": [{"F": "\n", "pos": "SP"}],`
			`"\t": [{"F": "\t", "pos": "SP"}],`
* Add non-breaking space to specials.json 2015-10-10 04:54:06 +00:00			`" ": [{"F": " ", "pos": "SP"}],`
			`u"\xa0": [{"F": u"\xa0", "pos": "SP", "L": " "}]`
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00
			`}`

			`def get_double_contractions(ending):`
			`endings = []`

			`ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])`

			`while ends_with_contraction:`
			`for contraction in contractions:`
			`if ending.endswith(contraction):`
			`endings.append(contraction)`
			`ending = ending.rstrip(contraction)`
			`ends_with_contraction = any([ending.endswith(contraction) for contraction in contractions])`

			`endings.reverse() # reverse because the last ending is put in the list first`
			`return endings`

			`def get_token_properties(token, capitalize=False, remove_contractions=False):`
			`props = dict(token_properties.get(token)) # ensure we copy the dict so we can add the "F" prop`
			`if capitalize:`
			`token = token.capitalize()`
			`if remove_contractions:`
			`token = token.replace("'", "")`

			`props["F"] = token`
			`return props`

			`def create_entry(token, endings, capitalize=False, remove_contractions=False):`

			`properties = []`
			`properties.append(get_token_properties(token, capitalize=capitalize, remove_contractions=remove_contractions))`
			`for e in endings:`
			`properties.append(get_token_properties(e, remove_contractions=remove_contractions))`
			`return properties`

			`def generate_specials():`

			`specials = {}`

			`for token in starting_tokens:`
			`possible_endings = starting_tokens[token]`
			`for ending in possible_endings:`

			`endings = []`
			`if ending.count("'") > 1:`
			`endings.extend(get_double_contractions(ending))`
			`else:`
			`endings.append(ending)`

			`exceptions = possible_endings[ending]`

			`if "lower" not in exceptions:`
			`special = token + ending`
			`specials[special] = create_entry(token, endings)`

			`if "upper" not in exceptions:`
			`special = token.capitalize() + ending`
			`specials[special] = create_entry(token, endings, capitalize=True)`

			`if "contrLower" not in exceptions:`
			`special = token + ending.replace("'", "")`
			`specials[special] = create_entry(token, endings, remove_contractions=True)`

			`if "contrUpper" not in exceptions:`
			`special = token.capitalize() + ending.replace("'", "")`
			`specials[special] = create_entry(token, endings, capitalize=True, remove_contractions=True)`

			`# add in hardcoded specials`
			`specials = dict(specials, **hardcoded_specials)`

			`return specials`

			`if __name__ == "__main__":`
			`specials = generate_specials()`
* Make generate_specials pretty-print the specials.json file 2015-10-09 08:23:47 +00:00			`with open("specials.json", "w") as file_:`
			`file_.write(json.dumps(specials, indent=2))`
Add a script to generate the specials.json file, to take care of handling uppercase and missing apostrophe contractions 2015-08-22 02:39:33 +00:00