spaCy/lang_data/en/generate_specials.py

# -*- coding: utf-8 -*-
import json

# Suffixes recognised when a stacked ending such as "n't've" is split back
# into its individual parts (see get_double_contractions).
contractions = {
    "n't",
    "'nt",
    "not",
    "'ve",
    "'d",
    "'ll",
    "'s",
    "'m",
    "'ma",
    "'re",
}

# Properties of every token that can appear once a contraction has been split:
# lemma ("L"), part-of-speech tag ("pos"), "number" and "tenspect".
# An empty dict means nothing has been specified for that token yet; the
# trailing comments flag the open questions.
token_properties = {
    "ai": {"L": "be", "pos": "VBP", "number": 2},
    "are": {"L": "be", "pos": "VBP", "number": 2},
    "ca": {"L": "can", "pos": "MD"},
    "can": {"L": "can", "pos": "MD"},
    "could": {"pos": "MD"},  # lemma missing?
    "'d": {"L": "would", "pos": "MD"},
    "did": {"L": "do", "pos": "VBD"},
    "do": {"L": "do"},  # POS missing?
    "does": {"L": "do", "pos": "VBZ"},
    "had": {"L": "have", "pos": "VBD"},
    "has": {},  # POS and lemma missing?
    "have": {"pos": "VB"},  # lemma missing?
    "he": {"L": "-PRON-"},  # POS missing?
    "how": {},  # POS and lemma missing?
    "i": {"L": "-PRON-"},  # POS missing?
    "is": {"L": "be", "pos": "VBZ"},
    "it": {"L": "-PRON-"},  # POS missing?
    "let": {},  # POS and lemma missing?
    "'ll": {"L": "will", "pos": "MD"},
    "'m": {"L": "be", "pos": "VBP", "number": 1, "tenspect": 1},
    "'ma": {},  # POS and lemma missing?
    "might": {},  # POS and lemma missing?
    "must": {},  # POS and lemma missing?
    "need": {},  # POS and lemma missing?
    "not": {"L": "not", "pos": "RB"},
    "'nt": {"L": "not", "pos": "RB"},
    "n't": {"L": "not", "pos": "RB"},
    "'re": {},  # POS and lemma missing?
    "'s": {},  # POS and lemma missing?
    "sha": {},  # POS and lemma missing?
    "she": {"L": "-PRON-"},  # POS missing?
    "should": {},  # POS and lemma missing?
    "that": {},  # POS and lemma missing?
    "there": {},  # POS and lemma missing?
    "they": {"L": "-PRON-"},  # POS missing?
    "was": {},  # POS and lemma missing?
    "we": {},  # POS and lemma missing?
    "were": {},  # POS and lemma missing?
    "what": {},  # POS and lemma missing?
    "when": {},  # POS and lemma missing?
    "where": {},  # POS and lemma missing?
    "who": {},  # POS and lemma missing?
    "why": {},  # POS and lemma missing?
    "wo": {},  # POS and lemma missing?
    "would": {},  # POS and lemma missing?
    "you": {"L": "-PRON-"},  # POS missing?
    "'ve": {"L": "have", "pos": "VB"},
}

# Maps each starting token to the endings it can be combined with.
# Every ending carries a list of exception flags that suppress one of the four
# spellings generate_specials() would otherwise produce:
#     lower      - skip the lowercase spelling
#     upper      - skip the capitalized spelling
#     contrLower - skip the lowercase spelling with the apostrophe (') removed
#     contrUpper - skip the capitalized spelling with the apostrophe (') removed
# For example, "he" + "'ll" must not yield the words "hell" or "Hell", so that
# ending lists both "contrLower" and "contrUpper".
starting_tokens = {
    "ai": {"n't": []},
    "are": {"n't": []},
    "ca": {"n't": []},
    "can": {"not": []},
    "could": {"'ve": [], "n't": [], "n't've": []},
    "did": {"n't": []},
    "does": {"n't": []},
    "do": {"n't": []},
    "had": {"n't": [], "n't've": []},
    "has": {"n't": []},
    "have": {"n't": []},
    "he": {
        "'d": [],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'s": [],
    },
    "how": {"'d": [], "'ll": [], "'s": []},
    "i": {
        "'d": ["contrLower", "contrUpper"],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'m": [],
        "'ma": [],
        "'ve": [],
    },
    "is": {"n't": []},
    "it": {
        "'d": [],
        "'d've": [],
        "'ll": [],
        "'s": ["contrLower", "contrUpper"],
    },
    "let": {"'s": ["contrLower", "contrUpper"]},
    "might": {"n't": [], "n't've": [], "'ve": []},
    "must": {"n't": [], "'ve": []},
    "need": {"n't": []},
    "not": {"'ve": []},
    "sha": {"n't": []},
    "she": {
        "'d": ["contrLower", "contrUpper"],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'s": [],
    },
    "should": {"'ve": [], "n't": [], "n't've": []},
    "that": {"'s": []},
    "there": {
        "'d": [],
        "'d've": [],
        "'s": ["contrLower", "contrUpper"],
    },
    "they": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},
    "was": {"n't": []},
    "we": {
        "'d": ["contrLower", "contrUpper"],
        "'d've": [],
        "'ll": ["contrLower", "contrUpper"],
        "'re": ["contrLower", "contrUpper"],
        "'ve": [],
    },
    "were": {"n't": []},
    "what": {"'ll": [], "'re": [], "'s": [], "'ve": []},
    "when": {"'s": []},
    "where": {"'d": [], "'s": [], "'ve": []},
    "who": {
        "'d": [],
        "'ll": [],
        "'re": ["contrLower", "contrUpper"],
        "'s": [],
        "'ve": [],
    },
    "why": {"'ll": [], "'re": [], "'s": []},
    "wo": {"n't": []},
    "would": {"'ve": [], "n't": [], "n't've": []},
    "you": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},
}

# Special cases that are not built from a starting token plus a contraction,
# so they are listed by hand. Each key is the raw string to match and each
# value is the list of token-property dicts it is split into ("F" holds the
# text of the token, "L" an optional lemma, "pos" an optional tag).
hardcoded_specials = {
    # bare "'s" in several spellings
    # NOTE(review): U+2018 is the LEFT single quotation mark; confirm whether
    # the U+2019 (right quote) variants should be covered as well.
    "'s": [{"F": "'s", "L": "'s"}],
    "'S": [{"F": "'S", "L": "'s"}],
    u"\u2018s": [{"F": u"\u2018s", "L": "'s"}],
    u"\u2018S": [{"F": u"\u2018S", "L": "'s"}],

    "'em": [{"F": "'em"}],

    "'ol": [{"F": "'ol"}],

    "vs.": [{"F": "vs."}],

    # abbreviated titles
    "Ms.": [{"F": "Ms."}],
    "Mr.": [{"F": "Mr."}],
    "Dr.": [{"F": "Dr."}],
    "Mrs.": [{"F": "Mrs."}],
    "Messrs.": [{"F": "Messrs."}],
    "Gov.": [{"F": "Gov."}],
    "Gen.": [{"F": "Gen."}],

    "Mt.": [{"F": "Mt.", "L": "Mount"}],

    "''": [{"F": "''"}],

    "—": [{"F": "—", "L": "--", "pos": ":"}],

    # company suffixes
    "Corp.": [{"F": "Corp."}],
    "Inc.": [{"F": "Inc."}],
    "Co.": [{"F": "Co."}],
    "co.": [{"F": "co."}],
    "Ltd.": [{"F": "Ltd."}],
    "Bros.": [{"F": "Bros."}],

    "Rep.": [{"F": "Rep."}],
    "Sen.": [{"F": "Sen."}],
    "Jr.": [{"F": "Jr."}],
    "Rev.": [{"F": "Rev."}],
    "Adm.": [{"F": "Adm."}],
    "St.": [{"F": "St."}],

    # times of day; an hour glued to am/pm is split into two tokens
    "a.m.": [{"F": "a.m."}],
    "p.m.": [{"F": "p.m."}],

    "1a.m.": [{"F": "1"}, {"F": "a.m."}],
    "2a.m.": [{"F": "2"}, {"F": "a.m."}],
    "3a.m.": [{"F": "3"}, {"F": "a.m."}],
    "4a.m.": [{"F": "4"}, {"F": "a.m."}],
    "5a.m.": [{"F": "5"}, {"F": "a.m."}],
    "6a.m.": [{"F": "6"}, {"F": "a.m."}],
    "7a.m.": [{"F": "7"}, {"F": "a.m."}],
    "8a.m.": [{"F": "8"}, {"F": "a.m."}],
    "9a.m.": [{"F": "9"}, {"F": "a.m."}],
    "10a.m.": [{"F": "10"}, {"F": "a.m."}],
    "11a.m.": [{"F": "11"}, {"F": "a.m."}],
    "12a.m.": [{"F": "12"}, {"F": "a.m."}],
    "1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
    "2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
    "3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
    "4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
    "5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
    "6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
    "7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
    "8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
    "9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
    "10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
    "11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
    "12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],

    "1p.m.": [{"F": "1"}, {"F": "p.m."}],
    "2p.m.": [{"F": "2"}, {"F": "p.m."}],
    "3p.m.": [{"F": "3"}, {"F": "p.m."}],
    "4p.m.": [{"F": "4"}, {"F": "p.m."}],
    "5p.m.": [{"F": "5"}, {"F": "p.m."}],
    "6p.m.": [{"F": "6"}, {"F": "p.m."}],
    "7p.m.": [{"F": "7"}, {"F": "p.m."}],
    "8p.m.": [{"F": "8"}, {"F": "p.m."}],
    "9p.m.": [{"F": "9"}, {"F": "p.m."}],
    "10p.m.": [{"F": "10"}, {"F": "p.m."}],
    "11p.m.": [{"F": "11"}, {"F": "p.m."}],
    "12p.m.": [{"F": "12"}, {"F": "p.m."}],
    "1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
    "2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
    "3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
    "4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
    "5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
    "6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
    "7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
    "8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
    "9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
    "10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
    "11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
    "12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],

    # abbreviated month names
    "Jan.": [{"F": "Jan."}],
    "Feb.": [{"F": "Feb."}],
    "Mar.": [{"F": "Mar."}],
    "Apr.": [{"F": "Apr."}],
    "May.": [{"F": "May."}],
    "Jun.": [{"F": "Jun."}],
    "Jul.": [{"F": "Jul."}],
    "Aug.": [{"F": "Aug."}],
    "Sep.": [{"F": "Sep."}],
    "Sept.": [{"F": "Sept."}],
    "Oct.": [{"F": "Oct."}],
    "Nov.": [{"F": "Nov."}],
    "Dec.": [{"F": "Dec."}],

    # abbreviated US state names
    "Ala.": [{"F": "Ala."}],
    "Ariz.": [{"F": "Ariz."}],
    "Ark.": [{"F": "Ark."}],
    "Calif.": [{"F": "Calif."}],
    "Colo.": [{"F": "Colo."}],
    "Conn.": [{"F": "Conn."}],
    "Del.": [{"F": "Del."}],
    "D.C.": [{"F": "D.C."}],
    "Fla.": [{"F": "Fla."}],
    "Ga.": [{"F": "Ga."}],
    "Ill.": [{"F": "Ill."}],
    "Ind.": [{"F": "Ind."}],
    "Kans.": [{"F": "Kans."}],
    "Kan.": [{"F": "Kan."}],
    "Ky.": [{"F": "Ky."}],
    "La.": [{"F": "La."}],
    "Md.": [{"F": "Md."}],
    "Mass.": [{"F": "Mass."}],
    "Mich.": [{"F": "Mich."}],
    "Minn.": [{"F": "Minn."}],
    "Miss.": [{"F": "Miss."}],
    "Mo.": [{"F": "Mo."}],
    "Mont.": [{"F": "Mont."}],
    "Nebr.": [{"F": "Nebr."}],
    "Neb.": [{"F": "Neb."}],
    "Nev.": [{"F": "Nev."}],
    "N.H.": [{"F": "N.H."}],
    "N.J.": [{"F": "N.J."}],
    "N.M.": [{"F": "N.M."}],
    "N.Y.": [{"F": "N.Y."}],
    "N.C.": [{"F": "N.C."}],
    "N.D.": [{"F": "N.D."}],
    "Okla.": [{"F": "Okla."}],
    "Ore.": [{"F": "Ore."}],
    "Pa.": [{"F": "Pa."}],
    "Tenn.": [{"F": "Tenn."}],
    "Va.": [{"F": "Va."}],
    "Wash.": [{"F": "Wash."}],
    "Wis.": [{"F": "Wis."}],

    # emoticons
    ":)": [{"F": ":)"}],
    "<3": [{"F": "<3"}],
    ";)": [{"F": ";)"}],
    "(:": [{"F": "(:"}],
    ":(": [{"F": ":("}],
    "-_-": [{"F": "-_-"}],
    "=)": [{"F": "=)"}],
    ":/": [{"F": ":/"}],
    ":>": [{"F": ":>"}],
    ";-)": [{"F": ";-)"}],
    ":Y": [{"F": ":Y"}],
    ":P": [{"F": ":P"}],
    ":-P": [{"F": ":-P"}],
    ":3": [{"F": ":3"}],
    "=3": [{"F": "=3"}],
    "xD": [{"F": "xD"}],
    "^_^": [{"F": "^_^"}],
    "=]": [{"F": "=]"}],
    "=D": [{"F": "=D"}],
    "<333": [{"F": "<333"}],
    ":))": [{"F": ":))"}],
    ":0": [{"F": ":0"}],
    "-__-": [{"F": "-__-"}],
    "xDD": [{"F": "xDD"}],
    "o_o": [{"F": "o_o"}],
    "o_O": [{"F": "o_O"}],
    "V_V": [{"F": "V_V"}],
    "=[[": [{"F": "=[["}],
    "<33": [{"F": "<33"}],
    ";p": [{"F": ";p"}],
    ";D": [{"F": ";D"}],
    ";-p": [{"F": ";-p"}],
    ";(": [{"F": ";("}],
    ":p": [{"F": ":p"}],
    ":]": [{"F": ":]"}],
    ":O": [{"F": ":O"}],
    ":-/": [{"F": ":-/"}],
    ":-)": [{"F": ":-)"}],
    ":(((": [{"F": ":((("}],
    ":((": [{"F": ":(("}],
    ":')": [{"F": ":')"}],
    "(^_^)": [{"F": "(^_^)"}],
    "(=": [{"F": "(="}],
    "o.O": [{"F": "o.O"}],
    "\")": [{"F": "\")"}],

    # single lowercase letter followed by a period
    "a.": [{"F": "a."}],
    "b.": [{"F": "b."}],
    "c.": [{"F": "c."}],
    "d.": [{"F": "d."}],
    "e.": [{"F": "e."}],
    "f.": [{"F": "f."}],
    "g.": [{"F": "g."}],
    "h.": [{"F": "h."}],
    "i.": [{"F": "i."}],
    "j.": [{"F": "j."}],
    "k.": [{"F": "k."}],
    "l.": [{"F": "l."}],
    "m.": [{"F": "m."}],
    "n.": [{"F": "n."}],
    "o.": [{"F": "o."}],
    "p.": [{"F": "p."}],
    "q.": [{"F": "q."}],
    "r.": [{"F": "r."}],
    "s.": [{"F": "s."}],
    "t.": [{"F": "t."}],
    "u.": [{"F": "u."}],
    "v.": [{"F": "v."}],
    "w.": [{"F": "w."}],
    "x.": [{"F": "x."}],
    "y.": [{"F": "y."}],
    "z.": [{"F": "z."}],

    "i.e.": [{"F": "i.e."}],
    "I.e.": [{"F": "I.e."}],
    "I.E.": [{"F": "I.E."}],
    "e.g.": [{"F": "e.g."}],
    "E.g.": [{"F": "E.g."}],
    "E.G.": [{"F": "E.G."}],

    # whitespace characters, tagged "SP"
    "\n": [{"F": "\n", "pos": "SP"}],
    "\t": [{"F": "\t", "pos": "SP"}],
    " ": [{"F": " ", "pos": "SP"}],
}

def get_double_contractions(ending):
    """Split a stacked ending into the individual contractions it is made of.

    Known contractions (the module-level ``contractions`` set) are peeled off
    the end of ``ending`` one at a time until no known contraction is left at
    the end. For example ``"n't've"`` yields ``["n't", "'ve"]``.

    Args:
        ending: the combined ending, e.g. ``"'d've"``.

    Returns:
        The contractions in left-to-right order. Any leading remainder that
        does not end with a known contraction is dropped.
    """
    endings = []

    while True:
        candidates = [c for c in contractions if c and ending.endswith(c)]
        if not candidates:
            break
        # Prefer the longest match so the result does not depend on the
        # iteration order of the set.
        suffix = max(candidates, key=len)
        endings.append(suffix)
        # Slice off exactly the matched suffix. str.rstrip() would treat its
        # argument as a set of characters and could eat into the preceding
        # contraction (e.g. "'re've".rstrip("'ve") leaves "'r", not "'re").
        ending = ending[:-len(suffix)]

    endings.reverse()  # suffixes were collected last-to-first
    return endings

def get_token_properties(token, capitalize=False, remove_contractions=False):
    """Return a fresh property dict for ``token`` including its "F" text.

    Args:
        token: key to look up in the module-level ``token_properties`` table.
        capitalize: if true, ``str.capitalize()`` is applied to the text
            stored under "F".
        remove_contractions: if true, every apostrophe (') is removed from the
            text stored under "F".

    Returns:
        A new dict holding the token's known properties plus "F". A token that
        has no entry in ``token_properties`` gets a dict with only "F".
    """
    # Copy so that adding "F" never mutates the shared table. Defaulting to an
    # empty dict avoids the opaque TypeError that dict(None) would raise for a
    # token without an entry.
    props = dict(token_properties.get(token, {}))

    form = token.capitalize() if capitalize else token
    if remove_contractions:
        form = form.replace("'", "")

    props["F"] = form
    return props

def create_entry(token, endings, capitalize=False, remove_contractions=False):
    """Build the list of token-property dicts for one special case.

    The first element describes the starting ``token`` (optionally
    capitalized); one further element follows for each item of ``endings``.
    ``remove_contractions`` is forwarded to every element, ``capitalize`` only
    to the starting token.
    """
    head = get_token_properties(
        token, capitalize=capitalize, remove_contractions=remove_contractions
    )
    tail = [
        get_token_properties(suffix, remove_contractions=remove_contractions)
        for suffix in endings
    ]
    return [head] + tail

def generate_specials():
    """Build the complete table of special cases.

    Every (starting token, ending) pair from ``starting_tokens`` is expanded
    into up to four spellings: lowercase, capitalized, and both of those with
    the apostrophes removed. A spelling is skipped when its flag ("lower",
    "upper", "contrLower", "contrUpper") is listed in the ending's exceptions.
    The entries of ``hardcoded_specials`` are merged in last and therefore
    replace a generated entry with the same key.

    Returns:
        A dict mapping each special string to its list of token-property
        dicts as produced by ``create_entry``.
    """
    # (exception flag, capitalize the starting token?, drop apostrophes?)
    # The order is significant: when two spellings coincide (e.g. "cannot",
    # whose ending has no apostrophe) the later variant overwrites the earlier.
    variants = (
        ("lower", False, False),
        ("upper", True, False),
        ("contrLower", False, True),
        ("contrUpper", True, True),
    )

    specials = {}

    for token, possible_endings in starting_tokens.items():
        for ending, exceptions in possible_endings.items():
            # More than one apostrophe means several contractions are stacked.
            if ending.count("'") > 1:
                endings = get_double_contractions(ending)
            else:
                endings = [ending]

            for flag, capitalize, remove_contractions in variants:
                if flag in exceptions:
                    continue
                head = token.capitalize() if capitalize else token
                tail = ending.replace("'", "") if remove_contractions else ending
                specials[head + tail] = create_entry(
                    token,
                    endings,
                    capitalize=capitalize,
                    remove_contractions=remove_contractions,
                )

    # Merge with update() rather than dict(specials, **hardcoded_specials):
    # keyword unpacking only accepts string keys, update() accepts any key.
    specials.update(hardcoded_specials)

    return specials

if __name__ == "__main__":
    # Generate the table and store it as indented JSON in "specials.json"
    # (relative to the current working directory).
    specials = generate_specials()
    with open("specials.json", "w") as out_file:
        json.dump(specials, out_file, indent=2)