mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' of ssh://github.com/honnibal/spaCy into develop
This commit is contained in:
commit
623329b19a
|
@ -0,0 +1,95 @@
|
|||
Syllogism Contributor Agreement
|
||||
===============================
|
||||
|
||||
This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
|
||||
Agreement. The SCA applies to any contribution that you make to any product or
|
||||
project managed by us (the “project”), and sets out the intellectual property
|
||||
rights you grant to us in the contributed materials. The term “us” shall mean
|
||||
Syllogism Co. The term "you" shall mean the person or entity identified below.
|
||||
If you agree to be bound by these terms, fill in the information requested below
|
||||
and include the filled-in version with your first pull-request, under the file
|
||||
contributors/. The name of the file should be your GitHub username, with the
|
||||
extension .md. For example, the user example_user would create the file
|
||||
spaCy/contributors/example_user.md .
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
1. The term 'contribution' or ‘contributed materials’ means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual, documentation,
|
||||
or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and registrations,
|
||||
in your contribution:
|
||||
* you hereby assign to us joint ownership, and to the extent that such assignment
|
||||
is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
|
||||
irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
|
||||
to exercise all rights under those copyrights. This includes, at our option, the
|
||||
right to sublicense these same rights to third parties through multiple levels of
|
||||
sublicensees or other licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your contribution
|
||||
as if each of us were the sole owners, and if one of us makes a derivative work
|
||||
of your contribution, the one who makes the derivative work (or has it made) will
|
||||
be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution against
|
||||
us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and exercise
|
||||
all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the consent
|
||||
of, pay or render an accounting to the other for any use or distribution of your
|
||||
contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
|
||||
worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer your
|
||||
contribution in whole or in part, alone or in combination with
|
||||
or included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through multiple
|
||||
levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective on
|
||||
the date you first submitted a contribution to us, even if your submission took
|
||||
place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of authorship
|
||||
and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any third
|
||||
party's copyrights, trademarks, patents, or other intellectual property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and other
|
||||
applicable export and import laws. You agree to notify us if you become aware of
|
||||
any circumstance which would make any of the foregoing representations inaccurate
|
||||
in any respect. Syllogism Co. may publicly disclose your participation in the project,
|
||||
including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable U.S.
|
||||
Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
_x__ I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
|
||||
|
||||
____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | J Nicolas Schrading |
|
||||
| Company's name (if applicable) | |
|
||||
| Title or Role (if applicable) | |
|
||||
| Date | 2015-08-24 |
|
||||
| GitHub username | NSchrading |
|
||||
| Website (optional) | nicschrading.com |
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
Syllogism Contributor Agreement
|
||||
===============================
|
||||
|
||||
This Syllogism Contributor Agreement (“SCA”) is based on the Oracle Contributor
|
||||
Agreement. The SCA applies to any contribution that you make to any product or
|
||||
project managed by us (the “project”), and sets out the intellectual property
|
||||
rights you grant to us in the contributed materials. The term “us” shall mean
|
||||
Syllogism Co. The term "you" shall mean the person or entity identified below.
|
||||
If you agree to be bound by these terms, fill in the information requested below
|
||||
and include the filled-in version with your first pull-request, under the file
|
||||
contributors/. The name of the file should be your GitHub username, with the
|
||||
extension .md. For example, the user example_user would create the file
|
||||
spaCy/contributors/example_user.md .
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
1. The term 'contribution' or ‘contributed materials’ means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual, documentation,
|
||||
or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and registrations,
|
||||
in your contribution:
|
||||
* you hereby assign to us joint ownership, and to the extent that such assignment
|
||||
is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual,
|
||||
irrevocable, non-exclusive, worldwide, no-charge, royalty-free, unrestricted license
|
||||
to exercise all rights under those copyrights. This includes, at our option, the
|
||||
right to sublicense these same rights to third parties through multiple levels of
|
||||
sublicensees or other licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your contribution
|
||||
as if each of us were the sole owners, and if one of us makes a derivative work
|
||||
of your contribution, the one who makes the derivative work (or has it made) will
|
||||
be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution against
|
||||
us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and exercise
|
||||
all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the consent
|
||||
of, pay or render an accounting to the other for any use or distribution of your
|
||||
contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive,
|
||||
worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer your
|
||||
contribution in whole or in part, alone or in combination with
|
||||
or included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through multiple
|
||||
levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective on
|
||||
the date you first submitted a contribution to us, even if your submission took
|
||||
place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of authorship
|
||||
and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any third
|
||||
party's copyrights, trademarks, patents, or other intellectual property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and other
|
||||
applicable export and import laws. You agree to notify us if you become aware of
|
||||
any circumstance which would make any of the foregoing representations inaccurate
|
||||
in any respect. Syllogism Co. may publicly disclose your participation in the project,
|
||||
including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable U.S.
|
||||
Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
_x__ I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions.
|
||||
|
||||
____ I am signing on behalf of my employer or a legal entity and I have the actual authority to contractually bind that entity.
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Vsevolod Solovyov |
|
||||
| Company's name (if applicable) | |
|
||||
| Title or Role (if applicable) | |
|
||||
| Date | 2015-08-24 |
|
||||
| GitHub username | vsolovyov |
|
||||
| Website (optional) | |
|
||||
|
|
@ -0,0 +1,416 @@
|
|||
import json
|
||||
|
||||
contractions = {"n't", "'nt", "not", "'ve", "'d", "'ll", "'s", "'m", "'ma", "'re"}
|
||||
|
||||
# contains the lemmas, parts of speech, number, and tenspect of
|
||||
# potential tokens generated after splitting contractions off
|
||||
token_properties = {
|
||||
|
||||
"ai": {"L": "be", "pos": "VBP", "number": 2},
|
||||
"are": {"L": "be", "pos": "VBP", "number": 2},
|
||||
"ca": {"L": "can", "pos": "MD"},
|
||||
"can": {"L": "can", "pos": "MD"},
|
||||
"could": {"pos": "MD"}, # no lemma for could?
|
||||
"'d": {"L": "would", "pos": "MD"},
|
||||
"did": {"L": "do", "pos": "VBD"},
|
||||
"do": {"L": "do"}, # no POS for do?
|
||||
"does": {"L": "do", "pos": "VBZ"},
|
||||
"had": {"L": "have", "pos": "VBD"},
|
||||
"has": {}, # no POS or lemma for has?
|
||||
"have": {"pos": "VB"}, # no lemma for have?
|
||||
"he": {"L": "-PRON-"}, # no POS for he?
|
||||
"how": {}, # no POS or lemma for how?
|
||||
"i": {"L": "-PRON-"}, # no POS for i?
|
||||
"is": {"L": "be", "pos": "VBZ"},
|
||||
"it": {"L": "-PRON-"}, # no POS for it?
|
||||
"let": {}, # no POS or lemma for let?
|
||||
"'ll": {"L": "will", "pos": "MD"},
|
||||
"'m": {"L": "be", "pos": "VBP", "number": 1, "tenspect": 1},
|
||||
"'ma": {}, # no POS or lemma for ma?
|
||||
"might": {}, # no POS or lemma for might?
|
||||
"must": {}, # no POS or lemma for must?
|
||||
"need": {}, # no POS or lemma for need?
|
||||
"not": {"L": "not", "pos": "RB"},
|
||||
"'nt": {"L": "not", "pos": "RB"},
|
||||
"n't": {"L": "not", "pos": "RB"},
|
||||
"'re": {}, # no POS or lemma for re?
|
||||
"'s": {}, # no POS or lemma for s?
|
||||
"sha": {}, # no POS or lemma for sha?
|
||||
"she": {"L": "-PRON-"}, # no POS for she?
|
||||
"should": {}, # no POS or lemma for should?
|
||||
"that": {}, # no POS or lemma for that?
|
||||
"there": {}, # no POS or lemma for there?
|
||||
"they": {"L": "-PRON-"}, # no POS for they?
|
||||
"was": {}, # no POS or lemma for was?
|
||||
"we": {}, # no POS or lemma for we?
|
||||
"were": {}, # no POS or lemma for were?
|
||||
"what": {}, # no POS or lemma for what?
|
||||
"when": {}, # no POS or lemma for when?
|
||||
"where": {}, # no POS or lemma for where?
|
||||
"who": {}, # no POS or lemma for who?
|
||||
"why": {}, # no POS or lemma for why?
|
||||
"wo": {}, # no POS or lemma for wo?
|
||||
"would": {}, # no POS or lemma for would?
|
||||
"you": {"L": "-PRON-"}, # no POS or lemma for you?
|
||||
"'ve": {"L": "have", "pos": "VB"}
|
||||
|
||||
}
|
||||
|
||||
# contains starting tokens with their potential contractions
|
||||
# each potential contraction has a list of exceptions
|
||||
# lower - don't generate the lowercase version
|
||||
# upper - don't generate the uppercase version
|
||||
# contrLower - don't generate the lowercase version with apostrophe (') removed
|
||||
# contrUpper - don't generate the uppercase version with apostrophe (') removed
|
||||
# for example, we don't want to create the word "hell" or "Hell" from "he" + "'ll" so
|
||||
# we add "contrLower" and "contrUpper" to the exceptions list
|
||||
starting_tokens = {
|
||||
|
||||
"ai": {"n't": []},
|
||||
"are": {"n't": []},
|
||||
"ca": {"n't": []},
|
||||
"can": {"not": []},
|
||||
"could": {"'ve": [], "n't": [], "n't've": []},
|
||||
"did": {"n't": []},
|
||||
"does": {"n't": []},
|
||||
"do": {"n't": []},
|
||||
"had": {"n't": [], "n't've": []},
|
||||
"has": {"n't": []},
|
||||
"have": {"n't": []},
|
||||
"he": {"'d": [], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},
|
||||
"how": {"'d": [], "'ll": [], "'s": []},
|
||||
"i": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'m": [], "'ma": [], "'ve": []},
|
||||
"is": {"n't": []},
|
||||
"it": {"'d": [], "'d've": [], "'ll": [], "'s": ["contrLower", "contrUpper"]},
|
||||
"let": {"'s": ["contrLower", "contrUpper"]},
|
||||
"might": {"n't": [], "n't've": [], "'ve": []},
|
||||
"must": {"n't": [], "'ve": []},
|
||||
"need": {"n't": []},
|
||||
"not": {"'ve": []},
|
||||
"sha": {"n't": []},
|
||||
"she": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'s": []},
|
||||
"should": {"'ve": [], "n't": [], "n't've": []},
|
||||
"that": {"'s": []},
|
||||
"there": {"'d": [], "'d've": [], "'s": ["contrLower", "contrUpper"]},
|
||||
"they": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []},
|
||||
"was": {"n't": []},
|
||||
"we": {"'d": ["contrLower", "contrUpper"], "'d've": [], "'ll": ["contrLower", "contrUpper"], "'re": ["contrLower", "contrUpper"], "'ve": []},
|
||||
"were": {"n't": []},
|
||||
"what": {"'ll": [], "'re": [], "'s": [], "'ve": []},
|
||||
"when": {"'s": []},
|
||||
"where": {"'d": [], "'s": [], "'ve": []},
|
||||
"who": {"'d": [], "'ll": [], "'re": ["contrLower", "contrUpper"], "'s": [], "'ve": []},
|
||||
"why": {"'ll": [], "'re": [], "'s": []},
|
||||
"wo": {"n't": []},
|
||||
"would": {"'ve": [], "n't": [], "n't've": []},
|
||||
"you": {"'d": [], "'d've": [], "'ll": [], "'re": [], "'ve": []}
|
||||
|
||||
}
|
||||
|
||||
# other specials that don't really have contractions
|
||||
# so they are hardcoded
|
||||
hardcoded_specials = {
|
||||
|
||||
"'s": [{"F": "'s", "L": "'s"}],
|
||||
|
||||
"'S": [{"F": "'S", "L": "'s"}],
|
||||
|
||||
"'em": [{"F": "'em"}],
|
||||
|
||||
"'ol": [{"F": "'ol"}],
|
||||
|
||||
"vs.": [{"F": "vs."}],
|
||||
|
||||
"Ms.": [{"F": "Ms."}],
|
||||
"Mr.": [{"F": "Mr."}],
|
||||
"Dr.": [{"F": "Dr."}],
|
||||
"Mrs.": [{"F": "Mrs."}],
|
||||
"Messrs.": [{"F": "Messrs."}],
|
||||
"Gov.": [{"F": "Gov."}],
|
||||
"Gen.": [{"F": "Gen."}],
|
||||
|
||||
"Mt.": [{"F": "Mt.", "L": "Mount"}],
|
||||
|
||||
"''": [{"F": "''"}],
|
||||
|
||||
"Corp.": [{"F": "Corp."}],
|
||||
"Inc.": [{"F": "Inc."}],
|
||||
"Co.": [{"F": "Co."}],
|
||||
"co.": [{"F": "co."}],
|
||||
"Ltd.": [{"F": "Ltd."}],
|
||||
"Bros.": [{"F": "Bros."}],
|
||||
|
||||
"Rep.": [{"F": "Rep."}],
|
||||
"Sen.": [{"F": "Sen."}],
|
||||
"Jr.": [{"F": "Jr."}],
|
||||
"Rev.": [{"F": "Rev."}],
|
||||
"Adm.": [{"F": "Adm."}],
|
||||
"St.": [{"F": "St."}],
|
||||
|
||||
"a.m.": [{"F": "a.m."}],
|
||||
"p.m.": [{"F": "p.m."}],
|
||||
|
||||
"1a.m.": [{"F": "1"}, {"F": "a.m."}],
|
||||
"2a.m.": [{"F": "2"}, {"F": "a.m."}],
|
||||
"3a.m.": [{"F": "3"}, {"F": "a.m."}],
|
||||
"4a.m.": [{"F": "4"}, {"F": "a.m."}],
|
||||
"5a.m.": [{"F": "5"}, {"F": "a.m."}],
|
||||
"6a.m.": [{"F": "6"}, {"F": "a.m."}],
|
||||
"7a.m.": [{"F": "7"}, {"F": "a.m."}],
|
||||
"8a.m.": [{"F": "8"}, {"F": "a.m."}],
|
||||
"9a.m.": [{"F": "9"}, {"F": "a.m."}],
|
||||
"10a.m.": [{"F": "10"}, {"F": "a.m."}],
|
||||
"11a.m.": [{"F": "11"}, {"F": "a.m."}],
|
||||
"12a.m.": [{"F": "12"}, {"F": "a.m."}],
|
||||
"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}],
|
||||
"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}],
|
||||
"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}],
|
||||
"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}],
|
||||
"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}],
|
||||
"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}],
|
||||
"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}],
|
||||
"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}],
|
||||
"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}],
|
||||
"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}],
|
||||
"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}],
|
||||
"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}],
|
||||
|
||||
|
||||
"p.m.": [{"F": "p.m."}],
|
||||
"1p.m.": [{"F": "1"}, {"F": "p.m."}],
|
||||
"2p.m.": [{"F": "2"}, {"F": "p.m."}],
|
||||
"3p.m.": [{"F": "3"}, {"F": "p.m."}],
|
||||
"4p.m.": [{"F": "4"}, {"F": "p.m."}],
|
||||
"5p.m.": [{"F": "5"}, {"F": "p.m."}],
|
||||
"6p.m.": [{"F": "6"}, {"F": "p.m."}],
|
||||
"7p.m.": [{"F": "7"}, {"F": "p.m."}],
|
||||
"8p.m.": [{"F": "8"}, {"F": "p.m."}],
|
||||
"9p.m.": [{"F": "9"}, {"F": "p.m."}],
|
||||
"10p.m.": [{"F": "10"}, {"F": "p.m."}],
|
||||
"11p.m.": [{"F": "11"}, {"F": "p.m."}],
|
||||
"12p.m.": [{"F": "12"}, {"F": "p.m."}],
|
||||
"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}],
|
||||
"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}],
|
||||
"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}],
|
||||
"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}],
|
||||
"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}],
|
||||
"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}],
|
||||
"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}],
|
||||
"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}],
|
||||
"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}],
|
||||
"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}],
|
||||
"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}],
|
||||
"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}],
|
||||
|
||||
"Jan.": [{"F": "Jan."}],
|
||||
"Feb.": [{"F": "Feb."}],
|
||||
"Mar.": [{"F": "Mar."}],
|
||||
"Apr.": [{"F": "Apr."}],
|
||||
"May.": [{"F": "May."}],
|
||||
"Jun.": [{"F": "Jun."}],
|
||||
"Jul.": [{"F": "Jul."}],
|
||||
"Aug.": [{"F": "Aug."}],
|
||||
"Sep.": [{"F": "Sep."}],
|
||||
"Sept.": [{"F": "Sept."}],
|
||||
"Oct.": [{"F": "Oct."}],
|
||||
"Nov.": [{"F": "Nov."}],
|
||||
"Dec.": [{"F": "Dec."}],
|
||||
|
||||
"Ala.": [{"F": "Ala."}],
|
||||
"Ariz.": [{"F": "Ariz."}],
|
||||
"Ark.": [{"F": "Ark."}],
|
||||
"Calif.": [{"F": "Calif."}],
|
||||
"Colo.": [{"F": "Colo."}],
|
||||
"Conn.": [{"F": "Conn."}],
|
||||
"Del.": [{"F": "Del."}],
|
||||
"D.C.": [{"F": "D.C."}],
|
||||
"Fla.": [{"F": "Fla."}],
|
||||
"Ga.": [{"F": "Ga."}],
|
||||
"Ill.": [{"F": "Ill."}],
|
||||
"Ind.": [{"F": "Ind."}],
|
||||
"Kans.": [{"F": "Kans."}],
|
||||
"Kan.": [{"F": "Kan."}],
|
||||
"Ky.": [{"F": "Ky."}],
|
||||
"La.": [{"F": "La."}],
|
||||
"Md.": [{"F": "Md."}],
|
||||
"Mass.": [{"F": "Mass."}],
|
||||
"Mich.": [{"F": "Mich."}],
|
||||
"Minn.": [{"F": "Minn."}],
|
||||
"Miss.": [{"F": "Miss."}],
|
||||
"Mo.": [{"F": "Mo."}],
|
||||
"Mont.": [{"F": "Mont."}],
|
||||
"Nebr.": [{"F": "Nebr."}],
|
||||
"Neb.": [{"F": "Neb."}],
|
||||
"Nev.": [{"F": "Nev."}],
|
||||
"N.H.": [{"F": "N.H."}],
|
||||
"N.J.": [{"F": "N.J."}],
|
||||
"N.M.": [{"F": "N.M."}],
|
||||
"N.Y.": [{"F": "N.Y."}],
|
||||
"N.C.": [{"F": "N.C."}],
|
||||
"N.D.": [{"F": "N.D."}],
|
||||
"Okla.": [{"F": "Okla."}],
|
||||
"Ore.": [{"F": "Ore."}],
|
||||
"Pa.": [{"F": "Pa."}],
|
||||
"Tenn.": [{"F": "Tenn."}],
|
||||
"Va.": [{"F": "Va."}],
|
||||
"Wash.": [{"F": "Wash."}],
|
||||
"Wis.": [{"F": "Wis."}],
|
||||
|
||||
":)": [{"F": ":)"}],
|
||||
"<3": [{"F": "<3"}],
|
||||
";)": [{"F": ";)"}],
|
||||
"(:": [{"F": "(:"}],
|
||||
":(": [{"F": ":("}],
|
||||
"-_-": [{"F": "-_-"}],
|
||||
"=)": [{"F": "=)"}],
|
||||
":/": [{"F": ":/"}],
|
||||
":>": [{"F": ":>"}],
|
||||
";-)": [{"F": ";-)"}],
|
||||
":Y": [{"F": ":Y"}],
|
||||
":P": [{"F": ":P"}],
|
||||
":-P": [{"F": ":-P"}],
|
||||
":3": [{"F": ":3"}],
|
||||
"=3": [{"F": "=3"}],
|
||||
"xD": [{"F": "xD"}],
|
||||
"^_^": [{"F": "^_^"}],
|
||||
"=]": [{"F": "=]"}],
|
||||
"=D": [{"F": "=D"}],
|
||||
"<333": [{"F": "<333"}],
|
||||
":))": [{"F": ":))"}],
|
||||
":0": [{"F": ":0"}],
|
||||
"-__-": [{"F": "-__-"}],
|
||||
"xDD": [{"F": "xDD"}],
|
||||
"o_o": [{"F": "o_o"}],
|
||||
"o_O": [{"F": "o_O"}],
|
||||
"V_V": [{"F": "V_V"}],
|
||||
"=[[": [{"F": "=[["}],
|
||||
"<33": [{"F": "<33"}],
|
||||
";p": [{"F": ";p"}],
|
||||
";D": [{"F": ";D"}],
|
||||
";-p": [{"F": ";-p"}],
|
||||
";(": [{"F": ";("}],
|
||||
":p": [{"F": ":p"}],
|
||||
":]": [{"F": ":]"}],
|
||||
":O": [{"F": ":O"}],
|
||||
":-/": [{"F": ":-/"}],
|
||||
":-)": [{"F": ":-)"}],
|
||||
":(((": [{"F": ":((("}],
|
||||
":((": [{"F": ":(("}],
|
||||
":')": [{"F": ":')"}],
|
||||
"(^_^)": [{"F": "(^_^)"}],
|
||||
"(=": [{"F": "(="}],
|
||||
"o.O": [{"F": "o.O"}],
|
||||
"\")": [{"F": "\")"}],
|
||||
"a.": [{"F": "a."}],
|
||||
"b.": [{"F": "b."}],
|
||||
"c.": [{"F": "c."}],
|
||||
"d.": [{"F": "d."}],
|
||||
"e.": [{"F": "e."}],
|
||||
"f.": [{"F": "f."}],
|
||||
"g.": [{"F": "g."}],
|
||||
"h.": [{"F": "h."}],
|
||||
"i.": [{"F": "i."}],
|
||||
"j.": [{"F": "j."}],
|
||||
"k.": [{"F": "k."}],
|
||||
"l.": [{"F": "l."}],
|
||||
"m.": [{"F": "m."}],
|
||||
"n.": [{"F": "n."}],
|
||||
"o.": [{"F": "o."}],
|
||||
"p.": [{"F": "p."}],
|
||||
"q.": [{"F": "q."}],
|
||||
"s.": [{"F": "s."}],
|
||||
"t.": [{"F": "t."}],
|
||||
"u.": [{"F": "u."}],
|
||||
"v.": [{"F": "v."}],
|
||||
"w.": [{"F": "w."}],
|
||||
"x.": [{"F": "x."}],
|
||||
"y.": [{"F": "y."}],
|
||||
"z.": [{"F": "z."}],
|
||||
|
||||
"i.e.": [{"F": "i.e."}],
|
||||
"I.e.": [{"F": "I.e."}],
|
||||
"I.E.": [{"F": "I.E."}],
|
||||
"e.g.": [{"F": "e.g."}],
|
||||
"E.g.": [{"F": "E.g."}],
|
||||
"E.G.": [{"F": "E.G."}],
|
||||
"\n": [{"F": "\n", "pos": "SP"}],
|
||||
"\t": [{"F": "\t", "pos": "SP"}],
|
||||
" ": [{"F": " ", "pos": "SP"}]
|
||||
|
||||
}
|
||||
|
||||
def get_double_contractions(ending):
    """Split a compound ending into the individual contractions it contains.

    Suffixes found in the module-level ``contractions`` set are peeled off
    the right-hand end of ``ending`` until none matches any more, e.g.
    ``"n't've"`` -> ``["n't", "'ve"]``.

    :param ending: the compound ending string to split.
    :returns: list of the matched contractions in left-to-right order. Any
        leading remainder that does not end in a known contraction is
        ignored.
    """
    endings = []
    remaining = ending

    matched = True
    while matched:
        matched = False
        for contraction in contractions:
            if remaining.endswith(contraction):
                endings.append(contraction)
                # Slice off exactly the matched suffix. str.rstrip() would
                # treat the contraction as a *set of characters* and could
                # eat into the preceding contraction (e.g. "'re've" -> "'r").
                remaining = remaining[:-len(contraction)]
                matched = True

    endings.reverse()  # suffixes were collected right-to-left
    return endings
|
||||
|
||||
def get_token_properties(token, capitalize=False, remove_contractions=False):
    """Return a fresh property dict for ``token`` with its form under ``"F"``.

    The base properties are looked up in the module-level
    ``token_properties`` table and copied, so the table itself is never
    mutated.

    :param token: key to look up in ``token_properties``.
    :param capitalize: if true, the stored form is ``token.capitalize()``.
    :param remove_contractions: if true, every apostrophe is removed from
        the stored form.
    :returns: a new dict holding the looked-up properties plus ``"F"``.
    """
    # Default to an empty mapping so an unknown token yields just {"F": ...}
    # instead of failing with TypeError from dict(None).
    props = dict(token_properties.get(token, {}))

    form = token
    if capitalize:
        form = form.capitalize()
    if remove_contractions:
        form = form.replace("'", "")

    props["F"] = form
    return props
|
||||
|
||||
def create_entry(token, endings, capitalize=False, remove_contractions=False):
    """Build the list of property dicts for a token followed by its endings.

    :param token: the starting token; ``capitalize`` applies to it only.
    :param endings: iterable of ending strings, in output order.
    :param capitalize: forwarded to ``get_token_properties`` for ``token``.
    :param remove_contractions: forwarded for ``token`` and every ending.
    :returns: list with the token's properties first, then one dict per
        ending.
    """
    head = get_token_properties(
        token, capitalize=capitalize, remove_contractions=remove_contractions
    )
    tail = [
        get_token_properties(suffix, remove_contractions=remove_contractions)
        for suffix in endings
    ]
    return [head] + tail
|
||||
|
||||
def generate_specials():
    """Build the full special-case table.

    For every token/ending pair in ``starting_tokens`` up to four spellings
    are generated (lowercase, capitalized, and both again with apostrophes
    removed), skipping any spelling named in that pair's exception list.
    The entries of ``hardcoded_specials`` are then merged on top.

    :returns: dict mapping each surface string to its list of token
        property dicts.
    """
    # (exception flag, capitalize the token?, strip apostrophes?)
    variants = (
        ("lower", False, False),
        ("upper", True, False),
        ("contrLower", False, True),
        ("contrUpper", True, True),
    )

    table = {}

    for token, possible_endings in starting_tokens.items():
        for ending, exceptions in possible_endings.items():
            # Endings with more than one apostrophe are several
            # contractions glued together and must be split up.
            if ending.count("'") > 1:
                endings = list(get_double_contractions(ending))
            else:
                endings = [ending]

            for flag, capitalize, strip_apostrophes in variants:
                if flag in exceptions:
                    continue
                head = token.capitalize() if capitalize else token
                tail = ending.replace("'", "") if strip_apostrophes else ending
                table[head + tail] = create_entry(
                    token,
                    endings,
                    capitalize=capitalize,
                    remove_contractions=strip_apostrophes,
                )

    # add in hardcoded specials (they win over generated entries)
    table.update(hardcoded_specials)

    return table
|
||||
|
||||
if __name__ == "__main__":
    # Generate the table first, then write it as JSON to specials.json in
    # the current working directory.
    generated = generate_specials()
    with open("specials.json", "w") as out_file:
        json.dump(generated, out_file)
|
||||
|
File diff suppressed because one or more lines are too long
2
setup.py
2
setup.py
|
@ -64,7 +64,7 @@ def name_to_path(mod_name, ext):
|
|||
def c_ext(mod_name, language, includes, compile_args, link_args):
    """Create the ``Extension`` for one module.

    The source path is derived with ``name_to_path(mod_name, language)``.

    :param mod_name: module name passed to ``Extension``.
    :param language: passed to ``name_to_path`` as its second argument.
    :param includes: used as ``include_dirs``.
    :param compile_args: used as ``extra_compile_args``.
    :param link_args: used as ``extra_link_args``.
    :returns: the constructed ``Extension``.
    """
    mod_path = name_to_path(mod_name, language)
    # extra_link_args must receive link_args; passing compile_args here
    # handed compiler flags to the linker.
    return Extension(mod_name, [mod_path], include_dirs=includes,
                     extra_compile_args=compile_args, extra_link_args=link_args)
|
||||
|
||||
|
||||
def cython_setup(mod_names, language, includes, compile_args, link_args):
|
||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
|
||||
from libc.string cimport memset
|
||||
from libc.stdint cimport int32_t
|
||||
from libc.stdint cimport uint64_t
|
||||
|
||||
import bz2
|
||||
from os import path
|
||||
|
@ -197,33 +198,56 @@ cdef class Vocab:
|
|||
for key, addr in self._by_hash.items():
|
||||
lexeme = <LexemeC*>addr
|
||||
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
|
||||
fp.write_from(lexeme, sizeof(LexemeC), 1)
|
||||
fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1)
|
||||
fp.write_from(&lexeme.id, sizeof(lexeme.flags), 1)
|
||||
fp.write_from(&lexeme.length, sizeof(lexeme.length), 1)
|
||||
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
|
||||
fp.write_from(&lexeme.lower, sizeof(lexeme.lower), 1)
|
||||
fp.write_from(&lexeme.norm, sizeof(lexeme.norm), 1)
|
||||
fp.write_from(&lexeme.shape, sizeof(lexeme.shape), 1)
|
||||
fp.write_from(&lexeme.prefix, sizeof(lexeme.prefix), 1)
|
||||
fp.write_from(&lexeme.suffix, sizeof(lexeme.suffix), 1)
|
||||
fp.write_from(&lexeme.cluster, sizeof(lexeme.cluster), 1)
|
||||
fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1)
|
||||
fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1)
|
||||
fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
|
||||
fp.close()
|
||||
|
||||
def load_lexemes(self, strings_loc, loc):
|
||||
self.strings.load(strings_loc)
|
||||
if not path.exists(loc):
|
||||
raise IOError('LexemeCs file not found at %s' % loc)
|
||||
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
|
||||
cdef FILE* fp = fopen(<char*>bytes_loc, b'rb')
|
||||
if fp == NULL:
|
||||
raise IOError('lexemes data file present, but cannot open from ' % loc)
|
||||
cdef size_t st
|
||||
fp = CFile(loc, 'rb')
|
||||
cdef LexemeC* lexeme
|
||||
cdef attr_t orth
|
||||
cdef hash_t key
|
||||
cdef unicode py_str
|
||||
cdef uint64_t bad_bytes
|
||||
i = 0
|
||||
while True:
|
||||
st = fread(&orth, sizeof(orth), 1, fp)
|
||||
if st != 1:
|
||||
break
|
||||
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
# Copies data from the file into the lexeme
|
||||
st = fread(lexeme, sizeof(LexemeC), 1, fp)
|
||||
lexeme.repvec = EMPTY_VEC
|
||||
if st != 1:
|
||||
try:
|
||||
fp.read_into(&orth, 1, sizeof(orth))
|
||||
except IOError:
|
||||
break
|
||||
# This 64 bit chunk is there for backwards compatibility. Remove on next release.
|
||||
fp.read_into(&bad_bytes, 1, sizeof(bad_bytes))
|
||||
# Copy data from the file into the lexeme
|
||||
fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
|
||||
fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
|
||||
fp.read_into(&lexeme.length, 1, sizeof(lexeme.length))
|
||||
fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth))
|
||||
fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower))
|
||||
fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm))
|
||||
fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape))
|
||||
fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix))
|
||||
fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix))
|
||||
fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster))
|
||||
fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob))
|
||||
fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment))
|
||||
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
|
||||
|
||||
lexeme.repvec = EMPTY_VEC
|
||||
if orth != lexeme.orth:
|
||||
# TODO: Improve this error message, pending resolution to Issue #64
|
||||
raise IOError('Error reading from lexemes.bin. Integrity check fails.')
|
||||
|
@ -233,7 +257,7 @@ cdef class Vocab:
|
|||
self._by_orth.set(lexeme.orth, lexeme)
|
||||
self.length += 1
|
||||
i += 1
|
||||
fclose(fp)
|
||||
fp.close()
|
||||
|
||||
def load_rep_vectors(self, loc):
|
||||
cdef CFile file_ = CFile(loc, b'rb')
|
||||
|
|
Loading…
Reference in New Issue