diff --git a/spacy/language_data/__init__.py b/spacy/language_data/__init__.py index c8109a51e..3aba785c2 100644 --- a/spacy/language_data/__init__.py +++ b/spacy/language_data/__init__.py @@ -1,3 +1,4 @@ from .emoticons import * from .punctuation import * +from .entity_rules import * from .util import * diff --git a/spacy/language_data/entity_rules.py b/spacy/language_data/entity_rules.py new file mode 100644 index 000000000..4217ecfcf --- /dev/null +++ b/spacy/language_data/entity_rules.py @@ -0,0 +1,206 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from .util import ENT_ID + + +ENTITY_RULES = [ + { + ENT_ID: "Reddit", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "reddit"}] + ] + }, + + { + ENT_ID: "Linux", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "linux"}] + ] + }, + + { + ENT_ID: "Haskell", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "haskell"}], + ] + }, + + { + ENT_ID: "HaskellCurry", + "attrs": {ENT_TYPE: "PERSON"}, + "patterns": [ + [{LOWER: "haskell"}, {LOWER: "curry"}] + ] + }, + + { + ENT_ID: "Javascript", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "javascript"}], + ] + }, + + { + ENT_ID: "CSS", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "css"}], + [{LOWER: "css3"}], + ] + }, + + { + ENT_ID: "HTML", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "html"}], + [{LOWER: "html5"}], + ] + }, + + { + ENT_ID: "Python", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{ORTH: "Python"}] + ] + }, + + { + ENT_ID: "Ruby", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{ORTH: "Ruby"}] + ] + }, + + { + ENT_ID: "spaCy", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "spacy"}] + ] + }, + + { + ENT_ID: "displaCy", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "displacy"}] + ] + }, + + { + ENT_ID: "Digg", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "digg"}] + ] + }, + + { + ENT_ID: "FoxNews", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "foxnews"}], + [{LOWER: "fox"}, {LOWER: "news"}] + ] + }, + + { + ENT_ID: "Google", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "google"}] + ] + }, + + { + ENT_ID: "Mac", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "mac"}] + ] + }, + + { + ENT_ID: "Wikipedia", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "wikipedia"}] + ] + }, + + { + ENT_ID: "Windows", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{ORTH: "Windows"}] + ] + }, + + { + ENT_ID: "Dell", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "dell"}] + ] + }, + + { + ENT_ID: "Facebook", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{LOWER: "facebook"}] + ] + }, + + { + ENT_ID: "Blizzard", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{ORTH: "Blizzard"}] + ] + }, + + { + ENT_ID: "Ubuntu", + "attrs": {ENT_TYPE: "ORG"}, + "patterns": [ + [{ORTH: "Ubuntu"}] + ] + }, + + { + ENT_ID: "YouTube", + "attrs": {ENT_TYPE: "PRODUCT"}, + "patterns": [ + [{LOWER: "youtube"}] + ] + } +] + + +FALSE_POSITIVES = [ + [{ORTH: "Shit"}], + [{ORTH: "Weed"}], + [{ORTH: "Cool"}], + [{ORTH: "Btw"}], + [{ORTH: "Bah"}], + [{ORTH: "Bullshit"}], + [{ORTH: "Lol"}], + [{ORTH: "Yo"}, {LOWER: "dawg"}], + [{ORTH: "Yay"}], + [{ORTH: "Ahh"}], + [{ORTH: "Yea"}], + [{ORTH: "Bah"}] +] + + +__all__ = ["ENTITY_RULES", "FALSE_POSITIVES"]