spaCy/spacy/lang/entity_rules.py

206 lines
3.6 KiB
Python
Raw Normal View History

2017-03-12 12:07:28 +00:00
# coding: utf8
2016-12-18 14:36:09 +00:00
from __future__ import unicode_literals
from ..symbols import ORTH, ENT_TYPE, LOWER
2017-05-08 20:23:48 +00:00
ENT_ID = "ent_id"
2016-12-18 14:36:09 +00:00
ENTITY_RULES = [
{
ENT_ID: "Reddit",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "reddit"}]
]
},
{
ENT_ID: "Linux",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "linux"}]
]
},
{
ENT_ID: "Haskell",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "haskell"}],
]
},
{
ENT_ID: "HaskellCurry",
"attrs": {ENT_TYPE: "PERSON"},
"patterns": [
[{LOWER: "haskell"}, {LOWER: "curry"}]
]
},
{
ENT_ID: "Javascript",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "javascript"}],
]
},
{
ENT_ID: "CSS",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "css"}],
[{LOWER: "css3"}],
]
},
{
ENT_ID: "HTML",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "html"}],
[{LOWER: "html5"}],
]
},
{
ENT_ID: "Python",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Python"}]
]
},
{
ENT_ID: "Ruby",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Ruby"}]
]
},
{
ENT_ID: "spaCy",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "spacy"}]
]
},
{
ENT_ID: "displaCy",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "displacy"}]
]
},
{
ENT_ID: "Digg",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "digg"}]
]
},
{
ENT_ID: "FoxNews",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "foxnews"}],
[{LOWER: "fox"}, {LOWER: "news"}]
]
},
{
ENT_ID: "Google",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "google"}]
]
},
{
ENT_ID: "Mac",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "mac"}]
]
},
{
ENT_ID: "Wikipedia",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "wikipedia"}]
]
},
{
ENT_ID: "Windows",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{ORTH: "Windows"}]
]
},
{
ENT_ID: "Dell",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "dell"}]
]
},
{
ENT_ID: "Facebook",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{LOWER: "facebook"}]
]
},
{
ENT_ID: "Blizzard",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{ORTH: "Blizzard"}]
]
},
{
ENT_ID: "Ubuntu",
"attrs": {ENT_TYPE: "ORG"},
"patterns": [
[{ORTH: "Ubuntu"}]
]
},
{
ENT_ID: "YouTube",
"attrs": {ENT_TYPE: "PRODUCT"},
"patterns": [
[{LOWER: "youtube"}]
]
}
]
FALSE_POSITIVES = [
[{ORTH: "Shit"}],
[{ORTH: "Weed"}],
[{ORTH: "Cool"}],
[{ORTH: "Btw"}],
[{ORTH: "Bah"}],
[{ORTH: "Bullshit"}],
[{ORTH: "Lol"}],
[{ORTH: "Yo"}, {LOWER: "dawg"}],
[{ORTH: "Yay"}],
[{ORTH: "Ahh"}],
[{ORTH: "Yea"}],
[{ORTH: "Bah"}]
]