"""Orthographic feature functions for token strings.

Most functions in this module are placeholders whose real logic has not
been written yet: the binary features always return ``False`` and the
remaining string/statistics features return their input unchanged.
``word_shape`` is the only feature with a working implementation.
"""
# Provenance: contents of spacy/orth.py as created by commit
# 3e3ff99ca01f12bc955f55b04fc612601c22208c ("Add orth features",
# Matthew Honnibal, 2014-08-30).

# Longest run of one shape character that word_shape keeps; any further
# repeats of the same shape character are dropped.
_MAX_SHAPE_RUN = 3


# Binary string features
#
# Every function below shares the signature
# (string, prob, case_stats, tag_stats) and ignores all four arguments.
# NOTE(review): the meaning of prob / case_stats / tag_stats is not visible
# in this module -- confirm against the caller before implementing.

def is_alpha(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


def is_digit(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


def is_punct(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


def is_space(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


def is_ascii(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


def is_title(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


def is_lower(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


def is_upper(string, prob, case_stats, tag_stats):
    """Not implemented yet: always returns False."""
    return False


# Statistics features
def oft_case(name, thresh):
    """Build a feature function for ``name`` and ``thresh``.

    Placeholder: ``name`` and ``thresh`` are currently unused, and the
    returned function simply echoes its ``string`` argument.
    """
    def wrapped(string, prob, case_stats, tag_stats):
        return string
    return wrapped


def can_tag(name, thresh):
    """Build a feature function for ``name`` and ``thresh``.

    Placeholder: ``name`` and ``thresh`` are currently unused, and the
    returned function simply echoes its ``string`` argument.
    """
    def wrapped(string, prob, case_stats, tag_stats):
        return string
    return wrapped


# String features
def canon_case(string, prob, cluster, case_stats, tag_stats):
    """Not implemented yet: returns ``string`` unchanged."""
    return string


def word_shape(string, *args):
    """Return a coarse "shape" signature for ``string``.

    Each character is mapped to a shape character:

    * alphabetic and uppercase -> ``"X"``
    * alphabetic otherwise     -> ``"x"``
    * digit                    -> ``"d"``
    * anything else            -> the character itself

    Consecutive identical shape characters are truncated to at most three,
    so long words collapse to a short signature.

    Any extra positional arguments are accepted and ignored, which lets this
    function be called with the same argument list as the other features.

    >>> word_shape("Hello")
    'Xxxx'
    >>> word_shape("C3P0")
    'XdXd'
    >>> word_shape("1234567")
    'ddd'
    """
    pieces = []
    last = ""
    run = 0  # number of times `last` has repeated since it first appeared
    for c in string:
        if c.isalpha():
            shape_char = "X" if c.isupper() else "x"
        elif c.isdigit():
            shape_char = "d"
        else:
            shape_char = c
        if shape_char == last:
            run += 1
        else:
            run = 0
            last = shape_char
        if run < _MAX_SHAPE_RUN:
            pieces.append(shape_char)
    # Join once at the end instead of growing a string inside the loop.
    return "".join(pieces)


def non_sparse(string, prob, cluster, case_stats, tag_stats):
    """Not implemented yet: returns ``string`` unchanged."""
    return string