diff --git a/jellyfish.py b/jellyfish.py index 5e9bd61..d2ffd8d 100644 --- a/jellyfish.py +++ b/jellyfish.py @@ -1,6 +1,6 @@ import itertools import unicodedata - +from porter import Stemmer def _normalize(s): return unicodedata.normalize('NFKD', unicode(s)) @@ -426,3 +426,6 @@ def metaphone(s): i += 1 return ''.join(result).upper() + +def porter_stem(s): + return Stemmer(s).stem() diff --git a/porter-test.csv b/porter-test.csv index a2ec2b7..478cc9f 100644 --- a/porter-test.csv +++ b/porter-test.csv @@ -23528,4 +23528,4 @@ zodiac,zodiac zodiacs,zodiac zone,zone zounds,zound -zwagger,zwagger \ No newline at end of file +zwagger,zwagger diff --git a/porter.py b/porter.py new file mode 100644 index 0000000..7491bc8 --- /dev/null +++ b/porter.py @@ -0,0 +1,216 @@ +_s2_options = { + 'a': ((['a','t','i','o','n','a','l'], ['a','t','e']), + (['t','i','o','n','a','l'], ['t','i','o','n'])), + 'c': ((['e','n','c','i'], ['e','n','c','e']), + (['a','n','c','i'], ['a','n','c','e']), + ), + 'e': ((['i','z','e','r'], ['i','z','e']),), + 'l': ((['b','l','i'], ['b','l','e']), + (['a','l','l','i'], ['a','l']), + (['e','n','t','l','i'], ['e','n','t']), + (['e','l','i'], ['e']), + (['o','u','s','l','i'], ['o','u','s']), + ), + 'o': ((['i','z','a','t','i','o','n'], ['i','z','e']), + (['a','t','i','o','n'], ['a','t','e']), + (['a','t','o','r'], ['a','t','e']), + ), + 's': ((['a','l','i','s','m'], ['a','l']), + (['i','v','e','n','e','s','s'], ['i','v','e']), + (['f','u','l','n','e','s','s'], ['f','u','l']), + (['o','u','s','n','e','s','s'], ['o','u','s']), + ), + 't': ((['a','l','i','t','i'], ['a','l']), + (['i','v','i','t','i'], ['i','v','e']), + (['b','i','l','i','t','i'], ['b','l','e']), + ), + 'g': ((['l','o','g','i'], ['l','o','g']),), +} + + +_s3_options = { + 'e': ((['i','c','a','t','e'], ['i','c']), + (['a','t','i','v','e'], []), + (['a','l','i','z','e'], ['a','l']), + ), + 'i': ((['i','c','i','t','i'], ['i','c']),), + 'l': ((['i','c','a','l'], ['i','c']), + (['f','u','l'], []), + ), + 's': ((['n','e','s','s'], []),), +} + +_s4_endings = { + 'a': (['a','l'],), + 'c': (['a','n','c','e'], ['e','n','c','e']), + 'e': (['e','r'],), + 'i': (['i','c'],), + 'l': (['a','b','l','e'], ['i','b','l','e']), + 'n': (['a','n','t'], ['e','m','e','n','t'], ['m','e','n','t'], + ['e','n','t']), + # handle 'o' separately + 'i': (['i','s','m'],), + 't': (['a','t','e'], ['i','t','i']), + 'u': (['o','u','s'],), + 'v': (['i','v','e'],), + 'z': (['i','z','e'],), +} + +class Stemmer(object): + def __init__(self, b): + self.b = list(b) + self.k = len(b)-1 + self.j = 0 + + def cons(self, i): + """ True iff b[i] is a consonant """ + if self.b[i] in 'aeiou': + return False + elif self.b[i] == 'y': + return True if i == 0 else not self.cons(i-1) + return True + + def m(self): + n = i = 0 + while True: + if i > self.j: + return n + if not self.cons(i): + break + i += 1 + i += 1 + while True: + while True: + if i > self.j: + return n + if self.cons(i): + break + i += 1 + + i += 1 + n += 1 + + while True: + if i > self.j: + return n + if not self.cons(i): + break + i += 1 + i += 1 + + def vowel_in_stem(self): + """ True iff 0...j contains vowel """ + for i in xrange(0, self.j+1): + if not self.cons(i): + return True + return False + + def doublec(self, j): + """ True iff j, j-1 contains double consonant """ + if j < 1 or self.b[j] != self.b[j-1]: + return False + return self.cons(j) + + def cvc(self, i): + """ True iff i-2,i-1,i is consonent-vowel consonant + and if second c isn't w,x, or y. + used to restore e at end of short words like cave, love, hope, crime + """ + if (i < 2 or not self.cons(i) or self.cons(i-1) or not self.cons(i-2) + or self.b[i] in 'wxy'): + return False + return True + + def ends(self, s): + """ True iff 0...k ends with string s """ + return self.b[self.k-len(s)+1:self.k+1] == s + + def setto(self, s): + """ set j+1...k to string s, readjusting k """ + length = len(s) + self.b[self.j+1:self.j+1+length] = s + self.k = self.j + length + + def r(self, s): + if self.m() > 0: + self.setto(s) + + def step1ab(self): + if self.b[self.k] == 's': + if self.ends(['s','s','e','s']): + self.k -= 2 + elif self.ends(['i', 'e', 's']): + self.setto(['i']) + elif self.b[self.k-1] != 's': + self.k -= 1 + if self.ends(['e', 'e', 'd']): + if self.m() > 0: + self.k -= 1 + elif ((self.ends(['e', 'd']) or self.ends(['i', 'n', 'g'])) + and self.vowel_in_stem()): + self.k = self.j + if self.ends(['a', 't']): + self.setto(['a', 't', 'e']) + elif self.ends(['b', 'l']): + self.setto(['b', 'l', 'e']) + elif self.ends(['i', 'z']): + self.setto(['i', 'z', 'e']) + elif self.doublec(self.k): + self.k -= 1 + if self.b[self.k] in 'lsz': + self.k += 1 + elif self.m() == 1 and self.cvc(self.k): + self.setto(['e']) + + def step1c(self): + """ turn terminal y into i if there's a vowel in stem """ + if self.ends("y") and self.vowel_in_stem(): + self.b[self.k] = 'i' + + def step2and3(self): + for end, repl in _s2_options.get(self.b[self.k-1], []): + if self.ends(end): + self.r(repl) + break + for end, repl in _s3_options.get(self.b[self.k-1], []): + if self.ends(end): + self.r(repl) + break + + def step4(self): + ch = self.b[self.k-1] + + if ch == 'o': + if not ((self.ends(['i','o','n']) and self.b[self.j] in 'st') or + self.ends(['o','u']) + ): + return + return + else: + endings = _s4_endings.get(ch, []) + for end in endings: + if self.ends(end): + break + else: + return + + if self.m() > 1: + self.k = self.j + + def step5(self): + self.j = self.k + if self.b[self.k] == 'e': + a = self.m() + if a > 1 or a == 1 and not self.cvc(self.k-1): + self.k -= 1 + if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: + self.k -= 1 + + def stem(self): + self.step1ab() + self.step1c() + self.step2and3() + self.step4() + self.step5() + + return ''.join(self.b[:self.k+1])