add semi-broken porter.py
This commit is contained in:
parent
948ab78a28
commit
4b6e9eb087
|
@ -1,6 +1,6 @@
|
|||
import itertools
|
||||
import unicodedata
|
||||
|
||||
from porter import Stemmer
|
||||
|
||||
def _normalize(s):
|
||||
return unicodedata.normalize('NFKD', unicode(s))
|
||||
|
@ -426,3 +426,6 @@ def metaphone(s):
|
|||
i += 1
|
||||
|
||||
return ''.join(result).upper()
|
||||
|
||||
def porter_stem(s):
    """Return the stem of ``s`` as produced by ``Stemmer``.

    Convenience wrapper: builds a ``Stemmer`` over ``s`` and returns the
    result of its ``stem()`` call.
    """
    stemmer = Stemmer(s)
    return stemmer.stem()
|
||||
|
|
|
@ -23528,4 +23528,4 @@ zodiac,zodiac
|
|||
zodiacs,zodiac
|
||||
zone,zone
|
||||
zounds,zound
|
||||
zwagger,zwagger
|
||||
zwagger,zwagger
|
||||
|
|
|
|
@ -0,0 +1,216 @@
|
|||
_s2_options = {
|
||||
'a': ((['a','t','i','o','n','a','l'], ['a','t','e']),
|
||||
(['t','i','o','n','a','l'], ['t','i','o','n'])),
|
||||
'c': ((['e','n','c','i'], ['e','n','c','e']),
|
||||
(['a','n','c','i'], ['a','n','c','e']),
|
||||
),
|
||||
'e': ((['i','z','e','r'], ['i','z','e']),),
|
||||
'l': ((['b','l','i'], ['b','l','e']),
|
||||
(['a','l','l','i'], ['a','l']),
|
||||
(['e','n','t','l','i'], ['e','n','t']),
|
||||
(['e','l','i'], ['e']),
|
||||
(['o','u','s','l','i'], ['o','u','s']),
|
||||
),
|
||||
'o': ((['i','z','a','t','i','o','n'], ['i','z','e']),
|
||||
(['a','t','i','o','n'], ['a','t','e']),
|
||||
(['a','t','o','r'], ['a','t','e']),
|
||||
),
|
||||
's': ((['a','l','i','s','m'], ['a','l']),
|
||||
(['i','v','e','n','e','s','s'], ['i','v','e']),
|
||||
(['f','u','l','n','e','s','s'], ['f','u','l']),
|
||||
(['o','u','s','n','e','s','s'], ['o','u','s']),
|
||||
),
|
||||
't': ((['a','l','i','t','i'], ['a','l']),
|
||||
(['i','v','i','t','i'], ['i','v','e']),
|
||||
(['b','i','l','i','t','i'], ['b','l','e']),
|
||||
),
|
||||
'g': ((['l','o','g','i'], ['l','o','g']),),
|
||||
}
|
||||
|
||||
|
||||
_s3_options = {
|
||||
'e': ((['i','c','a','t','e'], ['i','c']),
|
||||
(['a','t','i','v','e'], []),
|
||||
(['a','l','i','z','e'], ['a','l']),
|
||||
),
|
||||
'i': ((['i','c','i','t','i'], ['i','c']),),
|
||||
'l': ((['i','c','a','l'], ['i','c']),
|
||||
(['f','u','l'], []),
|
||||
),
|
||||
's': ((['n','e','s','s'], []),),
|
||||
}
|
||||
|
||||
_s4_endings = {
|
||||
'a': (['a','l'],),
|
||||
'c': (['a','n','c','e'], ['e','n','c','e']),
|
||||
'e': (['e','r'],),
|
||||
'i': (['i','c'],),
|
||||
'l': (['a','b','l','e'], ['i','b','l','e']),
|
||||
'n': (['a','n','t'], ['e','m','e','n','t'], ['m','e','n','t'],
|
||||
['e','n','t']),
|
||||
# handle 'o' separately
|
||||
'i': (['i','s','m'],),
|
||||
't': (['a','t','e'], ['i','t','i']),
|
||||
'u': (['o','u','s'],),
|
||||
'v': (['i','v','e'],),
|
||||
'z': (['i','z','e'],),
|
||||
}
|
||||
|
||||
class Stemmer(object):
|
||||
def __init__(self, b):
|
||||
self.b = list(b)
|
||||
self.k = len(b)-1
|
||||
self.j = 0
|
||||
|
||||
def cons(self, i):
|
||||
""" True iff b[i] is a consonant """
|
||||
if self.b[i] in 'aeiou':
|
||||
return False
|
||||
elif self.b[i] == 'y':
|
||||
return True if i == 0 else not self.cons(i-1)
|
||||
return True
|
||||
|
||||
def m(self):
|
||||
n = i = 0
|
||||
while True:
|
||||
if i > self.j:
|
||||
return n
|
||||
if not self.cons(i):
|
||||
break
|
||||
i += 1
|
||||
i += 1
|
||||
while True:
|
||||
while True:
|
||||
if i > self.j:
|
||||
return n
|
||||
if self.cons(i):
|
||||
break
|
||||
i += 1
|
||||
|
||||
i += 1
|
||||
n += 1
|
||||
|
||||
while True:
|
||||
if i > self.j:
|
||||
return n
|
||||
if not self.cons(i):
|
||||
break
|
||||
i += 1
|
||||
i += 1
|
||||
|
||||
def vowel_in_stem(self):
|
||||
""" True iff 0...j contains vowel """
|
||||
for i in xrange(0, self.j+1):
|
||||
if not self.cons(i):
|
||||
return True
|
||||
return False
|
||||
|
||||
def doublec(self, j):
|
||||
""" True iff j, j-1 contains double consonant """
|
||||
if j < 1 or self.b[j] != self.b[j-1]:
|
||||
return False
|
||||
return self.cons(j)
|
||||
|
||||
def cvc(self, i):
|
||||
""" True iff i-2,i-1,i is consonent-vowel consonant
|
||||
and if second c isn't w,x, or y.
|
||||
used to restore e at end of short words like cave, love, hope, crime
|
||||
"""
|
||||
if (i < 2 or not self.cons(i) or self.cons(i-1) or not self.cons(i-2)
|
||||
or self.b[i] in 'wxy'):
|
||||
return False
|
||||
return True
|
||||
|
||||
def ends(self, s):
|
||||
""" True iff 0...k ends with string s """
|
||||
return self.b[self.k-len(s)+1:self.k+1] == s
|
||||
|
||||
def setto(self, s):
|
||||
""" set j+1...k to string s, readjusting k """
|
||||
length = len(s)
|
||||
self.b[self.j+1:self.j+1+length] = s
|
||||
self.k = self.j + length
|
||||
|
||||
def r(self, s):
|
||||
if self.m() > 0:
|
||||
self.setto(s)
|
||||
|
||||
def step1ab(self):
|
||||
if self.b[self.k] == 's':
|
||||
if self.ends(['s','s','e','s']):
|
||||
self.k -= 2
|
||||
elif self.ends(['i', 'e', 's']):
|
||||
self.setto(['i'])
|
||||
elif self.b[self.k-1] != 's':
|
||||
self.k -= 1
|
||||
if self.ends(['e', 'e', 'd']):
|
||||
if self.m() > 0:
|
||||
self.k -= 1
|
||||
elif ((self.ends(['e', 'd']) or self.ends(['i', 'n', 'g']))
|
||||
and self.vowel_in_stem()):
|
||||
self.k = self.j
|
||||
if self.ends(['a', 't']):
|
||||
self.setto(['a', 't', 'e'])
|
||||
elif self.ends(['b', 'l']):
|
||||
self.setto(['b', 'l', 'e'])
|
||||
elif self.ends(['i', 'z']):
|
||||
self.setto(['i', 'z', 'e'])
|
||||
elif self.doublec(self.k):
|
||||
self.k -= 1
|
||||
if self.b[self.k] in 'lsz':
|
||||
self.k += 1
|
||||
elif self.m() == 1 and self.cvc(self.k):
|
||||
self.setto(['e'])
|
||||
|
||||
def step1c(self):
|
||||
""" turn terminal y into i if there's a vowel in stem """
|
||||
if self.ends("y") and self.vowel_in_stem():
|
||||
self.b[self.k] = 'i'
|
||||
|
||||
def step2and3(self):
|
||||
for end, repl in _s2_options.get(self.b[self.k-1], []):
|
||||
if self.ends(end):
|
||||
self.r(repl)
|
||||
break
|
||||
for end, repl in _s3_options.get(self.b[self.k-1], []):
|
||||
if self.ends(end):
|
||||
self.r(repl)
|
||||
break
|
||||
|
||||
def step4(self):
|
||||
ch = self.b[self.k-1]
|
||||
|
||||
if ch == 'o':
|
||||
if not ((self.ends(['i','o','n']) and self.b[self.j] in 'st') or
|
||||
self.ends(['o','u'])
|
||||
):
|
||||
return
|
||||
return
|
||||
else:
|
||||
endings = _s4_endings.get(ch, [])
|
||||
for end in endings:
|
||||
if self.ends(end):
|
||||
break
|
||||
else:
|
||||
return
|
||||
|
||||
if self.m() > 1:
|
||||
self.k = self.j
|
||||
|
||||
def step5(self):
|
||||
self.j = self.k
|
||||
if self.b[self.k] == 'e':
|
||||
a = self.m()
|
||||
if a > 1 or a == 1 and not self.cvc(self.k-1):
|
||||
self.k -= 1
|
||||
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
|
||||
self.k -= 1
|
||||
|
||||
def stem(self):
|
||||
self.step1ab()
|
||||
self.step1c()
|
||||
self.step2and3()
|
||||
self.step4()
|
||||
self.step5()
|
||||
|
||||
return ''.join(self.b[:self.k+1])
|
Loading…
Reference in New Issue