# A class implementing the Porter stemmer, taking in a word and extrating # the root or stem. # Author: Anish Sachdeva # Source: https://github.com/anishLearnsToCode/porter-stemmer class PorterStemmer: def __init__(self): """The word is a buffer holding a word to be stemmed. The letters are in the range [start, offset ... offset + 1) ... ending at end.""" self.vowels = ('a', 'e', 'i', 'o', 'u') self.word = '' self.end = 0 self.start = 0 self.offset = 0 def is_vowel(self, letter): return letter in self.vowels def is_consonant(self, index): """:returns True if word[index] is a consonant.""" if self.is_vowel(self.word[index]): return False if self.word[index] == 'y': if index == self.start: return True else: return not self.is_consonant(index - 1) return True def m(self): """m() measures the number of consonant sequences between start and offset. if c is a consonant sequence and v a vowel sequence, and <..> indicates arbitrary presence, gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 .... """ n = 0 i = self.start while True: if i > self.offset: return n if not self.is_consonant(i): break i += 1 i += 1 while True: while True: if i > self.offset: return n if self.is_consonant(i): break i += 1 i += 1 n += 1 while True: if i > self.offset: return n if not self.is_consonant(i): break i += 1 i += 1 def contains_vowel(self): """:returns TRUE if the word contains a vowel in the range [start, offset]""" for i in range(self.start, self.offset + 1): if not self.is_consonant(i): return True return False def contains_double_consonant(self, j): """:returns TRUE if the word contain a double consonant in the range [offset, start]""" if j < (self.start + 1): return False if self.word[j] != self.word[j - 1]: return False return self.is_consonant(j) def is_of_form_cvc(self, i): """:returns TRUE for indices set {i-2, i-1, i} has the form consonant - vowel - consonant and also if the second c is not w,x or y. this is used when trying to restore an e at the end of a short e.g. cav(e), lov(e), hop(e), crim(e), but snow, box, tray. """ if i < (self.start + 2) or not self.is_consonant(i) or self.is_consonant(i - 1) or not self.is_consonant(i - 2): return 0 ch = self.word[i] if ch == 'w' or ch == 'x' or ch == 'y': return 0 return 1 def ends_with(self, s): """:returns TRUE when {start...end} ends with the string s.""" length = len(s) if s[length - 1] != self.word[self.end]: # tiny speed-up return False if length > (self.end - self.start + 1): return False if self.word[self.end - length + 1: self.end + 1] != s: return False self.offset = self.end - length return True def set_to(self, s): """sets [offset + 1, end] to the characters in the string s, readjusting end.""" length = len(s) self.word = self.word[:self.offset + 1] + s + self.word[self.offset + length + 1:] self.end = self.offset + length def replace_morpheme(self, s): """is a mapping function to change morphemes""" if self.m() > 0: self.set_to(s) def remove_plurals(self): """This is step 1 ab and gets rid of plurals and -ed or -ing. e.g. caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat feed -> feed agreed -> agree disabled -> disable matting -> mat mating -> mate meeting -> meet milling -> mill messing -> mess meetings -> meet """ if self.word[self.end] == 's': if self.ends_with("sses"): self.end = self.end - 2 elif self.ends_with("ies"): self.set_to("i") elif self.word[self.end - 1] != 's': self.end = self.end - 1 if self.ends_with("eed"): if self.m() > 0: self.end = self.end - 1 elif (self.ends_with("ed") or self.ends_with("ing")) and self.contains_vowel(): self.end = self.offset if self.ends_with("at"): self.set_to("ate") elif self.ends_with("bl"): self.set_to("ble") elif self.ends_with("iz"): self.set_to("ize") elif self.contains_double_consonant(self.end): self.end = self.end - 1 ch = self.word[self.end] if ch == 'l' or ch == 's' or ch == 'z': self.end = self.end + 1 elif self.m() == 1 and self.is_of_form_cvc(self.end): self.set_to("e") def terminal_y_to_i(self): """This defines step 1 c which turns terminal y to i when there is another vowel in the stem.""" if self.ends_with('y') and self.contains_vowel(): self.word = self.word[:self.end] + 'i' + self.word[self.end + 1:] def map_double_to_single_suffix(self): """Defines step 2 and maps double suffices to single ones. so -ization ( = -ize plus -ation) maps to -ize etc. note that the string before the suffix must give m() > 0. """ if self.word[self.end - 1] == 'a': if self.ends_with("ational"): self.replace_morpheme("ate") elif self.ends_with("tional"): self.replace_morpheme("tion") elif self.word[self.end - 1] == 'c': if self.ends_with("enci"): self.replace_morpheme("ence") elif self.ends_with("anci"): self.replace_morpheme("ance") elif self.word[self.end - 1] == 'e': if self.ends_with("izer"): self.replace_morpheme("ize") elif self.word[self.end - 1] == 'l': if self.ends_with("bli"): self.replace_morpheme("ble") # --DEPARTURE-- # To match the published algorithm, replace this phrase with # if self.ends("abli"): self.r("able") elif self.ends_with("alli"): self.replace_morpheme("al") elif self.ends_with("entli"): self.replace_morpheme("ent") elif self.ends_with("eli"): self.replace_morpheme("e") elif self.ends_with("ousli"): self.replace_morpheme("ous") elif self.word[self.end - 1] == 'o': if self.ends_with("ization"): self.replace_morpheme("ize") elif self.ends_with("ation"): self.replace_morpheme("ate") elif self.ends_with("ator"): self.replace_morpheme("ate") elif self.word[self.end - 1] == 's': if self.ends_with("alism"): self.replace_morpheme("al") elif self.ends_with("iveness"): self.replace_morpheme("ive") elif self.ends_with("fulness"): self.replace_morpheme("ful") elif self.ends_with("ousness"): self.replace_morpheme("ous") elif self.word[self.end - 1] == 't': if self.ends_with("aliti"): self.replace_morpheme("al") elif self.ends_with("iviti"): self.replace_morpheme("ive") elif self.ends_with("biliti"): self.replace_morpheme("ble") elif self.word[self.end - 1] == 'g': if self.ends_with("logi"): self.replace_morpheme("log") def step3(self): """step3() deals with -ic-, -full, -ness etc.""" if self.word[self.end] == 'e': if self.ends_with("icate"): self.replace_morpheme("ic") elif self.ends_with("ative"): self.replace_morpheme("") elif self.ends_with("alize"): self.replace_morpheme("al") elif self.word[self.end] == 'i': if self.ends_with("iciti"): self.replace_morpheme("ic") elif self.word[self.end] == 'l': if self.ends_with("ical"): self.replace_morpheme("ic") elif self.ends_with("ful"): self.replace_morpheme("") elif self.word[self.end] == 's': if self.ends_with("ness"): self.replace_morpheme("") def step4(self): """step4() takes off -ant, -ence etc., in context vcvc.""" if self.word[self.end - 1] == 'a': if self.ends_with("al"): pass else: return elif self.word[self.end - 1] == 'c': if self.ends_with("ance"): pass elif self.ends_with("ence"): pass else: return elif self.word[self.end - 1] == 'e': if self.ends_with("er"): pass else: return elif self.word[self.end - 1] == 'i': if self.ends_with("ic"): pass else: return elif self.word[self.end - 1] == 'l': if self.ends_with("able"): pass elif self.ends_with("ible"): pass else: return elif self.word[self.end - 1] == 'n': if self.ends_with("ant"): pass elif self.ends_with("ement"): pass elif self.ends_with("ment"): pass elif self.ends_with("ent"): pass else: return elif self.word[self.end - 1] == 'o': if self.ends_with("ion") and (self.word[self.offset] == 's' or self.word[self.offset] == 't'): pass elif self.ends_with("ou"): pass # takes care of -ous else: return elif self.word[self.end - 1] == 's': if self.ends_with("ism"): pass else: return elif self.word[self.end - 1] == 't': if self.ends_with("ate"): pass elif self.ends_with("iti"): pass else: return elif self.word[self.end - 1] == 'u': if self.ends_with("ous"): pass else: return elif self.word[self.end - 1] == 'v': if self.ends_with("ive"): pass else: return elif self.word[self.end - 1] == 'z': if self.ends_with("ize"): pass else: return else: return if self.m() > 1: self.end = self.offset def step5(self): """step5() removes a final -e if m() > 1, and changes -ll to -l if m > 1.""" self.offset = self.end if self.word[self.end] == 'e': a = self.m() if a > 1 or (a == 1 and not self.is_of_form_cvc(self.end - 1)): self.end = self.end - 1 if self.word[self.end] == 'l' and self.contains_double_consonant(self.end) and self.m() > 1: self.end = self.end - 1 def stem_document(self, document): result = [] for line in document.split('\n'): result.append(self.stem_sentence(line)) return '\n'.join(result) def alphabetic(self, word): return ''.join([letter if letter.isalpha() else '' for letter in word]) def stem_sentence(self, sentence): result = [] for word in sentence.split(): result.append(self.stem_word(word)) return ' '.join(result) # Remove non-alphanumerical characters def clean(self, word): if word == '': return '' res = "" for c in word: if c.isalpha(): res = res + c return res # Main function of the stemmer performing all the operations def stem_word(self, word): word = self.clean(word) if word == '': return '' self.word = word self.end = len(word) - 1 self.start = 0 self.remove_plurals() self.terminal_y_to_i() self.map_double_to_single_suffix() self.step3() self.step4() self.step5() return self.word[self.start: self.end + 1]