import logging
from abc import ABC, abstractmethod
import nltk
import pycountry
from ..components._config import ENGLISH_LANGUAGE_CODE
logger = logging.getLogger(__name__)
[docs]class Stemmer(ABC):
def __init__(self):
"""Creates a Stemmer instance."""
[docs] @abstractmethod
def stem_word(self, word):
"""
Gets the stem of a word. For example, the stem of the word 'fishing' is 'fish'.
Args:
word (str): The word to stem
Returns:
stemmed_word (str): A stemmed version of the word
"""
raise NotImplementedError
[docs] def tojson(self):
"""
Method defined to obtain recursive JSON representation of a TextPreparationPipeline.
Args:
None.
Returns:
JSON representation of TextPreparationPipeline (dict) .
"""
return {self.__class__.__name__: None}
[docs]class NoOpStemmer(Stemmer):
[docs] def stem_word(self, word):
return word
[docs]class EnglishNLTKStemmer(Stemmer):
@property
def _stemmer(self):
# lazy init the stemmer
if not hasattr(self, "__stemmer"):
setattr(self, "__stemmer", nltk.stem.PorterStemmer())
return getattr(self, "__stemmer")
[docs] def stem_word(self, word):
stem = word.lower()
if (
self._stemmer.mode == self._stemmer.NLTK_EXTENSIONS
and word in self._stemmer.pool
):
return self._stemmer.pool[word]
if self._stemmer.mode != self._stemmer.ORIGINAL_ALGORITHM and len(word) <= 2:
# With this line, strings of length 1 or 2 don't go through
# the stemming process, although no mention is made of this
# in the published algorithm.
return word
stem = self._stemmer._step1a(stem)
stem = self._stemmer._step1b(stem)
stem = self._stemmer._step1c(stem)
stem = self._stemmer._step5b(stem)
return word if stem == "" else stem
[docs]class SnowballNLTKStemmer(Stemmer):
def __init__(self, language=None):
self.language = language
@property
def _stemmer(self):
# lazy init the stemmer
if not hasattr(self, "__stemmer"):
setattr(self, "__stemmer", nltk.stem.SnowballStemmer(self.language))
return getattr(self, "__stemmer")
[docs] def stem_word(self, word):
stem = word.lower()
stem = self._stemmer.stem(stem)
return word if stem == "" else stem
[docs]class StemmerFactory:
"""Stemmer Factory Class"""
[docs] @staticmethod
def get_stemmer(stemmer: str):
"""A static method to get a stemmer.
Args:
stemmer (str): Name of the desired translator class
Returns:
(Stemmer): Stemmer Class
"""
stemmer_classes = {
EnglishNLTKStemmer.__name__: EnglishNLTKStemmer,
SnowballNLTKStemmer.__name__: SnowballNLTKStemmer,
}
stemmer_class = stemmer_classes.get(stemmer)
if not stemmer_class:
raise TypeError(f"{stemmer} is not a valid Stemmer type.")
return stemmer_class()
[docs] @staticmethod
def get_stemmer_by_language(language_code):
if not language_code:
return NoOpStemmer()
language_code = language_code.lower()
if language_code == ENGLISH_LANGUAGE_CODE:
return EnglishNLTKStemmer()
language = StemmerFactory.get_language_from_language_code(language_code)
if not language:
logger.warning(
'Language code "%s" is not supported for stemming.', language_code
)
return NoOpStemmer()
language_name = language.name.lower()
if language_name in nltk.stem.SnowballStemmer.languages:
return SnowballNLTKStemmer(language_name)
logger.warning(
'Language code "%s" is not supported for stemming.', language_code
)
return NoOpStemmer()
[docs] @staticmethod
def get_language_from_language_code(language_code):
language = None
if len(language_code) == 2:
language = pycountry.languages.get(alpha_2=language_code)
elif len(language_code) == 3:
language = pycountry.languages.get(alpha_3=language_code)
return language