import logging
import re
import string
from abc import abstractmethod
from typing import Any, List, Optional, Tuple
from unicodedata import normalize
import nltk
import spacy
from dateutil.parser import parse
from nltk.corpus import stopwords
from pydantic import BaseModel, PrivateAttr, Field
from spacy import Language # type: ignore
from spacy.cli import download # type: ignore
cleaner_func_logger: logging.Logger = logging.getLogger(__name__)
class TextCleaningFunction(BaseModel):
    """Base interface for a single token-level text cleaning step."""

    @abstractmethod
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        pass

class ToLowerCase(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        return [token.lower() for token in tokens]

class RemoveWhiteSpaceAndEmptyToken(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        stripped_tokens = [token.strip() for token in tokens]
        return [token for token in stripped_tokens if token != ""]

# Removes stop words that add little meaning to the sequence
class RemoveStopWords(TextCleaningFunction):
    stop_words: Optional[List[str]] = None
    language: Optional[str] = "english"

    def __init__(self, **data: Any):
        super().__init__(**data)
        if not self.stop_words:
            try:
                nltk.data.find("corpora/stopwords")
            except LookupError:
                nltk.download("stopwords")
            self.stop_words = stopwords.words(self.language)

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        if not self.stop_words:
            return tokens
        return [token for token in tokens if token not in self.stop_words]

class RemovePunctuation(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        # Build the punctuation translation table once and reuse it for every token
        table = str.maketrans("", "", string.punctuation)
        stripped_tokens = (token.translate(table) for token in tokens)
        return [token for token in stripped_tokens if token]

# Reduces tokens to their stems (e.g. "running" -> "run")
class TokenStemming(TextCleaningFunction):
    stemmer: Optional[Any] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        if not self.stemmer:
            try:
                from nltk.stem import PorterStemmer

                self.stemmer = PorterStemmer()
            except ImportError:
                cleaner_func_logger.warning(
                    "NLTK is not installed, so token stemming will not be applied"
                )

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        if not self.stemmer:
            return tokens
        return [self.stemmer.stem(token) for token in tokens]

class RemoveSpecialChars(TextCleaningFunction):
    """
    Strips all non-alphanumeric characters from each token and drops tokens
    that become empty as a result.
    """

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        cleaned_tokens = [re.sub("[^A-Za-z0-9]+", "", token) for token in tokens]
        return [token for token in cleaned_tokens if token != ""]

# Converts Unicode text to its closest ASCII representation
class DecodeUnicode(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        return [
            normalize("NFKD", token).encode("ascii", "ignore").decode("utf-8")
            for token in tokens
        ]

class RemoveDateTime(TextCleaningFunction):
    _white_space_cleaner = RemoveWhiteSpaceAndEmptyToken()

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        text: str = " ".join(tokens)
        try:
            fuzzy_tokens: Tuple[str, ...]
            # fuzzy_with_tokens returns the non-date fragments of the text
            _, fuzzy_tokens = parse(text, fuzzy_with_tokens=True)  # type: ignore
            tokens = " ".join(fuzzy_tokens).split()
        except ValueError:
            cleaner_func_logger.warning("Tokens do not contain a recognizable date/time format")
        return self._white_space_cleaner.execute(tokens)

# Replaces domain-specific keywords with their configured substitutes
class ReplaceDomainKeywords(TextCleaningFunction):
    domain_keywords: Optional[List[Tuple[str, str]]] = None

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        # Do nothing when no domain keywords are specified
        if not self.domain_keywords or len(self.domain_keywords) == 0:
            return tokens
        text: str = " ".join(tokens)
        for source_keyword, target_keyword in self.domain_keywords:
            if source_keyword in text or source_keyword.lower() in text:
                text = text.replace(source_keyword, target_keyword)
        tokens = text.split()
        return tokens

class RegExSubstitute(TextCleaningFunction):
    pattern: Optional[str] = None
    substitute: Optional[str] = None

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        if not self.pattern or not self.substitute:
            return tokens
        compiled_regex = re.compile(self.pattern)
        return [compiled_regex.sub(self.substitute, token) for token in tokens]

class SpacyLemmatization(TextCleaningFunction):
    _nlp: Language = PrivateAttr()
    model_name_or_path: str = Field("en_core_web_sm")
    batch_size: int = 4
    n_process: int = 1

    def __init__(self, **data: Any):
        super().__init__(**data)
        try:
            self._nlp = spacy.load(
                self.model_name_or_path,
                disable=["parser", "ner"],
            )
        except OSError:
            # spaCy raises OSError when the model is not installed; download it and retry
            download(self.model_name_or_path)
            self._nlp = spacy.load(
                self.model_name_or_path,
                disable=["parser", "ner"],
            )

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        processed_tokens: List[str] = []
        for doc in self._nlp.pipe(
            texts=tokens, batch_size=self.batch_size, n_process=self.n_process
        ):
            processed_tokens.append(" ".join([token.lemma_ for token in doc]))
        return processed_tokens
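

# A minimal usage sketch, assuming the cleaning functions above are chained in
# sequence over a whitespace-tokenized sentence. The sample text and the
# pipeline order are illustrative assumptions; running RemoveStopWords may
# trigger a one-time download of the NLTK stopwords corpus.
if __name__ == "__main__":
    sample_text = "The Quick, brown fox jumped over the lazy dog on 2021-05-01!"
    pipeline: List[TextCleaningFunction] = [
        DecodeUnicode(),
        ToLowerCase(),
        RemoveDateTime(),
        RemovePunctuation(),
        RemoveStopWords(),
        RemoveWhiteSpaceAndEmptyToken(),
    ]
    tokens: List[str] = sample_text.split()
    for cleaner in pipeline:
        tokens = cleaner.execute(tokens)
    print(tokens)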