import logging
import re
import string
from abc import abstractmethod
from typing import Any, List, Optional, Tuple
from unicodedata import normalize

import nltk
import spacy
from dateutil.parser import parse
from nltk.corpus import stopwords
from pydantic import BaseModel, PrivateAttr, Field
from spacy import Language  # type: ignore
from spacy.cli import download  # type: ignore

cleaner_func_logger: logging.Logger = logging.getLogger(__name__)


# Base interface for all cleaning steps; each subclass transforms a list of tokens
class TextCleaningFunction(BaseModel):
    @abstractmethod
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        pass


class ToLowerCase(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        return [token.lower() for token in tokens]


class RemoveWhiteSpaceAndEmptyToken(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        striped_tokens = [token.strip() for token in tokens]
        return [token for token in striped_tokens if token != ""]


# Removes words that don't add any meaning to the sequence
class RemoveStopWords(TextCleaningFunction):
    stop_words: Optional[List[str]] = None
    language: Optional[str] = "english"

    def __init__(self, **data: Any):
        super().__init__(**data)
        if not self.stop_words:
            try:
                # the stopwords corpus lives under "corpora/" in the NLTK data path
                nltk.data.find("corpora/stopwords")
            except LookupError:
                nltk.download("stopwords")
            self.stop_words = stopwords.words(self.language)

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        if not self.stop_words:
            return tokens
        return [token for token in tokens if token not in self.stop_words]


class RemovePunctuation(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        # build the translation table once instead of twice per token
        table = str.maketrans("", "", string.punctuation)
        stripped_tokens = [token.translate(table) for token in tokens]
        return [token for token in stripped_tokens if token]


# Reduces tokens to their base stems (e.g. "running" -> "run")
class TokenStemming(TextCleaningFunction):
    stemmer: Optional[Any] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        if not self.stemmer:
            try:
                from nltk.stem import PorterStemmer

                self.stemmer = PorterStemmer()
            except ImportError:
                cleaner_func_logger.warning(
                    "NLTK is not installed, so token stemming will be skipped"
                )

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        if not self.stemmer:
            return tokens
        return [self.stemmer.stem(token) for token in tokens]


class RemoveSpecialChars(TextCleaningFunction):
    """
    Removes special characters by eliminating all characters from each token
    and only retains alphabetic, numeric or alphanumeric tokens by stripping
    special characters from them
    """

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        cleaned_tokens = [re.sub("[^A-Za-z0-9]+", "", token) for token in tokens]
        return [token for token in cleaned_tokens if token != ""]


# Converts unicode characters to their closest ASCII equivalents, dropping unmappable ones
class DecodeUnicode(TextCleaningFunction):
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        return [
            normalize("NFKD", token).encode("ascii", "ignore").decode("utf-8")
            for token in tokens
        ]


# Removes date/time expressions detected by dateutil's fuzzy parser
class RemoveDateTime(TextCleaningFunction):
    _white_space_cleaner = RemoveWhiteSpaceAndEmptyToken()

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        text: str = " ".join(tokens)
        try:
            fuzzy_tokens: Tuple[str, ...]
            # fuzzy_with_tokens returns the non-date fragments of the text
            _, fuzzy_tokens = parse(text, fuzzy_with_tokens=True)  # type: ignore
            tokens = " ".join(fuzzy_tokens).split()
        except ValueError:
            cleaner_func_logger.warning("Tokens do not contain a parsable date/time")
        return self._white_space_cleaner.execute(tokens)


# Replaces domain specific keywords with their configured substitutes
class ReplaceDomainKeywords(TextCleaningFunction):
    domain_keywords: Optional[List[Tuple[str, str]]] = None

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        # don't do anything when no domain keywords specified
        if not self.domain_keywords or len(self.domain_keywords) == 0:
            return tokens

        text: str = " ".join(tokens)
        for source_keyword, target_keyword in self.domain_keywords:
            # replace both the original and the lowercased form of the keyword
            if source_keyword in text:
                text = text.replace(source_keyword, target_keyword)
            if source_keyword.lower() in text:
                text = text.replace(source_keyword.lower(), target_keyword)
        return text.split()


# Applies a regular expression substitution to every token
class RegExSubstitute(TextCleaningFunction):
    pattern: Optional[str] = None
    substitute: Optional[str] = None

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        if not self.pattern or not self.substitute:
            return tokens

        compiled_regex = re.compile(self.pattern)

        return [compiled_regex.sub(self.substitute, token) for token in tokens]


# Lemmatizes tokens with a spaCy pipeline, downloading the model if needed
class SpacyLemmatization(TextCleaningFunction):
    _nlp: Language = PrivateAttr()
    model_name_or_path: str = Field("en_core_web_sm")
    batch_size: int = 4
    n_process: int = 1

    def __init__(self, **data: Any):
        super().__init__(**data)
        try:
            self._nlp = spacy.load(
                self.model_name_or_path,
                disable=["parser", "ner"],
            )
        except OSError:
            # the model is not installed locally; download it and retry
            download(self.model_name_or_path)
            self._nlp = spacy.load(
                self.model_name_or_path,
                disable=["parser", "ner"],
            )

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        processed_tokens: List[str] = []
        for doc in self._nlp.pipe(
            texts=tokens, batch_size=self.batch_size, n_process=self.n_process
        ):
            processed_tokens.append(" ".join([token.lemma_ for token in doc]))
        return processed_tokens
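

# Minimal usage sketch (illustrative addition, not part of the original module):
# the cleaning functions are meant to be chained, each consuming and returning a
# list of tokens. The classes and arguments used below are defined above; only
# the sample sentence is made up.
if __name__ == "__main__":
    pipeline: List[TextCleaningFunction] = [
        ToLowerCase(),
        RemovePunctuation(),
        RemoveStopWords(),
        RemoveSpecialChars(),
    ]
    sample_tokens = "The Quick, Brown Fox jumped over 12 lazy dogs!".split()
    for cleaner in pipeline:
        sample_tokens = cleaner.execute(sample_tokens)
    print(sample_tokens)  # e.g. ['quick', 'brown', 'fox', 'jumped', '12', 'lazy', 'dogs']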