File size: 1,146 Bytes
b93b2dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import nltk, json
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# from app.main import get_metadata
from config import config


class Preprocessor:
    """Normalise raw documents: lower-case, tokenize, drop stop words, stem.

    The NLTK stop-word list and the Porter stemmer are loaded lazily on
    first use and then cached on the instance, so constructing a
    ``Preprocessor`` performs no I/O and processing many documents does
    not repeat the corpus lookup/download for each one.
    """

    # Class-level defaults for the lazily populated per-instance cache.
    _stop_words = None
    _stemmer = None

    def _tokenize(self, text):
        """Return the lower-cased, whitespace-separated tokens of *text*.

        ``str.split()`` without an argument is used deliberately: it also
        splits on tabs/newlines and never yields empty tokens for runs of
        whitespace, so stop words next to a line break are still filtered.
        """
        return text.lower().split()

    def _load_resources(self):
        """Return ``(stop_words, stemmer)``, loading and caching them once."""
        if self._stop_words is None:
            try:
                words = stopwords.words('english')
            except LookupError:
                # The corpus is not available locally yet: fetch it once
                # and retry instead of hitting the downloader on every call.
                nltk.download('stopwords')
                words = stopwords.words('english')
            self._stop_words = set(words)
        if self._stemmer is None:
            self._stemmer = PorterStemmer()
        return self._stop_words, self._stemmer

    def preprocessing_text(self, doc):
        """Return *doc* with stop words removed and remaining tokens stemmed.

        Args:
            doc: The raw document text.

        Returns:
            The stemmed, stop-word-free tokens joined by single spaces.
        """
        stop_words, stemmer = self._load_resources()
        stemmed_tokens = [
            stemmer.stem(token)
            for token in self._tokenize(doc)
            if token not in stop_words
        ]
        return " ".join(stemmed_tokens)

    def _save(self, docs, path="../preprocessing_text.json"):
        """Write *docs* as indented JSON to *path*.

        Args:
            docs: A JSON-serialisable object (here: list of strings).
            path: Output file; defaults to the original hard-coded location,
                which is relative to the current working directory.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump(docs, f, indent=4)

    def preprocessing(self, docs):
        """Preprocess every document in *docs*, save the result, and return it.

        Args:
            docs: An iterable of raw document strings.

        Returns:
            The list of preprocessed documents, in input order.
        """
        preprocessed_docs = [self.preprocessing_text(doc) for doc in docs]
        self._save(preprocessed_docs)
        return preprocessed_docs

if __name__ == '__main__':
    # ``get_metadata`` is not imported at module level (that import is
    # commented out at the top of the file), so calling it here raised
    # NameError. Import it locally so the script entry point is runnable.
    # NOTE(review): the module-level import was presumably disabled to avoid
    # a circular import with app.main — confirm the module path is correct.
    from app.main import get_metadata

    # Only the first element of the returned pair (the texts) is used.
    texts, _ = get_metadata(config.PATH_METADATA)
    preprocessor = Preprocessor()
    preprocessor.preprocessing(texts)