Spaces:
Sleeping
Sleeping
File size: 1,146 Bytes
b93b2dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import nltk, json
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# from app.main import get_metadata
from config import config
class Preprocessor:
    """Normalize raw documents: lowercase, drop English stop words, stem.

    The stop-word set and the stemmer are created lazily on first use and
    then reused, so processing many documents does not re-trigger the NLTK
    download check or rebuild the same objects for every document.
    """

    # Lazily populated caches; class-level defaults keep instances usable
    # even when they are created without running an ``__init__``.
    _stop_words = None
    _stemmer = None

    def _tokenize(self, text):
        """Return the lowercase tokens of *text*, split on whitespace.

        ``str.split()`` without an argument splits on any run of whitespace
        (spaces, tabs, newlines) and never yields empty strings, so an empty
        or all-whitespace input gives an empty list.
        """
        return text.lower().split()

    def _get_stop_words(self):
        """Return the cached English stop-word set, loading it on first use."""
        if self._stop_words is None:
            # Only hit the downloader when the corpus is not available
            # locally; ``nltk.data.find`` raises LookupError in that case.
            try:
                nltk.data.find('corpora/stopwords')
            except LookupError:
                nltk.download('stopwords')
            self._stop_words = frozenset(stopwords.words('english'))
        return self._stop_words

    def _get_stemmer(self):
        """Return the cached ``PorterStemmer``, creating it on first use."""
        if self._stemmer is None:
            self._stemmer = PorterStemmer()
        return self._stemmer

    def preprocessing_text(self, doc):
        """Return *doc* as a single string of stemmed, non-stop-word tokens.

        The document is lowercased and split on whitespace, tokens found in
        NLTK's English stop-word list are removed, the remaining tokens are
        stemmed with the Porter stemmer and joined with single spaces.
        """
        stop_words = self._get_stop_words()
        stemmer = self._get_stemmer()
        stemmed_tokens = [
            stemmer.stem(token)
            for token in self._tokenize(doc)
            if token not in stop_words
        ]
        return " ".join(stemmed_tokens)

    def _save(self, docs):
        """Write *docs* as indented JSON to ``../preprocessing_text.json``.

        The path is relative, so it resolves against the current working
        directory of the process, not against this file's location.
        """
        with open("../preprocessing_text.json", "w", encoding="utf-8") as f:
            json.dump(docs, f, indent=4)

    def preprocessing(self, docs):
        """Preprocess every document in *docs* and save the results to disk.

        Each item of *docs* is passed through ``preprocessing_text``; the
        resulting list of strings is handed to ``_save``. Returns ``None``.
        """
        preprocessed_docs = [self.preprocessing_text(doc) for doc in docs]
        self._save(preprocessed_docs)
if __name__ == '__main__':
    # The module-level import of ``get_metadata`` is commented out, so without
    # this local import the call below fails with NameError. Importing it only
    # when run as a script keeps it off the normal import path of this module.
    # NOTE(review): presumably the top-level import was disabled to avoid a
    # circular import with ``app.main`` -- confirm that ``app.main`` is
    # importable from the directory this script is launched in.
    from app.main import get_metadata

    # Only the first element of the returned pair is used here.
    texts, _ = get_metadata(config.PATH_METADATA)
    preprocessor = Preprocessor()
    preprocessor.preprocessing(texts)
|