NLP / script /preprocessing_text.py
maximka608
test
b93b2dc
import nltk, json
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# from app.main import get_metadata
from config import config
class Preprocessor:
def _tokenize(self, text):
text = text.lower().split(' ')
return text
def preprocessing_text(self, doc):
tokens = self._tokenize(doc)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if not token in stop_words]
stemmer = PorterStemmer()
stemmed_tokes = [stemmer.stem(filtered_token) for filtered_token in filtered_tokens]
preprocess_text = " ".join(stemmed_tokes)
return preprocess_text
def _save(self, docs):
with open("../preprocessing_text.json", "w") as f:
json.dump(docs, f, indent=4)
def preprocessing(self, docs):
preprocessed_docs = [self.preprocessing_text(doc) for doc in docs]
self._save(preprocessed_docs)
if __name__ == '__main__':
texts, _ = get_metadata(config.PATH_METADATA)
preprocessor = Preprocessor()
preprocessor.preprocessing(texts)