"""Preprocessing pipeline: tokenize, remove English stopwords, and stem documents."""
import json

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# from app.main import get_metadata
from config import config
class Preprocessor:
    """Tokenize, stopword-filter, and Porter-stem documents, then persist them as JSON."""

    def __init__(self):
        # Fetch the stopword corpus and build the heavy helpers ONCE here.
        # The original re-downloaded the corpus and re-created the stemmer
        # and stop-word set for every single document.
        nltk.download('stopwords')
        self._stop_words = set(stopwords.words('english'))
        self._stemmer = PorterStemmer()

    def _tokenize(self, text):
        """Lowercase *text* and split on single spaces (original behavior kept)."""
        return text.lower().split(' ')

    def preprocessing_text(self, doc):
        """Return *doc* lowercased, with English stopwords removed and tokens Porter-stemmed.

        Args:
            doc: raw document string.

        Returns:
            A single space-joined string of stemmed, non-stopword tokens.
        """
        tokens = self._tokenize(doc)
        filtered_tokens = [token for token in tokens if token not in self._stop_words]
        stemmed_tokens = [self._stemmer.stem(token) for token in filtered_tokens]
        return " ".join(stemmed_tokens)

    def _save(self, docs):
        # NOTE(review): relative path assumes the script runs one level below
        # the project root — confirm against the invoking working directory.
        with open("../preprocessing_text.json", "w") as f:
            json.dump(docs, f, indent=4)

    def preprocessing(self, docs):
        """Preprocess every document in *docs* and write the results to disk.

        Args:
            docs: iterable of raw document strings.
        """
        preprocessed_docs = [self.preprocessing_text(doc) for doc in docs]
        self._save(preprocessed_docs)
if __name__ == '__main__':
    # Local import: the top-of-file import of get_metadata was commented out,
    # so running this script raised NameError. Importing here restores the
    # entry point without adding a hard module-level dependency on app.main.
    from app.main import get_metadata

    texts, _ = get_metadata(config.PATH_METADATA)
    preprocessor = Preprocessor()
    preprocessor.preprocessing(texts)