File size: 995 Bytes
33d6818
 
 
 
 
 
 
 
 
 
 
 
71506cb
33d6818
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# -*- coding: utf-8 -*-
"""
Created on Mon May  8 23:58:07 2023

@author: ME
"""
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('stopwords')
lm = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

class Preprocessing:
  def __init__(self,data):
     self.data = data

  def preprocess_text(self):
        lm = WordNetLemmatizer()
        #initialise corpus to store texts p
        pred_data = [self.data]
        preprocessed_data = []

        for data in pred_data:
            review = re.sub("a-zA-Z0-9"," ",data)
            review = review.lower() #convert to lower case
            review = review.split() #Tokenize text
            review = [lm.lemmatize(x) for x in review if x not in list(stop_words)] #lemmatize and removing stopwords
            review  = " ".join(review) #join as text
            preprocessed_data.append(review)
            
        return preprocessed_data