"""
@author : Sakshi Tantak

Predict tweet sentiment with the classical ML pipeline: clean and normalise the
text, vectorise it with a count vectorizer and TF-IDF transform, and classify it
with the pickled Naive Bayes model.
"""
# Imports
import re
import string
import pickle
from time import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import emoji
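
# Filesystem locations of the pickled classifier and vectorizer artifacts (defined in the local paths module)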
from paths import COUNT_VECTORIZER_PATH, TFIDF_VECTORIZER_PATH, NB_MODEL_PATH as MODEL_PATH

# Download the NLTK resources needed for tokenisation, lemmatisation and stopword removal
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

# Keep negation words out of the stopword list so sentiment-bearing negations survive
stops = stopwords.words('english')
negatives = ['no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
             "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
             'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't", 'don', "don't"]
stops = set([stop for stop in stops if stop not in negatives])

lemmatizer = WordNetLemmatizer()

# Lazily-loaded singletons, populated on the first call to load_model()
MODEL, COUNT_VECTORIZER, TFIDF = None, None, None

def clean_text(text):
    """Normalise raw tweet text before vectorisation."""
    # Collapse repeated punctuation into a single character
    text = re.sub(r'[\.]+', '.', text)
    text = re.sub(r'[\!]+', '!', text)
    text = re.sub(r'[\?]+', '?', text)
    # Collapse whitespace and lowercase
    text = re.sub(r'\s+', ' ', text).strip().lower()
    # Strip @mentions
    text = re.sub(r'@\w+', '', text).strip().lower()
    # Map elongated forms like "nooo" to "no"
    text = re.sub(r'\s[n]+[o]+', ' no', text)
    # Expand common contractions
    text = re.sub(r'n\'t', 'n not', text)
    text = re.sub(r'\'nt', 'n not', text)
    text = re.sub(r'\'re', ' are', text)
    text = re.sub(r'\'s', ' is', text)
    text = re.sub(r'\'d', ' would', text)
    text = re.sub(r'\'ll', ' will', text)
    text = re.sub(r'\'ve', ' have', text)
    text = re.sub(r'\'m', ' am', text)
    # Map variations of "nope" to "no"
    text = re.sub(r'\s[n]+[o]+[p]+[e]+', ' no', text)
    # Remove URLs and websites mentioned in the text
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\~)*\b', '', text, flags=re.MULTILINE).strip()
    text = re.sub(r'(www\.)(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE).strip()
    text = re.sub(r'\w+\.com', '', text).strip()
    # Replace emojis with their textual names
    text = emoji.demojize(text)
    return text

def remove_punctuation(text):
    """Replace every punctuation character with a space, then collapse whitespace."""
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    return re.sub(r'\s+', ' ', text).strip()

def remove_numbers(text):
    """Strip all digits from the text."""
    return re.sub(r'[0-9]+', '', text)

def remove_stopwords_and_lemmatize(text):
    """Tokenise, drop (non-negation) stopwords and lemmatise the remaining tokens."""
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens if token.strip() not in stops]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

def load_model():
    """Load the pickled classifier and vectorizers once and cache them in module globals."""
    global MODEL, COUNT_VECTORIZER, TFIDF
    if MODEL is None:
        with open(MODEL_PATH, 'rb') as f:
            print('Loading classifier ...')
            start = time()
            MODEL = pickle.load(f)
            print(f'Time taken to load model = {time() - start}')
    if COUNT_VECTORIZER is None:
        with open(COUNT_VECTORIZER_PATH, 'rb') as f:
            print('Loading count vectorizer ...')
            start = time()
            COUNT_VECTORIZER = pickle.load(f)
            print(f'Time taken to load count vectorizer = {time() - start}')
    if TFIDF is None:
        with open(TFIDF_VECTORIZER_PATH, 'rb') as f:
            print('Loading tfidf vectorizer ...')
            start = time()
            TFIDF = pickle.load(f)
            print(f'Time taken to load tfidf vectorizer = {time() - start}')
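
# The pickled artifacts are assumed to come from a scikit-learn pipeline along
# these lines (a minimal sketch, not the actual training script; the real
# feature settings, classifier and label encoding may differ, and train_texts /
# train_labels are placeholder names):
#
#   from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
#   from sklearn.naive_bayes import MultinomialNB
#
#   count_vectorizer = CountVectorizer()
#   counts = count_vectorizer.fit_transform(train_texts)   # bag-of-words counts
#   tfidf = TfidfTransformer()
#   vectors = tfidf.fit_transform(counts)                   # re-weight counts by TF-IDF
#   model = MultinomialNB().fit(vectors, train_labels)      # 1 = positive, per predict() below
#
#   with open(COUNT_VECTORIZER_PATH, 'wb') as f:
#       pickle.dump(count_vectorizer, f)
#   with open(TFIDF_VECTORIZER_PATH, 'wb') as f:
#       pickle.dump(tfidf, f)
#   with open(MODEL_PATH, 'wb') as f:
#       pickle.dump(model, f)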

def predict(text):
    """Preprocess a tweet, vectorise it and return a (label, class) pair from the classifier."""
    if MODEL is None:
        load_model()
    # Apply the same preprocessing used at training time
    text = clean_text(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_stopwords_and_lemmatize(text)
    # Bag-of-words counts, then TF-IDF weighting
    vector = COUNT_VECTORIZER.transform([text]).toarray()
    vector = TFIDF.transform(vector).toarray()
    start = time()
    prediction = MODEL.predict(vector).item()
    print(f'Inference time = {time() - start}')
    return ('positive', 1) if prediction == 1 else ('negative', 0)
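
# Quick manual test from the command line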
if __name__ == '__main__':
text = input('Enter tweet : ')
# text = "i am so bored!!!"
prediction = predict(text)
print(text, ' : ', prediction)