Spaces:
Runtime error
Runtime error
File size: 4,133 Bytes
8c46649 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import pandas as pd
import re
from textblob import TextBlob
import numpy as np
import nltk
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm.notebook import tqdm
# The VADER lexicon has to exist locally before the analyzer is constructed:
# SentimentIntensityAnalyzer() loads the lexicon file in its constructor, so
# downloading afterwards is too late on a fresh environment.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def preprocess(data: str) -> pd.DataFrame:
    """Parse an exported chat log and attach sentiment scores to each message.

    The text is split on timestamps shaped like ``M/D/YY, H:MM - ``. Each
    chunk is separated into a sender and a message body; chunks without a
    ``sender: `` prefix are attributed to ``'group_notification'``. Group
    notifications, ``<Media omitted>`` placeholders and empty messages are
    discarded. Every remaining message is scored with VADER, with the
    ``cardiffnlp/twitter-roberta-base-sentiment`` model and with TextBlob.

    Args:
        data: Full text of the chat export.

    Returns:
        A DataFrame with one row per retained message, in chat order, with
        the columns ``users``, ``neg``, ``neu``, ``pos``, ``compound``,
        ``roberta_neg``, ``roberta_neu``, ``roberta_pos``, ``date``,
        ``message``, ``year``, ``day``, ``hour``, ``minute``, ``Day_name``,
        ``Month_name``, ``Subjectivity``, ``Polarity`` and ``Analysis``.
        The RoBERTa columns are NaN for a message the model failed on.
    """
    # Raw strings keep the regex escapes (\d, \s) from being interpreted as
    # (invalid) string escapes.
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'
    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)
    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
    # NOTE(review): the split pattern accepts 2-4 digit years, but this format
    # only parses two-digit years (%y) in month/day order - confirm the export
    # locale this is meant to support.
    df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %H:%M - ')
    df.rename(columns={'message_date': 'date'}, inplace=True)

    users = []
    messages = []
    for message in df['user_message']:
        # maxsplit=1: only the first "sender: " separates the sender from the
        # body. Splitting on every ": " would cut the body at the next colon
        # and lose the rest of the message.
        entry = re.split(r'([\w\W]+?):\s', message, maxsplit=1)
        if entry[1:]:
            users.append(entry[1])
            messages.append(entry[2])
        else:
            users.append('group_notification')
            messages.append(entry[0])
    df['users'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['year'] = df['date'].dt.year
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute
    df['Day_name'] = df['date'].dt.day_name()
    df['Month_name'] = df['date'].dt.month_name()

    # Keep only real text messages. The chain produces a new frame instead of
    # mutating a filtered slice of df in place.
    is_text_message = (
        (df['users'] != 'group_notification')
        & (df['message'] != '<Media omitted>\n')
    )
    temp = df[is_text_message].replace("", np.nan).dropna().reset_index(drop=True)

    def cleanTxt(text):
        """Strip @mentions, '#' characters and newlines from text."""
        text = re.sub(r'@[A-Za-z0-9]+', '', text)
        text = re.sub(r'#', '', text)
        text = text.replace('\n', "")
        return text

    temp['message'] = temp['message'].apply(cleanTxt)
    temp['users'] = temp['users'].apply(cleanTxt)

    MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    def polarity_scores_roberts(example):
        """Return the model's softmaxed negative/neutral/positive scores."""
        # Truncate explicitly so long messages stay within the model's input
        # limit instead of raising inside the forward pass.
        encoded_text = tokenizer(
            example, return_tensors="pt", truncation=True, max_length=512
        )
        output = model(**encoded_text)
        scores = softmax(output[0][0].detach().numpy())
        return {
            'roberta_neg': scores[0],
            'roberta_neu': scores[1],
            'roberta_pos': scores[2],
        }

    # Score each message individually. Keying the scores by user name would
    # keep only one score per user and stamp it onto all of their messages.
    score_columns = [
        'neg', 'neu', 'pos', 'compound',
        'roberta_neg', 'roberta_neu', 'roberta_pos',
    ]
    records = []
    for myid, text in tqdm(zip(temp['users'], temp['message']), total=len(temp)):
        scores = dict(sia.polarity_scores(text))
        try:
            scores.update(polarity_scores_roberts(text))
        except RuntimeError:
            # Best effort: keep the VADER scores and leave RoBERTa as NaN.
            print(f"Broke for id {myid}")
        records.append(scores)
    scores_df = pd.DataFrame(records, columns=score_columns)

    # Both frames share the same 0..n-1 index, so this is a positional join.
    # 'users' goes first to preserve the established column order.
    results_df = pd.concat(
        [temp[['users']], scores_df, temp.drop(columns=['users'])],
        axis=1,
    )

    # Run TextBlob once per message and read both metrics from the result.
    sentiments = [TextBlob(text).sentiment for text in results_df['message']]
    results_df['Subjectivity'] = [s.subjectivity for s in sentiments]
    results_df['Polarity'] = [s.polarity for s in sentiments]

    def getAnalysis(score):
        """Label a polarity score as 'Negative', 'Neutral' or 'Positive'."""
        if score < 0:
            return 'Negative'
        if score == 0:
            return 'Neutral'
        return 'Positive'

    results_df['Analysis'] = results_df['Polarity'].apply(getAnalysis)
    return results_df