# Spaces: Runtime error
import pandas as pd | |
import re | |
from textblob import TextBlob | |
import numpy as np | |
import nltk | |
import nltk.data | |
from nltk.sentiment.vader import SentimentIntensityAnalyzer | |
from transformers import AutoTokenizer | |
from transformers import AutoModelForSequenceClassification | |
from scipy.special import softmax | |
from tqdm.notebook import tqdm | |
# The VADER lexicon has to be present locally *before* the analyzer is built:
# SentimentIntensityAnalyzer() loads the lexicon file in its constructor and
# raises LookupError when the resource has not been downloaded yet.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def preprocess(data):
    """Parse an exported chat log and score every user message for sentiment.

    The text is split on ``m/d/yy, HH:MM - `` timestamps, each entry is split
    into author and message body, system notifications / media placeholders /
    empty entries are dropped, and every remaining message is scored with
    VADER, the ``cardiffnlp/twitter-roberta-base-sentiment`` model and
    TextBlob.

    Args:
        data: Full text of the chat export as one string.

    Returns:
        pandas.DataFrame with one row per retained message and the columns
        ``users``, ``neg``, ``neu``, ``pos``, ``compound`` (VADER),
        ``roberta_neg``, ``roberta_neu``, ``roberta_pos``, ``date``,
        ``message``, ``year``, ``day``, ``hour``, ``minute``, ``Day_name``,
        ``Month_name``, ``Subjectivity``, ``Polarity`` and ``Analysis``
        ('Negative' / 'Neutral' / 'Positive', derived from ``Polarity``).
        If the RoBERTa model raises ``RuntimeError`` for a message, a notice
        is printed and that row's ``roberta_*`` values are NaN.
    """
    # --- 1. Split the raw text into (timestamp, "user: message") pairs -----
    stamp_pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'
    raw_entries = re.split(stamp_pattern, data)[1:]  # text before the first stamp is discarded
    stamps = re.findall(stamp_pattern, data)
    df = pd.DataFrame({'user_message': raw_entries, 'message_date': stamps})
    # NOTE(review): the format assumes month-first dates with a 2-digit year
    # and a 24h clock; exports from other locales will fail here - confirm
    # which export format callers actually supply.
    df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %H:%M - ')
    df.rename(columns={'message_date': 'date'}, inplace=True)

    # --- 2. Separate author from message body ------------------------------
    users = []
    messages = []
    for message in df['user_message']:
        # maxsplit=1: only the first "name: " is the author separator. Without
        # it, any later ": " inside the body splits again and entry[2] comes
        # back empty, silently losing the message.
        entry = re.split(r'([\w\W]+?):\s', message, maxsplit=1)
        if entry[1:]:
            users.append(entry[1])
            messages.append(entry[2])
        else:
            # No "name: " prefix -> treated as a system/group notification.
            users.append('group_notification')
            messages.append(entry[0])
    df['users'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    # --- 3. Calendar features ----------------------------------------------
    df['year'] = df['date'].dt.year
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute
    df['Day_name'] = df['date'].dt.day_name()
    df['Month_name'] = df['date'].dt.month_name()

    # --- 4. Keep only real, non-empty user messages ------------------------
    keep = (df['users'] != 'group_notification') & (df['message'] != '<Media omitted>\n')
    # replace() returns a new frame, so the filtered slice of df is never
    # modified in place (avoids pandas' SettingWithCopy problem).
    temp = df[keep].replace("", np.nan).dropna()

    def cleanTxt(text):
        """Strip @mentions, '#' characters and newlines from *text*."""
        text = re.sub(r'@[A-Za-z0-9]+', '', text)
        text = re.sub(r'#', '', text)
        text = text.replace('\n', "")
        return text

    temp['message'] = temp['message'].apply(cleanTxt)
    temp['users'] = temp['users'].apply(cleanTxt)

    # --- 5. VADER + RoBERTa scores, one record per message -----------------
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    def polarity_scores_roberts(example):
        """Return softmaxed negative/neutral/positive scores for *example*."""
        # Cap the input at 512 tokens so long messages are truncated instead
        # of overflowing the model's position embeddings.
        encoded_text = tokenizer(example, return_tensors="pt",
                                 truncation=True, max_length=512)
        output = model(**encoded_text)
        scores = softmax(output[0][0].detach().numpy())
        return {
            'roberta_neg': scores[0],
            'roberta_neu': scores[1],
            'roberta_pos': scores[2],
        }

    score_columns = ['neg', 'neu', 'pos', 'compound',
                     'roberta_neg', 'roberta_neu', 'roberta_pos']
    # Scores are collected per row. Keying them by user name (as a dict) would
    # let each user's last message overwrite all of that user's earlier ones.
    records = []
    for _, row in tqdm(temp.iterrows(), total=len(temp)):
        text = row['message']
        row_scores = dict(sia.polarity_scores(text))
        try:
            row_scores.update(polarity_scores_roberts(text))
        except RuntimeError:
            # Best effort: keep the VADER scores, leave roberta_* as NaN.
            print(f"Broke for id {row['users']}")
        records.append(row_scores)

    scores_df = pd.DataFrame(records, index=temp.index, columns=score_columns)
    results_df = pd.concat(
        [temp[['users']], scores_df, temp.drop(columns=['users'])],
        axis=1,
    ).reset_index(drop=True)

    # --- 6. TextBlob subjectivity / polarity and the final label -----------
    sentiments = [TextBlob(text).sentiment for text in results_df['message']]
    results_df['Subjectivity'] = [s.subjectivity for s in sentiments]
    results_df['Polarity'] = [s.polarity for s in sentiments]

    def getAnalysis(score):
        """Map a polarity score to 'Negative', 'Neutral' or 'Positive'."""
        if score < 0:
            return 'Negative'
        if score == 0:
            return 'Neutral'
        return 'Positive'

    results_df['Analysis'] = results_df['Polarity'].apply(getAnalysis)
    return results_df