Spaces:
Runtime error
Runtime error
import streamlit as st | |
import os.path | |
import pathlib | |
import pandas as pd | |
import numpy as np | |
import PyPDF2 | |
from PyPDF2 import PdfReader | |
from os import walk | |
import nltk | |
import glob | |
import plotly.express as px | |
from wordcloud import WordCloud | |
import plotly.io as pio | |
from plotly.subplots import make_subplots | |
import plotly.graph_objs as go | |
import pandas as pd | |
import plotly.offline as pyo | |
import io | |
from io import StringIO | |
#@st.cache_resource() | |
def get_nl(): | |
return nltk.download('punkt') | |
get_nl() | |
from nltk.tokenize import sent_tokenize | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
from transformers import pipeline | |
# if os.path.exists("report.html"): | |
# os.remove("report.html") | |
#@st.cache_resource() | |
def get_sentiment_model(): | |
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert") | |
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert") | |
return tokenizer,model | |
tokenizer_sentiment,model_sentiment = get_sentiment_model() | |
def get_emotion_model(): | |
tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base") | |
model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base") | |
return tokenizer,model | |
tokenizer_emotion,model_emotion = get_emotion_model() | |
def extract_text_from_pdf(path): | |
text='' | |
reader = PdfReader(path) | |
number_of_pages = len(reader.pages) | |
print(number_of_pages) | |
for i in range(number_of_pages): | |
page=reader.pages[i] | |
text = text + page.extract_text() | |
return text | |
# Create a button to download the HTML file | |
def download_html(): | |
with st.spinner('Downloading HTML file...'): | |
# Get the HTML content | |
with open('report.html', "r") as f: | |
html = f.read() | |
f.close() | |
# Set the file name and content type | |
file_name = "report.html" | |
mime_type = "text/html" | |
# Use st.download_button() to create a download button | |
print('download button') | |
st.download_button(label="Download Report", data=html, file_name=file_name, mime=mime_type) | |
st.stop() | |
if 'filename_key' not in st.session_state: | |
st.session_state.filename_key = '' | |
st.write(""" | |
# Sentiment Analysis Tool | |
""") | |
#uploaded_file = st.file_uploader("Choose a PDF file") | |
#uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=False, type=['pdf']) | |
uploaded_file = st.file_uploader("Choose a PDF file", accept_multiple_files=True, type=['pdf']) | |
#if uploaded_file is not None: | |
if len(uploaded_file)==0: | |
#print('none') | |
st.session_state.filename_key = '' | |
elif len(uploaded_file)>0: | |
import time | |
# Wait for 5 seconds | |
time.sleep(5) | |
pdf_reader = PyPDF2.PdfReader(uploaded_file[0]) | |
num_pages = len(pdf_reader.pages) | |
file_name = uploaded_file[0].name | |
# st.write(st.session_state.filename_key) | |
# print(file_name) | |
# st.write("Filename:", file_name) | |
if num_pages > 20: | |
st.error("Pages in PDF file should be less than 20.") | |
# Check that only one file was uploaded | |
#elif isinstance(uploaded_file, list): | |
elif len(uploaded_file) > 1: | |
st.error("Please upload only one PDF file at a time.") | |
elif st.session_state.filename_key == file_name: | |
st.write("Report downloaded successfully") | |
else: | |
#uploaded_file = uploaded_file[0] | |
# Check that the file is a PDF | |
if uploaded_file[0].type != 'application/pdf': | |
st.error("Please upload a PDF file.") | |
else: | |
############################ 1. Extract text from PDF ############################ | |
text='' | |
# return text from pdf | |
pdf_reader = PyPDF2.PdfReader(uploaded_file[0]) | |
# Get the number of pages in the PDF file | |
num_pages = len(pdf_reader.pages) | |
# Display the number of pages in the PDF file | |
st.write(f"Number of pages in PDF file: {num_pages}") | |
for i in range(num_pages): | |
page=pdf_reader.pages[i] | |
text = text + page.extract_text() | |
############################ 2. Sentiment Analysis ############################ | |
text = text.replace("\n", " " ) | |
sentences = sent_tokenize(text) | |
title = sentences[0] | |
long_sentence=[] | |
small_sentence=[] | |
useful_sentence=[] | |
for i in sentences: | |
if len(i) > 510: | |
long_sentence.append(i) | |
elif len(i) < 50: | |
small_sentence.append(i) | |
else: | |
useful_sentence.append(i) | |
del sentences | |
with st.spinner('Performing Sentiment Analysis...'): | |
tokenizer = tokenizer_sentiment | |
model = model_sentiment | |
pipe = pipeline(model="ProsusAI/finbert") | |
classifier = pipeline(model="ProsusAI/finbert") | |
output = classifier(useful_sentence) | |
with st.spinner('Performing Emotion Analysis...'): | |
tokenizer = tokenizer_emotion | |
model = model_emotion | |
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=1) | |
output_emotion = classifier(useful_sentence) | |
df = pd.DataFrame.from_dict(output) | |
df['Sentence']= pd.Series(useful_sentence) | |
############################ 3. Processing ############################ | |
labels = ['neutral', 'positive', 'negative'] | |
values = df.label.value_counts().to_list() | |
# removing words | |
words_to_remove = ["s", "quarter", "thank", "million", "Thank", "quetion", 'wa', 'rate', 'firt', | |
"customer", "business", "last year", "year", 'lat', 'well', 'jut', 'thi', 'cutomer', | |
"will", "think", "higher", "question", "going"] | |
for word in words_to_remove: | |
text = text.replace(word, "") | |
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(text) | |
image = wordcloud.to_image() | |
pos_df = df[df['label']=='positive'] | |
pos_df = pos_df[['score', 'Sentence']] | |
pos_df = pos_df.sort_values('score', ascending=False) | |
pos_df_mean = pos_df.score.mean() | |
pos_df['score'] = pos_df['score'].round(4) | |
pos_df.rename(columns = {'Sentence':'Positive Sentences'}, inplace = True) | |
neg_df = df[df['label']=='negative'] | |
neg_df = neg_df[['score', 'Sentence']] | |
neg_df = neg_df.sort_values('score', ascending=False) | |
neg_df_mean = neg_df.score.mean() | |
neg_df['score'] = neg_df['score'].round(4) | |
neg_df.rename(columns = {'Sentence':'Negative Sentences'}, inplace = True) | |
neu_df = df[df['label']=='neutral'] | |
neu_df = neu_df[['score', 'Sentence']] | |
neu_df = neu_df.sort_values('score', ascending=False) | |
#neu_df_mean = neu_df.score.mean() | |
neu_df['score'] = neu_df['score'].round(4) | |
neu_df.rename(columns = {'Sentence':'Neutral Sentences'}, inplace = True) | |
df_temp = neg_df | |
df_temp = df_temp['score'] * -1 | |
df_temp = pd.concat([df_temp, pos_df]) | |
############################ 4. Plotting ############################ | |
fig = make_subplots( | |
rows=26, cols=6, | |
specs=[ [None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[{"type": "pie", "rowspan": 6, "colspan": 2}, None, {"type": "indicator", "rowspan": 6, "colspan": 2}, None, {"type": "indicator", "rowspan": 6, "colspan": 2}, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[{"type": "image", "rowspan": 15, "colspan": 3}, None, None, {"type": "table", "rowspan": 5, "colspan": 3}, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, {"type": "table", "rowspan": 5, "colspan": 3}, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, {"type": "table", "rowspan": 5, "colspan": 3}, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
[None, None, None, None, None, None], | |
], | |
) | |
colors = px.colors.diverging.Portland#RdBu | |
fig.add_trace(go.Pie(labels=labels, values=values, hole = 0.5, | |
title = 'Count by label', | |
marker=dict(colors=colors, | |
line=dict(width=2, color='white'))), | |
row=6, col=1) | |
fig.add_trace(go.Indicator( | |
mode = "number", | |
value = len(df.label.values.tolist()), | |
title = {"text": "Count of Sentence"}), row=6, col=3) | |
fig.add_trace(go.Indicator( | |
mode = "gauge+number", | |
value = df_temp.score.mean(), | |
domain = {'x': [0, 1], 'y': [0, 1]}, | |
title = {'text': "Average of Score", 'font': {'size': 16}}, | |
gauge = { | |
'axis': {'range': [-1, 1], 'tickwidth': 1, 'tickcolor': "darkblue"}, | |
'bar': {'color': "darkblue"}, | |
'steps': [ | |
{'range': [-0.29, 0.29], 'color': 'white'}, | |
{'range': [0.3, 1], 'color': 'green'}, | |
{'range': [-1, -0.3], 'color': 'red'} | |
], | |
'threshold': { | |
'line': {'color': "black", 'width': 4}, | |
'thickness': 0.75, | |
'value': abs((pos_df_mean - neg_df_mean)) | |
} | |
} | |
), row=6, col=5) | |
if df_temp.score.mean() < -0.29: | |
fig.update_traces(title_text="Cummulative Sentiment Negative", selector=dict(type='indicator'), row=6, col=5) | |
elif df_temp.score.mean() < 0.29: | |
fig.update_traces(title_text="Cummulative Sentiment Neutral", selector=dict(type='indicator'), row=6, col=5) | |
else: | |
fig.update_traces(title_text="Cummulative Sentiment Positive", selector=dict(type='indicator'), row=6, col=5) | |
fig.add_trace(go.Image(z=image), row=12, col=1) | |
fig.update_xaxes(visible=False, row=12, col=1) | |
fig.update_yaxes(visible=False, row=12, col=1) | |
table_trace1 = go.Table( | |
header=dict(values=list(pos_df.columns), fill_color='lightgray', align='left'), | |
cells=dict(values=[pos_df[name] for name in pos_df.columns], fill_color='white', align='left'), | |
columnwidth=[1, 4] | |
) | |
fig.add_trace(table_trace1, row=12, col=4) | |
table_trace2 = go.Table( | |
header=dict(values=list(neg_df.columns), fill_color='lightgray', align='left'), | |
cells=dict(values=[neg_df[name] for name in neg_df.columns], fill_color='white', align='left'), | |
columnwidth=[1, 4] | |
) | |
fig.add_trace(table_trace2, row=17, col=4) | |
table_trace2 = go.Table( | |
header=dict(values=list(neu_df.columns), fill_color='lightgray', align='left'), | |
cells=dict(values=[neu_df[name] for name in neu_df.columns], fill_color='white', align='left'), | |
columnwidth=[1, 4] | |
) | |
fig.add_trace(table_trace2, row=22, col=4) | |
import textwrap | |
wrapped_title = "\n".join(textwrap.wrap(title, width=50)) | |
# Add HTML tags to force line breaks in the title text | |
wrapped_title = "<br>".join(wrapped_title.split("\n")) | |
fig.update_layout(height=1500, showlegend=False, title={'text': f"<b>{wrapped_title} - Sentiment Analysis Report</b>", 'x': 0.5, 'xanchor': 'center','font': {'size': 32}}) | |
#pyo.plot(fig, filename='report.html') | |
############################## 5. Download Report ############################## | |
buffer = io.StringIO() | |
fig.write_html(buffer, include_plotlyjs='cdn') | |
html_bytes = buffer.getvalue().encode() | |
st.download_button( | |
label='Download Report', | |
data=html_bytes, | |
file_name='report.html', | |
mime='text/html' | |
) | |
st.session_state.filename_key = file_name |