# NOTE(review): the six lines that originally preceded the imports were Hugging
# Face Spaces page chrome captured by the scraper — app status ("Spaces:
# Sleeping"), "File size: 4,493 Bytes", commit hashes "defe71f 548fac3 defe71f
# 8c1e479", and the line-number gutter "1 2 3 ... 122" — not Python source.
# Preserved here as a comment so the file parses.
import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import requests
import pandas as pd
import altair as alt
from collections import OrderedDict
from nltk.tokenize import sent_tokenize
import trafilatura
import validators
# Load the punkt tokenizer from nltk
import nltk
nltk.download('punkt')
# Load model and tokenizer
model_name = 'dejanseo/sentiment'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sentiment labels as textual descriptions
sentiment_labels = {
0: "very positive",
1: "positive",
2: "somewhat positive",
3: "neutral",
4: "somewhat negative",
5: "negative",
6: "very negative"
}
# Background colors for sentiments
background_colors = {
"very positive": "rgba(0, 255, 0, 0.5)",
"positive": "rgba(0, 255, 0, 0.3)",
"somewhat positive": "rgba(0, 255, 0, 0.1)",
"neutral": "rgba(128, 128, 128, 0.1)",
"somewhat negative": "rgba(255, 0, 0, 0.1)",
"negative": "rgba(255, 0, 0, 0.3)",
"very negative": "rgba(255, 0, 0, 0.5)"
}
# Function to get text content from a URL, restricted to Medium stories/articles
def get_text_from_url(url):
if not validators.url(url):
return None, "Invalid URL"
if "medium.com/" not in url: # Check if it's a Medium URL
return None, "URL is not a Medium story/article."
try:
downloaded = trafilatura.fetch_url(url)
if downloaded:
return trafilatura.extract(downloaded), None
else:
return None, "Could not download content from URL."
except Exception as e:
return None, f"Error extracting text: {e}"
# ... (rest of the functions: classify_text, classify_long_text, classify_sentences remain the same)
# Streamlit UI
st.title("Sentiment Classification Model (Medium Only)")
url = st.text_input("Enter Medium URL:")
if url:
text, error_message = get_text_from_url(url)
if error_message:
st.error(error_message) # Display error message
elif text:
# ... (rest of the analysis and display code remains the same)
scores, chunk_scores_list, chunks = classify_long_text(text)
scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))}
# Ensure the exact order of labels in the graph
sentiment_order = [
"very positive", "positive", "somewhat positive",
"neutral",
"somewhat negative", "negative", "very negative"
]
ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order)
# Prepare the DataFrame and reindex
df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)
# Use Altair to plot the bar chart
chart = alt.Chart(df.reset_index()).mark_bar().encode(
x=alt.X('index', sort=sentiment_order, title='Sentiment'),
y='Likelihood'
).properties(
width=600,
height=400
)
st.altair_chart(chart, use_container_width=True)
# Display each chunk and its own chart
for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))}
ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order)
df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)
chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode(
x=alt.X('index', sort=sentiment_order, title='Sentiment'),
y='Likelihood'
).properties(
width=600,
height=400
)
st.write(f"Chunk {i + 1}:")
st.write(chunk)
st.altair_chart(chunk_chart, use_container_width=True)
# Sentence-level classification with background colors
st.write("Extracted Text with Sentiment Highlights:")
sentence_scores = classify_sentences(text)
for sentence, sentiment in sentence_scores:
bg_color = background_colors[sentiment]
st.markdown(f'<span style="background-color: {bg_color}">{sentence}</span>', unsafe_allow_html=True) |