import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import altair as alt
from collections import OrderedDict
from urllib.parse import urlparse
from nltk.tokenize import sent_tokenize
import trafilatura
import validators
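# Assumed dependencies (inferred from the imports above): streamlit, torch,
# transformers, pandas, altair, nltk, trafilatura, validators.
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py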

# Download the punkt sentence-tokenizer data used by sent_tokenize
# (newer NLTK releases may additionally require the 'punkt_tab' resource)
import nltk
nltk.download('punkt', quiet=True)

# Load the model and tokenizer once, cached across Streamlit reruns
@st.cache_resource
def load_model_and_tokenizer(model_name='dejanseo/sentiment'):
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer()

# Sentiment labels as textual descriptions
sentiment_labels = {
    0: "very positive",
    1: "positive",
    2: "somewhat positive",
    3: "neutral",
    4: "somewhat negative",
    5: "negative",
    6: "very negative"
}

# Background colors for sentiments
background_colors = {
    "very positive": "rgba(0, 255, 0, 0.5)",
    "positive": "rgba(0, 255, 0, 0.3)",
    "somewhat positive": "rgba(0, 255, 0, 0.1)",
    "neutral": "rgba(128, 128, 128, 0.1)",
    "somewhat negative": "rgba(255, 0, 0, 0.1)",
    "negative": "rgba(255, 0, 0, 0.3)",
    "very negative": "rgba(255, 0, 0, 0.5)"
}

# Function to get text content from a URL, restricted to Medium stories/articles
def get_text_from_url(url):
    if not validators.url(url):
        return None, "Invalid URL"

    # Check the hostname rather than a raw substring so that look-alike
    # domains (e.g. notmedium.com) are rejected while user.medium.com
    # subdomains are accepted.
    hostname = urlparse(url).hostname or ""
    if hostname != "medium.com" and not hostname.endswith(".medium.com"):
        return None, "URL is not a Medium story/article."

    try:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return None, "Could not download content from URL."
        extracted = trafilatura.extract(downloaded)
        if not extracted:
            return None, "No article text could be extracted from the URL."
        return extracted, None
    except Exception as e:
        return None, f"Error extracting text: {e}"

# Classification helpers (the originals were elided from this file; sketches below)
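# Minimal sketches so the app runs end to end, assuming the model emits 7
# logits in the label order defined above; the 512-token window and the
# mean-over-chunks aggregation are assumptions, not taken from the original.

def classify_text(text):
    # Tokenize (truncating to the model's context window) and return
    # softmax probabilities over the 7 sentiment classes.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.softmax(logits, dim=-1).squeeze(0).tolist()

def classify_long_text(text):
    # Split the text into sentence-aligned chunks that fit the context
    # window, score each chunk, and average the per-chunk scores.
    chunks, current = [], ""
    for sentence in sent_tokenize(text):
        candidate = f"{current} {sentence}".strip()
        if current and len(tokenizer.tokenize(candidate)) > 510:
            chunks.append(current)
            current = sentence
        else:
            current = candidate
    if current:
        chunks.append(current)
    chunk_scores_list = [classify_text(chunk) for chunk in chunks]
    scores = [
        sum(chunk_scores[i] for chunk_scores in chunk_scores_list) / len(chunk_scores_list)
        for i in range(len(sentiment_labels))
    ]
    return scores, chunk_scores_list, chunks

def classify_sentences(text):
    # Score each sentence on its own and pair it with its top label.
    results = []
    for sentence in sent_tokenize(text):
        scores = classify_text(sentence)
        results.append((sentence, sentiment_labels[scores.index(max(scores))]))
    return results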

# Streamlit UI
st.title("Sentiment Classification Model (Medium Only)")

url = st.text_input("Enter Medium URL:")

if url:
    text, error_message = get_text_from_url(url)

    if error_message:
        st.error(error_message)  # Display error message
    elif text:
        # Document-level classification: overall scores plus per-chunk scores
        scores, chunk_scores_list, chunks = classify_long_text(text)
        scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))}

        # Ensure the exact order of labels in the graph
        sentiment_order = [
            "very positive", "positive", "somewhat positive",
            "neutral",
            "somewhat negative", "negative", "very negative"
        ]
        ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order)

        # Prepare the DataFrame and reindex
        df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

        # Use Altair to plot the bar chart
        chart = alt.Chart(df.reset_index()).mark_bar().encode(
            x=alt.X('index', sort=sentiment_order, title='Sentiment'),
            y='Likelihood'
        ).properties(
            width=600,
            height=400
        )

        st.altair_chart(chart, use_container_width=True)

        # Display each chunk and its own chart
        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
            chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))}
            ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order)
            df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)

            chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode(
                x=alt.X('index', sort=sentiment_order, title='Sentiment'),
                y='Likelihood'
            ).properties(
                width=600,
                height=400
            )

            st.write(f"Chunk {i + 1}:")
            st.write(chunk)
            st.altair_chart(chunk_chart, use_container_width=True)

        # Sentence-level classification with background colors
        st.write("Extracted Text with Sentiment Highlights:")
        sentence_scores = classify_sentences(text)
        for sentence, sentiment in sentence_scores:
            bg_color = background_colors[sentiment]
            st.markdown(f'<span style="background-color: {bg_color}">{sentence}</span>', unsafe_allow_html=True)