marianeft committed on
Commit defe71f · 1 Parent(s): 11db685

Initial commit

Files changed (3)
  1. README.md +1 -1
  2. app.py +125 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -8,7 +8,7 @@ sdk_version: 1.42.2
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Analyze sentiment in Medium
+short_description: Analyze sentiment in Medium articles
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,125 @@
+import streamlit as st
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import requests
+import pandas as pd
+import altair as alt
+from collections import OrderedDict
+from nltk.tokenize import sent_tokenize
+import trafilatura
+import validators
+
+# Load the punkt tokenizer from nltk
+import nltk
+nltk.download('punkt')
+
+# Load model and tokenizer
+model_name = 'dejanseo/sentiment'
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+# Sentiment labels as textual descriptions
+sentiment_labels = {
+    0: "very positive",
+    1: "positive",
+    2: "somewhat positive",
+    3: "neutral",
+    4: "somewhat negative",
+    5: "negative",
+    6: "very negative"
+}
+
+# Background colors for sentiments
+background_colors = {
+    "very positive": "rgba(0, 255, 0, 0.5)",
+    "positive": "rgba(0, 255, 0, 0.3)",
+    "somewhat positive": "rgba(0, 255, 0, 0.1)",
+    "neutral": "rgba(128, 128, 128, 0.1)",
+    "somewhat negative": "rgba(255, 0, 0, 0.1)",
+    "negative": "rgba(255, 0, 0, 0.3)",
+    "very negative": "rgba(255, 0, 0, 0.5)"
+}
+
+# Function to get text content from a URL, restricted to Medium stories/articles
+def get_text_from_url(url):
+    if not validators.url(url):
+        return None, "Invalid URL"
+
+    if "medium.com/" not in url:  # Check if it's a Medium URL
+        return None, "URL is not a Medium story/article."
+
+    try:
+        downloaded = trafilatura.fetch_url(url)
+        if downloaded:
+            return trafilatura.extract(downloaded), None
+        else:
+            return None, "Could not download content from URL."
+    except Exception as e:
+        return None, f"Error extracting text: {e}"
+
+# ... (rest of the functions: classify_text, classify_long_text, classify_sentences remain the same)
+
+# Streamlit UI
+st.title("Sentiment Classification Model by DEJAN (Medium Only)")
+
+url = st.text_input("Enter Medium URL:")
+
+if url:
+    text, error_message = get_text_from_url(url)
+
+    if error_message:
+        st.error(error_message)  # Display error message
+    elif text:
+        # ... (rest of the analysis and display code remains the same)
+        scores, chunk_scores_list, chunks = classify_long_text(text)
+        scores_dict = {sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))}
+
+        # Ensure the exact order of labels in the graph
+        sentiment_order = [
+            "very positive", "positive", "somewhat positive",
+            "neutral",
+            "somewhat negative", "negative", "very negative"
+        ]
+        ordered_scores_dict = OrderedDict((label, scores_dict[label]) for label in sentiment_order)
+
+        # Prepare the DataFrame and reindex
+        df = pd.DataFrame.from_dict(ordered_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)
+
+        # Use Altair to plot the bar chart
+        chart = alt.Chart(df.reset_index()).mark_bar().encode(
+            x=alt.X('index', sort=sentiment_order, title='Sentiment'),
+            y='Likelihood'
+        ).properties(
+            width=600,
+            height=400
+        )
+
+        st.altair_chart(chart, use_container_width=True)
+
+        # Display each chunk and its own chart
+        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
+            chunk_scores_dict = {sentiment_labels[j]: chunk_scores[j] for j in range(len(sentiment_labels))}
+            ordered_chunk_scores_dict = OrderedDict((label, chunk_scores_dict[label]) for label in sentiment_order)
+            df_chunk = pd.DataFrame.from_dict(ordered_chunk_scores_dict, orient='index', columns=['Likelihood']).reindex(sentiment_order)
+
+            chunk_chart = alt.Chart(df_chunk.reset_index()).mark_bar().encode(
+                x=alt.X('index', sort=sentiment_order, title='Sentiment'),
+                y='Likelihood'
+            ).properties(
+                width=600,
+                height=400
+            )
+
+            st.write(f"Chunk {i + 1}:")
+            st.write(chunk)
+            st.altair_chart(chunk_chart, use_container_width=True)
+
+        # Sentence-level classification with background colors
+        st.write("Extracted Text with Sentiment Highlights:")
+        sentence_scores = classify_sentences(text)
+        for sentence, sentiment in sentence_scores:
+            bg_color = background_colors[sentiment]
+            st.markdown(f'<span style="background-color: {bg_color}">{sentence}</span>', unsafe_allow_html=True)
+
+
+# No 'else' needed here, as the error message is already handled above.
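The listing above omits the three classification helpers that the UI calls: classify_text, classify_long_text, and classify_sentences. A minimal sketch of what they could look like follows, reusing the model, tokenizer, sentiment_labels, and sent_tokenize already defined in app.py; the 512-token chunking, the averaging of per-chunk scores, and the argmax sentence labels are assumptions for illustration, not code from this commit.

# Hypothetical reconstruction of the elided helpers (not part of the commit).
def classify_text(text):
    # One forward pass; return the seven class probabilities as a plain list.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits.softmax(dim=-1).squeeze(0).tolist()

def classify_long_text(text, chunk_size=512):
    # Split the article into token-sized chunks, score each chunk,
    # and average the per-chunk scores into an overall distribution.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [tokenizer.decode(token_ids[i:i + chunk_size])
              for i in range(0, len(token_ids), chunk_size)]
    chunk_scores_list = [classify_text(chunk) for chunk in chunks]
    scores = [sum(col) / len(chunk_scores_list) for col in zip(*chunk_scores_list)]
    return scores, chunk_scores_list, chunks

def classify_sentences(text):
    # Label each sentence with its most likely sentiment for the highlight view.
    results = []
    for sentence in sent_tokenize(text):
        probs = classify_text(sentence)
        results.append((sentence, sentiment_labels[probs.index(max(probs))]))
    return results

With helpers along these lines the script runs end to end; note that a local run also needs the validators package, which app.py imports but the requirements.txt below does not list.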
requirements.txt ADDED
@@ -0,0 +1,8 @@
+streamlit
+torch
+transformers
+requests
+trafilatura
+pandas
+altair
+nltk