Update app.py
app.py
CHANGED
@@ -15,7 +15,6 @@ import spacy
 
 st.set_page_config(page_title="Advanced Political Speech Analysis", page_icon="🗣️", layout="wide")
 
-# Advanced NLP Libraries
 from transformers import (
     AutoTokenizer,
     AutoModelForSequenceClassification,
@@ -29,13 +28,11 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from textstat import flesch_reading_ease, flesch_kincaid_grade
 
-# Download necessary NLTK resources
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 nltk.download('stopwords', quiet=True)
 nltk.download('punkt_tab', quiet=True)
 
-# Load spaCy model (requires separate installation)
 try:
     nlp = spacy.load('en_core_web_lg')
 except:
@@ -43,7 +40,6 @@ except:
              "pip install spacy\n"
              "python -m spacy download en_core_web_lg")
 
-# Constants and Configurations
 MORAL_FOUNDATIONS = {
     'care': 'Care/Harm',
     'fairness': 'Fairness/Cheating',
@@ -62,21 +58,17 @@ RHETORICAL_DEVICES = {
 
 class SpeechAnalyzer:
     def __init__(self):
-        # Load MoralFoundations model
         self.moral_model_path = "MMADS/MoralFoundationsClassifier"
         self.moral_tokenizer = RobertaTokenizer.from_pretrained(self.moral_model_path)
         self.moral_model = RobertaForSequenceClassification.from_pretrained(self.moral_model_path)
 
-        # Define label names directly
         self.label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']
 
-        # Other pipelines remain the same
         self.sentiment_pipeline = pipeline("sentiment-analysis")
         self.ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
         self.ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
         self.ner_pipeline = pipeline("ner", model=self.ner_model, tokenizer=self.ner_tokenizer)
 
-        # Add emotion classifier
         self.emotion_classifier = pipeline("text-classification",
                                            model="j-hartmann/emotion-english-distilroberta-base")
 
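Note that the five foundation labels above are hard-coded rather than read from the model config. As an illustration only (the full `analyze_moral_foundations` body is not shown here), scoring one segment with this checkpoint would look roughly like the sketch below; the softmax and `no_grad` details are assumptions.

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Illustrative sketch, not the method body: same checkpoint and label order as __init__ above.
tokenizer = RobertaTokenizer.from_pretrained("MMADS/MoralFoundationsClassifier")
model = RobertaForSequenceClassification.from_pretrained("MMADS/MoralFoundationsClassifier")
label_names = ['care', 'fairness', 'loyalty', 'authority', 'sanctity']

def score_segment(segment):
    # Truncate long segments to the 512-token limit before scoring.
    inputs = tokenizer(segment, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=-1)
    # One probability per foundation, indexed in label_names order (assumed).
    return {name: probabilities[0][idx].item() for idx, name in enumerate(label_names)}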
@@ -91,7 +83,6 @@ class SpeechAnalyzer:
         for word in words:
             if current_length + len(word.split()) > max_length:
                 segments.append(' '.join(current_segment))
-                # Use the overlap parameter from the method arguments
                 current_segment = current_segment[-overlap:] + [word]
                 current_length = len(' '.join(current_segment).split())
             else:
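For context, the segmentation logic above carries the last `overlap` words of a full segment into the next one so that adjacent segments share context. A self-contained sketch of that behaviour (the function name and defaults are hypothetical; the real method's signature is not visible in this hunk):

def split_with_overlap(text, max_length=512, overlap=50):
    # Greedy word-based segmentation; the tail of each finished segment seeds the next.
    words = text.split()
    segments, current_segment, current_length = [], [], 0
    for word in words:
        if current_length + len(word.split()) > max_length:
            segments.append(' '.join(current_segment))
            current_segment = current_segment[-overlap:] + [word]
            current_length = len(' '.join(current_segment).split())
        else:
            current_segment.append(word)
            current_length += 1
    if current_segment:
        segments.append(' '.join(current_segment))
    return segments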
@@ -125,7 +116,6 @@ class SpeechAnalyzer:
                 if foundation in foundation_scores:
                     foundation_scores[foundation].append(probabilities[0][idx].item())
 
-        # Average the scores across segments
         aggregated_scores = {
             foundation: np.mean(scores) for foundation, scores in foundation_scores.items()
         }
@@ -139,7 +129,6 @@ class SpeechAnalyzer:
         basic_emotions = []
 
         for segment in segments:
-            # Get sentiment scores with truncation
            sentiment_result = self.sentiment_pipeline(segment, truncation=True, max_length=512)
            score = sentiment_result[0]['score']
            if sentiment_result[0]['label'] == 'POSITIVE':
@@ -148,7 +137,6 @@ class SpeechAnalyzer:
                 score = 0.5 - (score * 0.5)
             sentiment_scores.append(score)
 
-            # Get emotion classification with truncation
             emotion_result = self.emotion_classifier(segment, truncation=True, max_length=512)
             emotion = emotion_result[0]['label']
             basic_emotions.append(emotion)
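The arithmetic above folds the sentiment pipeline's label/confidence pair into one value on a 0-1 scale, with 0.5 as neutral. A tiny standalone illustration (the POSITIVE branch is assumed to mirror the NEGATIVE branch shown here, since it falls outside the hunk):

def to_unit_sentiment(label, confidence):
    # Hypothetical helper: POSITIVE confidences map into (0.5, 1.0], NEGATIVE into [0.0, 0.5).
    if label == 'POSITIVE':
        return 0.5 + (confidence * 0.5)  # assumed symmetric counterpart
    return 0.5 - (confidence * 0.5)

# e.g. ('POSITIVE', 0.98) -> 0.99, ('NEGATIVE', 0.98) -> 0.01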
@@ -167,7 +155,6 @@ class SpeechAnalyzer:
         tfidf_matrix = vectorizer.fit_transform([text])
         feature_names = vectorizer.get_feature_names_out()
 
-        # Get top phrases by TF-IDF score
         sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
         top_phrases = [feature_names[i] for i in sorted_idx[:top_n]]
 
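For reference, the ranking pattern above scores every extracted feature by its TF-IDF weight in the single uploaded document and keeps the `top_n` highest. A minimal sketch; the vectorizer settings (n-gram range, stop words) are assumptions, since the vectorizer is constructed outside this hunk:

from sklearn.feature_extraction.text import TfidfVectorizer

def top_tfidf_phrases(text, top_n=10):
    # Assumed configuration: unigrams and bigrams with English stop words removed.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    # Rank features by their weight in this one document.
    sorted_idx = tfidf_matrix.toarray()[0].argsort()[::-1]
    return [feature_names[i] for i in sorted_idx[:top_n]]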
@@ -191,29 +178,23 @@ class SpeechAnalyzer:
 
     def create_semantic_network(self, text, top_n=20, window_size=10, chunk_size=10000):
         """Create semantic network graph with weighted edges"""
-        # Process text in chunks
         chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
-        # Initialize collections for aggregating results
         all_nouns = []
         noun_freq = nltk.FreqDist()
 
-        # Process each chunk
         for chunk in chunks:
             doc = nlp(chunk)
             chunk_nouns = [token.text.lower() for token in doc if token.pos_ == 'NOUN']
             all_nouns.extend(chunk_nouns)
             noun_freq.update(chunk_nouns)
 
-        # Get top nouns across all chunks
         top_nouns = [noun for noun, freq in noun_freq.most_common(top_n)]
 
-        # Create graph and co-occurrence matrix
         G = nx.Graph()
         cooc_matrix = np.zeros((len(top_nouns), len(top_nouns)))
         noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}
 
-        # Process co-occurrences in chunks
         for chunk in chunks:
             doc = nlp(chunk)
             words = [token.text.lower() for token in doc]
@@ -228,11 +209,9 @@ class SpeechAnalyzer:
                         cooc_matrix[idx1][idx2] += 1
                         cooc_matrix[idx2][idx1] += 1
 
-        # Build network
         for noun in top_nouns:
             G.add_node(noun, size=noun_freq[noun])
 
-        # Add weighted edges
         max_weight = np.max(cooc_matrix)
         if max_weight > 0:  # Prevent division by zero
             for i in range(len(top_nouns)):
@@ -243,7 +222,6 @@ class SpeechAnalyzer:
                                    weight=weight,
                                    width=3 * (weight/max_weight))
 
-        # Calculate and store layout
         pos = nx.spring_layout(G, k=1, iterations=50)
         for node in G.nodes():
             G.nodes[node]['pos'] = pos[node]
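To make the co-occurrence step concrete: two tracked nouns get their pair count incremented whenever they appear within `window_size` tokens of each other, and that symmetric matrix later drives edge width and colour in the network plot. A simplified sketch over a plain token list (the real method iterates chunk by chunk over spaCy docs):

import numpy as np

def cooccurrence_counts(words, top_nouns, window_size=10):
    # Symmetric counts of how often two tracked nouns fall inside the same window.
    noun_to_idx = {noun: idx for idx, noun in enumerate(top_nouns)}
    matrix = np.zeros((len(top_nouns), len(top_nouns)))
    for i, word in enumerate(words):
        if word not in noun_to_idx:
            continue
        for other in words[i + 1:i + window_size]:
            if other in noun_to_idx and other != word:
                idx1, idx2 = noun_to_idx[word], noun_to_idx[other]
                matrix[idx1][idx2] += 1
                matrix[idx2][idx1] += 1
    return matrix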
@@ -269,14 +247,11 @@ def process_all_analyses(text, _analyzer):
 def main():
     st.title("🗣️ Political Text Analysis Toolkit")
 
-    # Initialize analyzer
     analyzer = SpeechAnalyzer()
 
-    # File upload
     uploaded_file = st.file_uploader("Upload Political Speech", type=['txt', 'docx', 'pdf'])
 
     if uploaded_file is not None:
-        # Read file (similar to previous implementation)
         if uploaded_file.name.endswith('.txt'):
             text = uploaded_file.getvalue().decode('utf-8')
         elif uploaded_file.name.endswith('.docx'):
@@ -288,7 +263,6 @@ def main():
             pdf_reader = PyPDF2.PdfReader(uploaded_file)
             text = ' '.join([page.extract_text() for page in pdf_reader.pages])
 
-        # Create tabs for different analyses
         progress_bar = st.progress(0)
         status_text = st.empty()
         tab1, tab2, tab3, tab4, tab5 = st.tabs([
@@ -305,7 +279,6 @@ def main():
             st.subheader("Moral Foundations Analysis")
             moral_scores = analyzer.analyze_moral_foundations(text)
 
-            # Plotly bar chart
             moral_df = pd.DataFrame.from_dict(moral_scores, orient='index', columns=['Score'])
             moral_df.index.name = 'Moral Foundation'
             moral_df = moral_df.reset_index()
@@ -319,7 +292,6 @@ def main():
             )
             st.plotly_chart(fig)
 
-            # Detailed insights
             for foundation, score in moral_scores.items():
                 st.write(f"**{MORAL_FOUNDATIONS[foundation]}**: {score:.2%}")
 
@@ -328,13 +300,10 @@ def main():
             progress_bar.progress(40)
             st.subheader("Speech Trajectory Analysis")
 
-            # Get cached data
             segments, segment_labels, sentiment_scores, basic_emotions, moral_trajectories = process_all_analyses(text, analyzer)
 
-            # Create unified figure
             unified_fig = go.Figure()
 
-            # Add traces for each analysis type
             viz_options = st.multiselect(
                 "Select analyses to display:",
                 ["Sentiment Flow", "Moral Foundations Flow", "Basic Emotions Flow"],
@@ -371,7 +340,6 @@ def main():
                 'Emotion': basic_emotions
             })
 
-            # Create color mapping for emotions
             emotion_colors = {
                 'joy': '#FFD700',      # Gold
                 'sadness': '#4169E1',  # Royal Blue
@@ -383,11 +351,11 @@ def main():
 
             unified_fig.add_trace(go.Bar(
                 x=segment_labels,
-                y=[1] * len(basic_emotions),
-                name=f'Emotions Found: {", ".join(sorted(set(basic_emotions)))}',
+                y=[1] * len(basic_emotions),
+                name=f'Emotions Found: {", ".join(sorted(set(basic_emotions)))}',
                 marker=dict(
                     color=[emotion_colors.get(e.lower(), '#808080') for e in basic_emotions],
-                    line=dict(width=1, color='#000000')
+                    line=dict(width=1, color='#000000')
                 ),
                 opacity=0.8,
                 hovertemplate="Segment %{x}<br>Emotion: %{text}<extra></extra>",
@@ -403,7 +371,6 @@ def main():
             st.subheader("Linguistic Analysis")
             readability = analyzer.calculate_readability(text)
 
-            # Readability metrics with context
             col1, col2 = st.columns(2)
             with col1:
                 score = readability['Flesch Reading Ease']
@@ -424,11 +391,9 @@ def main():
                     delta_color="normal"
                 )
 
-            # Enhanced key phrases display
             st.subheader("Key Topics and Themes")
             key_phrases = analyzer.extract_key_phrases(text)
 
-            # Create columns for better phrase organization
             cols = st.columns(3)
             for idx, phrase in enumerate(key_phrases):
                 col_idx = idx % 3
@@ -452,20 +417,16 @@ def main():
 
             network_fig = go.Figure()
 
-            # Add edges with enhanced visual encoding
             for edge in semantic_graph.edges():
                 x0, y0 = semantic_graph.nodes[edge[0]]['pos']
                 x1, y1 = semantic_graph.nodes[edge[1]]['pos']
                 weight = semantic_graph.edges[edge]['weight']
                 max_weight = max(d['weight'] for _, _, d in semantic_graph.edges(data=True))
 
-                # Normalize weight for visual encoding
                 normalized_weight = weight / max_weight
 
-                # Enhanced width scaling (more pronounced differences)
                 width = 2 + (normalized_weight * 8)
 
-                # Color gradient from light to dark based on weight
                 color = f'rgba(31, 119, 180, {0.3 + normalized_weight * 0.7})'
 
                 network_fig.add_trace(go.Scatter(
@@ -480,7 +441,6 @@ def main():
                     hovertext=f'Relationship strength: {weight:.2f}'
                 ))
 
-            # Enhanced nodes with better visibility
             for node in semantic_graph.nodes():
                 x, y = semantic_graph.nodes[node]['pos']
                 size = semantic_graph.nodes[node]['size']
@@ -521,10 +481,8 @@ def main():
             st.subheader("Named Entity Recognition")
             named_entities = analyzer.detect_named_entities(text)
 
-            # Process entities
             entities_df = pd.DataFrame(named_entities)
 
-            # Map entity types to friendly names
             type_mapping = {
                 'B-PER': 'Person',
                 'I-PER': 'Person',
@@ -536,20 +494,17 @@ def main():
                 'I-MISC': 'Other'
             }
 
-            # Clean and transform the data
             display_df = pd.DataFrame({
                 'Term': entities_df['word'],
                 'Category': entities_df['entity'].map(type_mapping),
                 'Confidence': entities_df['score'].apply(lambda x: f"{x*100:.1f}%")
             })
 
-            # Group similar entities
             grouped_df = display_df.groupby('Category').agg({
                 'Term': lambda x: ', '.join(set(x)),
                 'Confidence': 'count'
             }).reset_index()
 
-            # Display results in an organized way
             for category in grouped_df['Category'].unique():
                 category_data = grouped_df[grouped_df['Category'] == category]
                 st.write(f"### {category}")
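Finally, the grouping above collapses the raw NER rows into one row per friendly category, joining the unique terms and counting mentions. A toy illustration of the transformation's shape (the input rows and the ORG/LOC mappings are invented for this sketch; only the PER and MISC mappings are visible in the hunk):

import pandas as pd

# Hypothetical pipeline output, purely to show the shape of the transformation.
entities_df = pd.DataFrame([
    {'word': 'Lincoln', 'entity': 'B-PER', 'score': 0.998},
    {'word': 'Congress', 'entity': 'B-ORG', 'score': 0.991},
    {'word': 'Gettysburg', 'entity': 'B-LOC', 'score': 0.987},
])
type_mapping = {'B-PER': 'Person', 'B-ORG': 'Organization', 'B-LOC': 'Location'}  # assumed

display_df = pd.DataFrame({
    'Term': entities_df['word'],
    'Category': entities_df['entity'].map(type_mapping),
    'Confidence': entities_df['score'].apply(lambda x: f"{x*100:.1f}%"),
})
# One row per category: unique terms joined, plus a simple mention count.
grouped_df = display_df.groupby('Category').agg({
    'Term': lambda x: ', '.join(set(x)),
    'Confidence': 'count',
}).reset_index()
print(grouped_df)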
|