Create analyzer.py

analyzer.py CHANGED (+22 -273)
@@ -31,7 +31,6 @@ class MessageAnalyzer:
             "false equivalence": 0.317,
             "future faking": 0.385
         }
-
     def analyze_message(self, text):
         """Analyze a single message for abuse patterns"""
         from utils import (
@@ -186,7 +185,6 @@ class MessageAnalyzer:
             'boundary_assessment': {'assessment': 'error', 'confidence': 0.0},
             'risk_level': "Unknown"
         }
-
    def identify_primary_abuser(self, results_df):
        """Identify the primary abuser based on comprehensive abuse metrics with pattern severity weighting"""
        logger.info("Identifying primary abuser...")
@@ -297,9 +295,8 @@ class MessageAnalyzer:

         logger.info(f"Primary abuser identified: {primary_abuser}")
         return primary_abuser, sender_abuse_metrics
-
    def analyze_chat_history(self, df):
-        """Analyze entire chat history
+        """Analyze entire chat history"""
         from utils import detect_escalation_patterns, generate_safety_plan, generate_professional_recommendations

         logger.info(f"Analyzing chat history with {len(df)} messages")
@@ -330,10 +327,7 @@ class MessageAnalyzer:
             results_df.at[i, 'boundary_health'] = analysis['boundary_assessment']['assessment']
             results_df.at[i, 'risk_level'] = analysis['risk_level']

-        #
-        primary_abuser, sender_abuse_metrics = self.identify_primary_abuser(results_df)
-
-        # Calculate traditional sender statistics for backward compatibility
+        # Calculate sender statistics
         sender_stats = {}
         for sender in results_df['sender'].unique():
             sender_df = results_df[results_df['sender'] == sender]
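The sender-statistics loop kept by this hunk slices `results_df` once per unique sender. For reference, a minimal standalone sketch of the same aggregation via `groupby` (hypothetical frame and values, not code from this commit):

```python
import pandas as pd

# Hypothetical stand-in for results_df with only the columns the loop reads.
results_df = pd.DataFrame({
    'sender': ['A', 'B', 'A', 'B'],
    'abuse_score': [12.0, 61.5, 30.0, 78.0],
})

# One grouped pass instead of slicing the frame once per sender.
per_sender = results_df.groupby('sender')['abuse_score'].agg(['count', 'mean', 'max'])
print(per_sender)
```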
@@ -382,107 +376,47 @@ class MessageAnalyzer:
                 'risk_levels': risk_counts
             }

-        #
-
-            logger.info("No primary abuser identified - providing general analysis")
-
-            # Detect escalation patterns
-            escalation_data = detect_escalation_patterns(results_df)
-
-            # Determine overall risk level
-            if results_df['risk_level'].isin(['Critical']).any():
-                overall_risk = "Critical"
-            elif results_df['risk_level'].isin(['High']).any():
-                overall_risk = "High"
-            elif results_df['risk_level'].isin(['Moderate']).any():
-                overall_risk = "Moderate"
-            else:
-                overall_risk = "Low"
-
-            # Generate safety plan
-            all_patterns = []
-            for patterns in results_df['detected_patterns']:
-                if patterns:
-                    all_patterns.extend(patterns)
-
-            safety_plan = generate_safety_plan(overall_risk, all_patterns, escalation_data)
-
-            # Generate recommendations
-            recommendations = generate_professional_recommendations(results_df, escalation_data, overall_risk)
-
-            # Prepare summary
-            summary = {
-                'message_count': len(results_df),
-                'date_range': {
-                    'start': results_df['timestamp'].min().strftime('%Y-%m-%d'),
-                    'end': results_df['timestamp'].max().strftime('%Y-%m-%d')
-                },
-                'overall_risk_level': overall_risk,
-                'sender_stats': sender_stats,
-                'sender_abuse_metrics': sender_abuse_metrics,
-                'primary_abuser': None,
-                'primary_abuser_analysis': None,
-                'escalation_data': escalation_data,
-                'safety_plan': safety_plan,
-                'recommendations': recommendations,
-                'analysis_focus': 'general'
-            }
-
-            return results_df, summary
-
-        # Focus analysis on primary abuser
-        logger.info(f"Focusing analysis on primary abuser: {primary_abuser}")
-        abuser_df = results_df[results_df['sender'] == primary_abuser]
-        victim_df = results_df[results_df['sender'] != primary_abuser]
-
-        # Generate comprehensive primary abuser analysis
-        primary_abuser_analysis = self._analyze_primary_abuser(
-            abuser_df, victim_df, results_df, primary_abuser
-        ) if hasattr(self, '_analyze_primary_abuser') else None
+        # Identify primary abuser
+        primary_abuser, sender_abuse_metrics = self.identify_primary_abuser(results_df)

-        # Detect escalation patterns
-        escalation_data = detect_escalation_patterns(
+        # Detect escalation patterns
+        escalation_data = detect_escalation_patterns(results_df)

-        # Determine overall risk level
-
-        if 'Critical' in abuser_risk_levels and abuser_risk_levels['Critical'] > 0:
+        # Determine overall risk level
+        if results_df['risk_level'].isin(['Critical']).any():
             overall_risk = "Critical"
-        elif 'High' in abuser_risk_levels and abuser_risk_levels['High'] > 0:
+        elif results_df['risk_level'].isin(['High']).any():
             overall_risk = "High"
-        elif 'Moderate' in abuser_risk_levels and abuser_risk_levels['Moderate'] > 0:
+        elif results_df['risk_level'].isin(['Moderate']).any():
             overall_risk = "Moderate"
         else:
             overall_risk = "Low"

-        # Generate safety plan
-
-        for patterns in
+        # Generate safety plan
+        all_patterns = []
+        for patterns in results_df['detected_patterns']:
             if patterns:
-
+                all_patterns.extend(patterns)

-        safety_plan = generate_safety_plan(overall_risk,
+        safety_plan = generate_safety_plan(overall_risk, all_patterns, escalation_data)

-        # Generate recommendations
-        recommendations = generate_professional_recommendations(
+        # Generate professional recommendations
+        recommendations = generate_professional_recommendations(results_df, escalation_data, overall_risk)

-        # Prepare
+        # Prepare summary
         summary = {
             'message_count': len(results_df),
-            'abuser_message_count': len(abuser_df),
-            'victim_message_count': len(victim_df),
             'date_range': {
                 'start': results_df['timestamp'].min().strftime('%Y-%m-%d'),
                 'end': results_df['timestamp'].max().strftime('%Y-%m-%d')
             },
             'overall_risk_level': overall_risk,
-            'sender_stats': sender_stats,
-            'sender_abuse_metrics': sender_abuse_metrics,
+            'sender_stats': sender_stats,
+            'sender_abuse_metrics': sender_abuse_metrics,
             'primary_abuser': primary_abuser,
-            'primary_abuser_analysis': primary_abuser_analysis,
             'escalation_data': escalation_data,
             'safety_plan': safety_plan,
-            'recommendations': recommendations
-            'analysis_focus': 'primary_abuser'  # Flag to indicate focused analysis
+            'recommendations': recommendations
         }

         return results_df, summary
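The replacement branch above picks the overall risk with `Series.isin(...).any()`, an elif cascade that stops at the most severe level present anywhere in the column. A minimal sketch of those semantics on a hypothetical frame:

```python
import pandas as pd

# Hypothetical risk_level column: no 'Critical' rows, one 'High' row.
results_df = pd.DataFrame({'risk_level': ['Low', 'Moderate', 'High']})

if results_df['risk_level'].isin(['Critical']).any():
    overall_risk = "Critical"
elif results_df['risk_level'].isin(['High']).any():
    overall_risk = "High"
elif results_df['risk_level'].isin(['Moderate']).any():
    overall_risk = "Moderate"
else:
    overall_risk = "Low"

print(overall_risk)  # -> High: the chain stops at the first level found
```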
@@ -500,192 +434,7 @@ class MessageAnalyzer:
             'sender_stats': {},
             'sender_abuse_metrics': {},
             'primary_abuser': None,
-            'primary_abuser_analysis': None,
             'escalation_data': {},
             'safety_plan': "Error generating safety plan.",
-            'recommendations': []
-            'analysis_focus': 'error'
+            'recommendations': []
         }
-
-    def _analyze_primary_abuser(self, abuser_df, victim_df, full_df, primary_abuser):
-        """Generate comprehensive analysis of the primary abuser"""
-
-        # Basic statistics
-        avg_abuse = abuser_df['abuse_score'].mean()
-        max_abuse = abuser_df['abuse_score'].max()
-        abusive_count = len(abuser_df[abuser_df['abuse_score'] >= 50])
-        abusive_pct = (abusive_count / len(abuser_df)) * 100
-
-        # Pattern analysis
-        all_patterns = []
-        for patterns in abuser_df['detected_patterns']:
-            if patterns:
-                all_patterns.extend(patterns)
-
-        pattern_counts = Counter(all_patterns)
-        most_common_patterns = pattern_counts.most_common(10)
-
-        # Get example messages for top patterns
-        pattern_examples = {}
-        for pattern, count in most_common_patterns[:5]:  # Top 5 patterns
-            pattern_msgs = abuser_df[abuser_df['detected_patterns'].apply(lambda x: pattern in x)]
-            if not pattern_msgs.empty:
-                # Get highest scoring example
-                example = pattern_msgs.iloc[pattern_msgs['abuse_score'].argmax()]
-                pattern_examples[pattern] = {
-                    'message': example['message'],
-                    'abuse_score': example['abuse_score'],
-                    'timestamp': example['timestamp'].strftime('%Y-%m-%d %H:%M'),
-                    'frequency': count
-                }
-
-        # Temporal patterns
-        abuser_df_copy = abuser_df.copy()
-        abuser_df_copy['hour'] = abuser_df_copy['timestamp'].dt.hour
-        abuser_df_copy['day_of_week'] = abuser_df_copy['timestamp'].dt.day_name()
-
-        # Peak abuse times
-        hour_abuse = abuser_df_copy.groupby('hour')['abuse_score'].mean()
-        peak_hours = hour_abuse.nlargest(3).index.tolist() if not hour_abuse.empty else []
-
-        day_abuse = abuser_df_copy.groupby('day_of_week')['abuse_score'].mean()
-        peak_days = day_abuse.nlargest(3).index.tolist() if not day_abuse.empty else []
-
-        # Response pattern analysis
-        response_patterns = self._analyze_response_patterns(full_df, primary_abuser)
-
-        # Escalation triggers
-        escalation_triggers = self._identify_escalation_triggers(abuser_df, victim_df)
-
-        # Emotional and psychological profile
-        emotional_profile = {
-            'emotional_tones': Counter(abuser_df['emotional_tone']).most_common(5),
-            'avg_darvo_score': abuser_df['darvo_score'].mean(),
-            'high_darvo_incidents': len(abuser_df[abuser_df['darvo_score'] >= 0.65]),
-            'boundary_violations': len(abuser_df[abuser_df['boundary_health'] == 'unhealthy']),
-            'sentiment_distribution': Counter(abuser_df['sentiment']).most_common()
-        }
-
-        # Risk assessment
-        risk_distribution = Counter(abuser_df['risk_level'])
-        critical_incidents = abuser_df[abuser_df['risk_level'] == 'Critical']
-
-        return {
-            'sender': primary_abuser,
-            'message_count': len(abuser_df),
-            'avg_abuse_score': avg_abuse,
-            'max_abuse_score': max_abuse,
-            'abusive_message_count': abusive_count,
-            'abusive_message_pct': abusive_pct,
-            'most_common_patterns': most_common_patterns,
-            'pattern_examples': pattern_examples,
-            'peak_hours': peak_hours,
-            'peak_days': peak_days,
-            'response_patterns': response_patterns,
-            'escalation_triggers': escalation_triggers,
-            'emotional_profile': emotional_profile,
-            'risk_distribution': dict(risk_distribution),
-            'critical_incidents': len(critical_incidents),
-            'critical_incident_examples': [
-                {
-                    'message': row['message'],
-                    'score': row['abuse_score'],
-                    'patterns': row['detected_patterns'],
-                    'timestamp': row['timestamp'].strftime('%Y-%m-%d %H:%M')
-                }
-                for _, row in critical_incidents.head(3).iterrows()
-            ] if not critical_incidents.empty else []
-        }
-
-    def _analyze_response_patterns(self, full_df, primary_abuser):
-        """Analyze how the primary abuser responds to the victim"""
-        response_patterns = []
-
-        if len(full_df) < 10:
-            return response_patterns
-
-        sorted_df = full_df.sort_values('timestamp')
-
-        for i in range(1, len(sorted_df)):
-            current_msg = sorted_df.iloc[i]
-            previous_msg = sorted_df.iloc[i-1]
-
-            # Check if this is abuser responding to victim
-            if (current_msg['sender'] == primary_abuser and
-                previous_msg['sender'] != primary_abuser and
-                current_msg['abuse_score'] >= 40):  # Lowered threshold for response analysis
-
-                response_patterns.append({
-                    'trigger_message': previous_msg['message'][:100] + "..." if len(previous_msg['message']) > 100 else previous_msg['message'],
-                    'trigger_sentiment': previous_msg['sentiment'],
-                    'response_message': current_msg['message'][:100] + "..." if len(current_msg['message']) > 100 else current_msg['message'],
-                    'response_score': current_msg['abuse_score'],
-                    'response_patterns': current_msg['detected_patterns'],
-                    'timestamp': current_msg['timestamp'].strftime('%Y-%m-%d %H:%M')
-                })
-
-        # Return top 5 most abusive responses
-        return sorted(response_patterns, key=lambda x: x['response_score'], reverse=True)[:5]
-
-    def _identify_escalation_triggers(self, abuser_df, victim_df):
-        """Identify what triggers escalation in the abuser's behavior"""
-        # This is a simplified version - could be expanded with more sophisticated analysis
-        triggers = []
-
-        # Look for patterns in high-abuse messages
-        high_abuse_msgs = abuser_df[abuser_df['abuse_score'] >= 70]
-
-        if not high_abuse_msgs.empty:
-            # Common words/themes in high-abuse messages
-            high_abuse_text = ' '.join(high_abuse_msgs['message'].str.lower())
-
-            # Simple keyword analysis (could be enhanced with NLP)
-            trigger_keywords = ['leave', 'divorce', 'break up', 'end', 'done', 'over', 'police', 'help', 'family', 'friends']
-            found_triggers = [word for word in trigger_keywords if word in high_abuse_text]
-
-            triggers.extend(found_triggers)
-
-        return list(set(triggers))  # Remove duplicates
-
-    def _generate_general_analysis(self, results_df, sender_abuse_metrics):
-        """Generate general analysis when no primary abuser is identified"""
-        from utils import detect_escalation_patterns, generate_safety_plan, generate_professional_recommendations
-
-        # Calculate sender statistics for all participants
-        sender_stats = {}
-        for sender in results_df['sender'].unique():
-            sender_df = results_df[results_df['sender'] == sender]
-
-            avg_abuse = sender_df['abuse_score'].mean()
-            max_abuse = sender_df['abuse_score'].max()
-
-            all_patterns = []
-            for patterns in sender_df['detected_patterns']:
-                if patterns:
-                    all_patterns.extend(patterns)
-
-            pattern_counts = Counter(all_patterns)
-            most_common = pattern_counts.most_common(5)
-
-            abusive_count = len(sender_df[sender_df['abuse_score'] >= 50])
-            abusive_pct = (abusive_count / len(sender_df)) * 100 if len(sender_df) > 0 else 0
-
-            tone_counts = Counter(sender_df['emotional_tone'])
-            most_common_tones = tone_counts.most_common(3)
-
-            avg_darvo = sender_df['darvo_score'].mean()
-            high_darvo_count = len(sender_df[sender_df['darvo_score'] >= 0.65])
-            high_darvo_pct = (high_darvo_count / len(sender_df)) * 100 if len(sender_df) > 0 else 0
-
-            risk_counts = Counter(sender_df['risk_level'])
-
-            sender_stats[sender] = {
-                'message_count': len(sender_df),
-                'avg_abuse_score': avg_abuse,
-                'max_abuse_score': max_abuse,
-                'abusive_message_count': abusive_count,
-                'abusive_message_pct': abusive_pct,
-                'common_patterns': most_common,
-                'emotional_tones': most_common_tones,
-                'avg_darvo_score': avg_darvo,
-                'high_
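Note the removed error-summary entries read `'recommendations': []` immediately followed by `'analysis_focus': 'error'` with no separating comma (the removed focused-branch summary above has the same defect). That fails at parse time, so the module never imports. A quick standalone check with a reconstructed snippet (not repo code):

```python
import ast

# Adjacent dict entries with the comma between them missing, as in the removed lines.
snippet = "summary = {'recommendations': [] 'analysis_focus': 'error'}"

try:
    ast.parse(snippet)
except SyntaxError as exc:
    # Fails before any code runs, which surfaces as a startup error for the app.
    print(f"SyntaxError: {exc.msg}")
```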
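For orientation, a hedged sketch of driving the surviving entry point; the no-argument constructor and the input columns ('sender', 'message', 'timestamp') are inferred from the code above, not confirmed by this commit:

```python
import pandas as pd

from analyzer import MessageAnalyzer  # assumes the Space's module is importable

# Minimal two-party history; timestamps must be datetimes for the strftime calls.
df = pd.DataFrame({
    'sender': ['A', 'B', 'A'],
    'message': ['where were you', 'out with friends', 'you always do this'],
    'timestamp': pd.to_datetime(['2024-01-01 10:00',
                                 '2024-01-01 10:02',
                                 '2024-01-01 10:05']),
})

results_df, summary = MessageAnalyzer().analyze_chat_history(df)
print(summary['overall_risk_level'], summary['primary_abuser'])
```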