SamanthaStorm committed on
Commit 67ca8dc · verified · 1 Parent(s): 2765991

Create analyzer.py

Files changed (1)
analyzer.py +22 -273
analyzer.py CHANGED
@@ -31,7 +31,6 @@ class MessageAnalyzer:
         "false equivalence": 0.317,
         "future faking": 0.385
     }
-
     def analyze_message(self, text):
         """Analyze a single message for abuse patterns"""
         from utils import (
@@ -186,7 +185,6 @@ class MessageAnalyzer:
             'boundary_assessment': {'assessment': 'error', 'confidence': 0.0},
             'risk_level': "Unknown"
         }
-
     def identify_primary_abuser(self, results_df):
         """Identify the primary abuser based on comprehensive abuse metrics with pattern severity weighting"""
         logger.info("Identifying primary abuser...")
@@ -297,9 +295,8 @@ class MessageAnalyzer:
 
         logger.info(f"Primary abuser identified: {primary_abuser}")
         return primary_abuser, sender_abuse_metrics
-
     def analyze_chat_history(self, df):
-        """Analyze entire chat history with focus on primary abuser"""
+        """Analyze entire chat history"""
         from utils import detect_escalation_patterns, generate_safety_plan, generate_professional_recommendations
 
         logger.info(f"Analyzing chat history with {len(df)} messages")
@@ -330,10 +327,7 @@ class MessageAnalyzer:
             results_df.at[i, 'boundary_health'] = analysis['boundary_assessment']['assessment']
             results_df.at[i, 'risk_level'] = analysis['risk_level']
 
-        # Identify primary abuser FIRST
-        primary_abuser, sender_abuse_metrics = self.identify_primary_abuser(results_df)
-
-        # Calculate traditional sender statistics for backward compatibility
+        # Calculate sender statistics
        sender_stats = {}
        for sender in results_df['sender'].unique():
            sender_df = results_df[results_df['sender'] == sender]
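The sender-statistics loop in the context lines above filters the frame once per sender via results_df['sender'].unique(). Purely as an illustration of the same per-sender split, and not code from this commit, the core aggregation can also be expressed as a pandas groupby over a toy frame:

import pandas as pd

# Toy stand-in for results_df; only the columns used here are included.
results_df = pd.DataFrame({
    'sender': ['A', 'B', 'A'],
    'abuse_score': [70, 10, 50],
})

# One row per sender: message count, mean score, max score.
per_sender = results_df.groupby('sender')['abuse_score'].agg(['count', 'mean', 'max'])
print(per_sender)
#         count  mean  max
# sender
# A           2  60.0   70
# B           1  10.0   10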
@@ -382,107 +376,47 @@ class MessageAnalyzer:
             'risk_levels': risk_counts
         }
 
-        # If no primary abuser identified, provide basic analysis
-        if not primary_abuser:
-            logger.info("No primary abuser identified - providing general analysis")
-
-            # Detect escalation patterns
-            escalation_data = detect_escalation_patterns(results_df)
-
-            # Determine overall risk level
-            if results_df['risk_level'].isin(['Critical']).any():
-                overall_risk = "Critical"
-            elif results_df['risk_level'].isin(['High']).any():
-                overall_risk = "High"
-            elif results_df['risk_level'].isin(['Moderate']).any():
-                overall_risk = "Moderate"
-            else:
-                overall_risk = "Low"
-
-            # Generate safety plan
-            all_patterns = []
-            for patterns in results_df['detected_patterns']:
-                if patterns:
-                    all_patterns.extend(patterns)
-
-            safety_plan = generate_safety_plan(overall_risk, all_patterns, escalation_data)
-
-            # Generate recommendations
-            recommendations = generate_professional_recommendations(results_df, escalation_data, overall_risk)
-
-            # Prepare summary
-            summary = {
-                'message_count': len(results_df),
-                'date_range': {
-                    'start': results_df['timestamp'].min().strftime('%Y-%m-%d'),
-                    'end': results_df['timestamp'].max().strftime('%Y-%m-%d')
-                },
-                'overall_risk_level': overall_risk,
-                'sender_stats': sender_stats,
-                'sender_abuse_metrics': sender_abuse_metrics,
-                'primary_abuser': None,
-                'primary_abuser_analysis': None,
-                'escalation_data': escalation_data,
-                'safety_plan': safety_plan,
-                'recommendations': recommendations,
-                'analysis_focus': 'general'
-            }
-
-            return results_df, summary
-
-        # Focus analysis on primary abuser
-        logger.info(f"Focusing analysis on primary abuser: {primary_abuser}")
-        abuser_df = results_df[results_df['sender'] == primary_abuser]
-        victim_df = results_df[results_df['sender'] != primary_abuser]
-
-        # Generate comprehensive primary abuser analysis
-        primary_abuser_analysis = self._analyze_primary_abuser(
-            abuser_df, victim_df, results_df, primary_abuser
-        ) if hasattr(self, '_analyze_primary_abuser') else None
+        # Identify primary abuser
+        primary_abuser, sender_abuse_metrics = self.identify_primary_abuser(results_df)
 
-        # Detect escalation patterns (focus on abuser's messages)
-        escalation_data = detect_escalation_patterns(abuser_df)
+        # Detect escalation patterns
+        escalation_data = detect_escalation_patterns(results_df)
 
-        # Determine overall risk level based on primary abuser
-        abuser_risk_levels = abuser_df['risk_level'].value_counts()
-        if 'Critical' in abuser_risk_levels and abuser_risk_levels['Critical'] > 0:
+        # Determine overall risk level
+        if results_df['risk_level'].isin(['Critical']).any():
             overall_risk = "Critical"
-        elif 'High' in abuser_risk_levels and abuser_risk_levels['High'] > 0:
+        elif results_df['risk_level'].isin(['High']).any():
             overall_risk = "High"
-        elif 'Moderate' in abuser_risk_levels and abuser_risk_levels['Moderate'] > 0:
+        elif results_df['risk_level'].isin(['Moderate']).any():
             overall_risk = "Moderate"
         else:
             overall_risk = "Low"
 
-        # Generate safety plan based on abuser's patterns
-        abuser_patterns = []
-        for patterns in abuser_df['detected_patterns']:
+        # Generate safety plan
+        all_patterns = []
+        for patterns in results_df['detected_patterns']:
             if patterns:
-                abuser_patterns.extend(patterns)
+                all_patterns.extend(patterns)
 
-        safety_plan = generate_safety_plan(overall_risk, abuser_patterns, escalation_data)
+        safety_plan = generate_safety_plan(overall_risk, all_patterns, escalation_data)
 
-        # Generate recommendations focused on the abuser's behavior
-        recommendations = generate_professional_recommendations(abuser_df, escalation_data, overall_risk)
+        # Generate professional recommendations
+        recommendations = generate_professional_recommendations(results_df, escalation_data, overall_risk)
 
-        # Prepare focused summary
+        # Prepare summary
         summary = {
             'message_count': len(results_df),
-            'abuser_message_count': len(abuser_df),
-            'victim_message_count': len(victim_df),
             'date_range': {
                 'start': results_df['timestamp'].min().strftime('%Y-%m-%d'),
                 'end': results_df['timestamp'].max().strftime('%Y-%m-%d')
             },
             'overall_risk_level': overall_risk,
-            'sender_stats': sender_stats,  # Include traditional sender stats for backward compatibility
-            'sender_abuse_metrics': sender_abuse_metrics,  # Include detailed abuse metrics
+            'sender_stats': sender_stats,
+            'sender_abuse_metrics': sender_abuse_metrics,
             'primary_abuser': primary_abuser,
-            'primary_abuser_analysis': primary_abuser_analysis,
             'escalation_data': escalation_data,
             'safety_plan': safety_plan,
-            'recommendations': recommendations,
-            'analysis_focus': 'primary_abuser'  # Flag to indicate focused analysis
+            'recommendations': recommendations
         }
 
         return results_df, summary
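The net effect of the hunk above: analyze_chat_history no longer branches on whether a primary abuser was found, and the overall risk level is now computed across all messages rather than only the suspected abuser's. A minimal runnable sketch of the severity cascade the new code uses; the column name and level labels come from the diff, the sample rows are made up:

import pandas as pd

# Toy stand-in for results_df; only 'risk_level' matters for this cascade.
results_df = pd.DataFrame({'risk_level': ['Low', 'Moderate', 'High', 'Low']})

# Same logic as the + lines above: the most severe level present wins.
if results_df['risk_level'].isin(['Critical']).any():
    overall_risk = "Critical"
elif results_df['risk_level'].isin(['High']).any():
    overall_risk = "High"
elif results_df['risk_level'].isin(['Moderate']).any():
    overall_risk = "Moderate"
else:
    overall_risk = "Low"

print(overall_risk)  # -> High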
@@ -500,192 +434,7 @@ class MessageAnalyzer:
             'sender_stats': {},
             'sender_abuse_metrics': {},
             'primary_abuser': None,
-            'primary_abuser_analysis': None,
             'escalation_data': {},
             'safety_plan': "Error generating safety plan.",
-            'recommendations': [],
-            'analysis_focus': 'error'
+            'recommendations': []
         }
-
-    def _analyze_primary_abuser(self, abuser_df, victim_df, full_df, primary_abuser):
-        """Generate comprehensive analysis of the primary abuser"""
-
-        # Basic statistics
-        avg_abuse = abuser_df['abuse_score'].mean()
-        max_abuse = abuser_df['abuse_score'].max()
-        abusive_count = len(abuser_df[abuser_df['abuse_score'] >= 50])
-        abusive_pct = (abusive_count / len(abuser_df)) * 100
-
-        # Pattern analysis
-        all_patterns = []
-        for patterns in abuser_df['detected_patterns']:
-            if patterns:
-                all_patterns.extend(patterns)
-
-        pattern_counts = Counter(all_patterns)
-        most_common_patterns = pattern_counts.most_common(10)
-
-        # Get example messages for top patterns
-        pattern_examples = {}
-        for pattern, count in most_common_patterns[:5]:  # Top 5 patterns
-            pattern_msgs = abuser_df[abuser_df['detected_patterns'].apply(lambda x: pattern in x)]
-            if not pattern_msgs.empty:
-                # Get highest scoring example
-                example = pattern_msgs.iloc[pattern_msgs['abuse_score'].argmax()]
-                pattern_examples[pattern] = {
-                    'message': example['message'],
-                    'abuse_score': example['abuse_score'],
-                    'timestamp': example['timestamp'].strftime('%Y-%m-%d %H:%M'),
-                    'frequency': count
-                }
-
-        # Temporal patterns
-        abuser_df_copy = abuser_df.copy()
-        abuser_df_copy['hour'] = abuser_df_copy['timestamp'].dt.hour
-        abuser_df_copy['day_of_week'] = abuser_df_copy['timestamp'].dt.day_name()
-
-        # Peak abuse times
-        hour_abuse = abuser_df_copy.groupby('hour')['abuse_score'].mean()
-        peak_hours = hour_abuse.nlargest(3).index.tolist() if not hour_abuse.empty else []
-
-        day_abuse = abuser_df_copy.groupby('day_of_week')['abuse_score'].mean()
-        peak_days = day_abuse.nlargest(3).index.tolist() if not day_abuse.empty else []
-
-        # Response pattern analysis
-        response_patterns = self._analyze_response_patterns(full_df, primary_abuser)
-
-        # Escalation triggers
-        escalation_triggers = self._identify_escalation_triggers(abuser_df, victim_df)
-
-        # Emotional and psychological profile
-        emotional_profile = {
-            'emotional_tones': Counter(abuser_df['emotional_tone']).most_common(5),
-            'avg_darvo_score': abuser_df['darvo_score'].mean(),
-            'high_darvo_incidents': len(abuser_df[abuser_df['darvo_score'] >= 0.65]),
-            'boundary_violations': len(abuser_df[abuser_df['boundary_health'] == 'unhealthy']),
-            'sentiment_distribution': Counter(abuser_df['sentiment']).most_common()
-        }
-
-        # Risk assessment
-        risk_distribution = Counter(abuser_df['risk_level'])
-        critical_incidents = abuser_df[abuser_df['risk_level'] == 'Critical']
-
-        return {
-            'sender': primary_abuser,
-            'message_count': len(abuser_df),
-            'avg_abuse_score': avg_abuse,
-            'max_abuse_score': max_abuse,
-            'abusive_message_count': abusive_count,
-            'abusive_message_pct': abusive_pct,
-            'most_common_patterns': most_common_patterns,
-            'pattern_examples': pattern_examples,
-            'peak_hours': peak_hours,
-            'peak_days': peak_days,
-            'response_patterns': response_patterns,
-            'escalation_triggers': escalation_triggers,
-            'emotional_profile': emotional_profile,
-            'risk_distribution': dict(risk_distribution),
-            'critical_incidents': len(critical_incidents),
-            'critical_incident_examples': [
-                {
-                    'message': row['message'],
-                    'score': row['abuse_score'],
-                    'patterns': row['detected_patterns'],
-                    'timestamp': row['timestamp'].strftime('%Y-%m-%d %H:%M')
-                }
-                for _, row in critical_incidents.head(3).iterrows()
-            ] if not critical_incidents.empty else []
-        }
-
-    def _analyze_response_patterns(self, full_df, primary_abuser):
-        """Analyze how the primary abuser responds to the victim"""
-        response_patterns = []
-
-        if len(full_df) < 10:
-            return response_patterns
-
-        sorted_df = full_df.sort_values('timestamp')
-
-        for i in range(1, len(sorted_df)):
-            current_msg = sorted_df.iloc[i]
-            previous_msg = sorted_df.iloc[i-1]
-
-            # Check if this is abuser responding to victim
-            if (current_msg['sender'] == primary_abuser and
-                previous_msg['sender'] != primary_abuser and
-                current_msg['abuse_score'] >= 40):  # Lowered threshold for response analysis
-
-                response_patterns.append({
-                    'trigger_message': previous_msg['message'][:100] + "..." if len(previous_msg['message']) > 100 else previous_msg['message'],
-                    'trigger_sentiment': previous_msg['sentiment'],
-                    'response_message': current_msg['message'][:100] + "..." if len(current_msg['message']) > 100 else current_msg['message'],
-                    'response_score': current_msg['abuse_score'],
-                    'response_patterns': current_msg['detected_patterns'],
-                    'timestamp': current_msg['timestamp'].strftime('%Y-%m-%d %H:%M')
-                })
-
-        # Return top 5 most abusive responses
-        return sorted(response_patterns, key=lambda x: x['response_score'], reverse=True)[:5]
-
-    def _identify_escalation_triggers(self, abuser_df, victim_df):
-        """Identify what triggers escalation in the abuser's behavior"""
-        # This is a simplified version - could be expanded with more sophisticated analysis
-        triggers = []
-
-        # Look for patterns in high-abuse messages
-        high_abuse_msgs = abuser_df[abuser_df['abuse_score'] >= 70]
-
-        if not high_abuse_msgs.empty:
-            # Common words/themes in high-abuse messages
-            high_abuse_text = ' '.join(high_abuse_msgs['message'].str.lower())
-
-            # Simple keyword analysis (could be enhanced with NLP)
-            trigger_keywords = ['leave', 'divorce', 'break up', 'end', 'done', 'over', 'police', 'help', 'family', 'friends']
-            found_triggers = [word for word in trigger_keywords if word in high_abuse_text]
-
-            triggers.extend(found_triggers)
-
-        return list(set(triggers))  # Remove duplicates
-
-    def _generate_general_analysis(self, results_df, sender_abuse_metrics):
-        """Generate general analysis when no primary abuser is identified"""
-        from utils import detect_escalation_patterns, generate_safety_plan, generate_professional_recommendations
-
-        # Calculate sender statistics for all participants
-        sender_stats = {}
-        for sender in results_df['sender'].unique():
-            sender_df = results_df[results_df['sender'] == sender]
-
-            avg_abuse = sender_df['abuse_score'].mean()
-            max_abuse = sender_df['abuse_score'].max()
-
-            all_patterns = []
-            for patterns in sender_df['detected_patterns']:
-                if patterns:
-                    all_patterns.extend(patterns)
-
-            pattern_counts = Counter(all_patterns)
-            most_common = pattern_counts.most_common(5)
-
-            abusive_count = len(sender_df[sender_df['abuse_score'] >= 50])
-            abusive_pct = (abusive_count / len(sender_df)) * 100 if len(sender_df) > 0 else 0
-
-            tone_counts = Counter(sender_df['emotional_tone'])
-            most_common_tones = tone_counts.most_common(3)
-
-            avg_darvo = sender_df['darvo_score'].mean()
-            high_darvo_count = len(sender_df[sender_df['darvo_score'] >= 0.65])
-            high_darvo_pct = (high_darvo_count / len(sender_df)) * 100 if len(sender_df) > 0 else 0
-
-            risk_counts = Counter(sender_df['risk_level'])
-
-            sender_stats[sender] = {
-                'message_count': len(sender_df),
-                'avg_abuse_score': avg_abuse,
-                'max_abuse_score': max_abuse,
-                'abusive_message_count': abusive_count,
-                'abusive_message_pct': abusive_pct,
-                'common_patterns': most_common,
-                'emotional_tones': most_common_tones,
-                'avg_darvo_score': avg_darvo,
-                'high_
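Of the helper methods deleted in this hunk, _analyze_response_patterns is the most algorithmic: it walks the conversation in timestamp order and pairs each of the primary abuser's messages with the message it follows, keeping only sufficiently abusive responses. A self-contained sketch of that scan; the standalone function, the toy rows, and the dropped length guard and message truncation are editorial simplifications, not code from this repo:

import pandas as pd

def response_patterns(full_df, primary_abuser, threshold=40):
    """Pair each message from primary_abuser with the message it follows,
    keeping responses whose abuse_score meets the threshold."""
    pairs = []
    sorted_df = full_df.sort_values('timestamp')
    for i in range(1, len(sorted_df)):
        current = sorted_df.iloc[i]
        previous = sorted_df.iloc[i - 1]
        if (current['sender'] == primary_abuser
                and previous['sender'] != primary_abuser
                and current['abuse_score'] >= threshold):
            pairs.append({
                'trigger': previous['message'],
                'response': current['message'],
                'response_score': current['abuse_score'],
            })
    # Most abusive responses first, top five, as in the removed method.
    return sorted(pairs, key=lambda p: p['response_score'], reverse=True)[:5]

df = pd.DataFrame({
    'timestamp': pd.to_datetime(['2024-01-01 09:00', '2024-01-01 09:01']),
    'sender': ['B', 'A'],
    'message': ['I need some space tonight.', 'You never appreciate anything I do.'],
    'abuse_score': [5, 62],
})
print(response_patterns(df, 'A'))  # one trigger/response pair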
 
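After this commit the public surface of analyze_chat_history is unchanged: it still returns (results_df, summary), and the summary still carries 'primary_abuser', 'sender_stats', and 'sender_abuse_metrics'. A hedged usage sketch, assuming this repo's analyzer.py and utils.py are importable and that MessageAnalyzer takes no constructor arguments (the constructor is not shown in this diff):

import pandas as pd
from analyzer import MessageAnalyzer  # this repo's module

# Input columns inferred from the diff: 'timestamp', 'sender', 'message'.
df = pd.DataFrame({
    'timestamp': pd.to_datetime(['2024-01-01 10:00', '2024-01-01 10:05']),
    'sender': ['A', 'B'],
    'message': ['Where were you all night?', 'I told you, I was at work.'],
})

analyzer = MessageAnalyzer()  # assumed no-arg constructor
results_df, summary = analyzer.analyze_chat_history(df)

print(summary['overall_risk_level'])  # e.g. "Low"
print(summary['primary_abuser'])      # a sender name, or None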