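"""Pre-class analytics over student chat histories.

Parses a MongoDB-style chat export into a pandas DataFrame, tags each message
with question/confusion/follow-up heuristics and TextBlob sentiment, then
derives per-topic difficulty levels, confusion and question patterns, time and
engagement metrics, per-student progress, and a JSON-serializable report.
"""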
import pandas as pd
import numpy as np
from datetime import datetime
from typing import List, Dict, Any
import spacy
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import networkx as nx
from scipy import stats
import logging
import json
from dataclasses import dataclass
from enum import Enum

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TopicDifficulty(Enum):
    EASY = "easy"
    MODERATE = "moderate"
    DIFFICULT = "difficult"
    VERY_DIFFICULT = "very_difficult"


@dataclass
class QuestionMetrics:
    complexity_score: float
    follow_up_count: int
    clarification_count: int
    time_spent: float
    sentiment_score: float

@dataclass
class TopicInsights:
    difficulty_level: TopicDifficulty
    common_confusion_points: List[str]
    question_patterns: List[str]
    time_distribution: Dict[str, float]
    engagement_metrics: Dict[str, float]
    recommended_focus_areas: List[str]

    def to_dict(self):
        return {
            "difficulty_level": self.difficulty_level.value,  # Convert enum to its value
            "common_confusion_points": self.common_confusion_points,
            "question_patterns": self.question_patterns,
            "time_distribution": {str(k): v for k, v in self.time_distribution.items()},
            "engagement_metrics": self.engagement_metrics,
            "recommended_focus_areas": self.recommended_focus_areas,
        }

class PreClassAnalytics:
    def __init__(self, nlp_model: str = "en_core_web_lg"):
        """Initialize the analytics system with necessary components."""
        self.nlp = spacy.load(nlp_model)
        self.question_indicators = {
            "what", "why", "how", "when", "where", "which", "who", 
            "whose", "whom", "can", "could", "would", "will", "explain"
        }
        self.confusion_indicators = {
            "confused", "don't understand", "unclear", "not clear",
            "stuck", "difficult", "hard", "help", "explain again"
        }
        self.follow_up_indicators = {
            "also", "another", "additionally", "furthermore", "moreover",
            "besides", "related", "similarly", "again"
        }
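
    # Expected chat_history shape (hypothetical sample mirroring a MongoDB
    # export; only the fields read in preprocess_chat_history are required):
    # [
    #   {
    #     "user_id": {"$oid": "507f1f77bcf86cd799439011"},
    #     "messages": [
    #       {
    #         "timestamp": {"$date": "2024-01-15T10:30:00Z"},
    #         "prompt": "Can you explain binary search trees?",
    #         "response": "A binary search tree is..."
    #       }
    #     ]
    #   }
    # ]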
        
    def preprocess_chat_history(self, chat_history: List[Dict]) -> pd.DataFrame:
        """Convert chat history to DataFrame with enhanced features."""
        messages = []
        for chat in chat_history:
            user_id = chat['user_id']['$oid']
            for msg in chat['messages']:
                try:
                    # Ensure the timestamp is in the correct format
                    if isinstance(msg['timestamp'], dict) and '$date' in msg['timestamp']:
                        timestamp = pd.to_datetime(msg['timestamp']['$date'])
                    elif isinstance(msg['timestamp'], str):
                        timestamp = pd.to_datetime(msg['timestamp'])
                    else:
                        raise ValueError("Invalid timestamp format")
                except Exception as e:
                    logger.warning(f"Error parsing timestamp: {msg['timestamp']}, error: {e}")
                    timestamp = pd.NaT  # Use NaT (Not a Time) for invalid timestamps
                
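                # NOTE: the three flags below use naive substring matching
                # (e.g. "who" also matches "whole"); treat them as coarse heuristics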
                messages.append({
                    'user_id': user_id,
                    'timestamp': timestamp,
                    'prompt': msg['prompt'],
                    'response': msg['response'],
                    'is_question': any(q in msg['prompt'].lower() for q in self.question_indicators),
                    'shows_confusion': any(c in msg['prompt'].lower() for c in self.confusion_indicators),
                    'is_followup': any(f in msg['prompt'].lower() for f in self.follow_up_indicators)
                })
        
        df = pd.DataFrame(messages)
        df['sentiment'] = df['prompt'].apply(lambda x: TextBlob(x).sentiment.polarity)
        return df

    def extract_topic_hierarchies(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """Extract hierarchical topic relationships from conversations."""
        topic_hierarchy = defaultdict(list)
        
        for _, row in df.iterrows():
            doc = self.nlp(row['prompt'])
            
            # Extract main topics and subtopics using noun chunks and dependencies
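            # Heuristic: chunks rooted at a subject/object are treated as main
            # topics; parse quality of the spaCy model directly affects results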
            main_topics = []
            subtopics = []
            
            for chunk in doc.noun_chunks:
                if chunk.root.dep_ in ('nsubj', 'dobj'):
                    main_topics.append(chunk.text.lower())
                else:
                    subtopics.append(chunk.text.lower())
            
            # Build hierarchy
            for main_topic in main_topics:
                topic_hierarchy[main_topic].extend(subtopics)
        
        # Clean and deduplicate
        return {k: list(set(v)) for k, v in topic_hierarchy.items()}

    def analyze_topic_difficulty(self, df: pd.DataFrame, topic: str) -> TopicDifficulty:
        """Determine topic difficulty based on various metrics."""
        # Match literally (regex=False) so regex metacharacters in topic strings can't break the filter
        topic_msgs = df[df['prompt'].str.contains(topic, case=False, regex=False)]
        if topic_msgs.empty:
            # No message mentions this topic; without the guard, NaN rates would
            # fall through every comparison below and report VERY_DIFFICULT.
            # MODERATE is chosen here as a neutral default.
            return TopicDifficulty.MODERATE
        
        # Calculate difficulty indicators
        confusion_rate = topic_msgs['shows_confusion'].mean()
        question_rate = topic_msgs['is_question'].mean()
        follow_up_rate = topic_msgs['is_followup'].mean()
        avg_sentiment = topic_msgs['sentiment'].mean()
        
        # Calculate composite difficulty score
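        # Weights are heuristic: confusion is weighted most heavily, then questions
        # and follow-ups; sentiment is rescaled from [-1, 1] to [0, 1] and inverted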
        difficulty_score = (
            confusion_rate * 0.4 +
            question_rate * 0.3 +
            follow_up_rate * 0.2 +
            (1 - (avg_sentiment + 1) / 2) * 0.1
        )
        
        # Map score to difficulty level
        if difficulty_score < 0.3:
            return TopicDifficulty.EASY
        elif difficulty_score < 0.5:
            return TopicDifficulty.MODERATE
        elif difficulty_score < 0.7:
            return TopicDifficulty.DIFFICULT
        else:
            return TopicDifficulty.VERY_DIFFICULT

    def identify_confusion_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Identify common patterns in student confusion."""
        confused_msgs = df[
            (df['prompt'].str.contains(topic, case=False, regex=False)) & 
            (df['shows_confusion'])
        ]['prompt']
        
        patterns = []
        for msg in confused_msgs:
            doc = self.nlp(msg)
            
            # Extract sentences containing a confusion indicator; a substring
            # check is used so multi-word indicators ("don't understand") match,
            # which a token-by-token comparison never could
            for sent in doc.sents:
                sent_lower = sent.text.lower()
                if any(ind in sent_lower for ind in self.confusion_indicators):
                    patterns.append(sent.text)
        
        # Group similar patterns
        if patterns:
            vectorizer = TfidfVectorizer(ngram_range=(1, 3))
            tfidf_matrix = vectorizer.fit_transform(patterns)
            similarity_matrix = cosine_similarity(tfidf_matrix)
            
            # Cluster similar patterns; add every pattern as a node first so
            # patterns with no similar neighbor survive as singleton clusters
            G = nx.Graph()
            G.add_nodes_from(range(len(patterns)))
            for i in range(len(patterns)):
                for j in range(i + 1, len(patterns)):
                    if similarity_matrix[i][j] > 0.5:  # Similarity threshold
                        G.add_edge(i, j)
            
            # Extract representative patterns from each cluster
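            # min(cluster) keeps the earliest-seen pattern as each cluster's representative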
            clusters = list(nx.connected_components(G))
            return [patterns[min(cluster)] for cluster in clusters]
        
        return []

    def analyze_question_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Analyze patterns in student questions about the topic."""
        topic_questions = df[
            (df['prompt'].str.contains(topic, case=False, regex=False)) & 
            (df['is_question'])
        ]['prompt']
        
        question_types = defaultdict(list)
        for question in topic_questions:
            doc = self.nlp(question)
            
            # Categorize questions
            if any(token.text.lower() in {"what", "define", "explain"} for token in doc):
                question_types["conceptual"].append(question)
            elif any(token.text.lower() in {"how", "steps", "process"} for token in doc):
                question_types["procedural"].append(question)
            elif any(token.text.lower() in {"why", "reason", "because"} for token in doc):
                question_types["reasoning"].append(question)
            else:
                question_types["other"].append(question)
        
        # Extract patterns from each category
        patterns = []
        for category, questions in question_types.items():
            if questions:
                vectorizer = TfidfVectorizer(ngram_range=(1, 3))
                tfidf_matrix = vectorizer.fit_transform(questions)
                
                # Surface the highest-weighted n-grams as a summary for the category
                feature_array = np.mean(tfidf_matrix.toarray(), axis=0)
                tfidf_sorting = np.argsort(feature_array)[::-1]
                features = vectorizer.get_feature_names_out()
                
                patterns.append(f"{category}: {' '.join(features[tfidf_sorting[:3]])}")
        
        return patterns

    def analyze_time_distribution(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Analyze time spent on different aspects of the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False, regex=False)].copy()
        if len(topic_msgs) < 2:
            return {}
        
        # Sort chronologically so diffs measure real gaps between consecutive messages
        topic_msgs = topic_msgs.sort_values('timestamp')
        topic_msgs['time_diff'] = topic_msgs['timestamp'].diff()
        
        # Calculate time distribution
        distribution = {
            'total_time': topic_msgs['time_diff'].sum().total_seconds() / 60,
            'avg_time_per_message': topic_msgs['time_diff'].mean().total_seconds() / 60,
            'max_time_gap': topic_msgs['time_diff'].max().total_seconds() / 60,
            'time_spent_on_questions': topic_msgs[topic_msgs['is_question']]['time_diff'].sum().total_seconds() / 60,
            'time_spent_on_confusion': topic_msgs[topic_msgs['shows_confusion']]['time_diff'].sum().total_seconds() / 60
        }
        
        return distribution

    def calculate_engagement_metrics(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Calculate student engagement metrics for the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False, regex=False)]
        if topic_msgs.empty:
            # Return zeroed metrics instead of NaNs for topics a student never mentioned
            return {
                'message_count': 0,
                'question_ratio': 0.0,
                'confusion_ratio': 0.0,
                'follow_up_ratio': 0.0,
                'avg_sentiment': 0.0,
                'engagement_score': 0.0
            }
        
        metrics = {
            'message_count': len(topic_msgs),
            'question_ratio': topic_msgs['is_question'].mean(),
            'confusion_ratio': topic_msgs['shows_confusion'].mean(),
            'follow_up_ratio': topic_msgs['is_followup'].mean(),
            'avg_sentiment': topic_msgs['sentiment'].mean(),
            'engagement_score': 0.0  # Will be calculated below
        }
        
        # Calculate engagement score
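        # NOTE: message_count is unnormalized, so long conversations dominate
        # this score; the weights are heuristic rather than calibrated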
        metrics['engagement_score'] = (
            metrics['message_count'] * 0.3 +
            metrics['question_ratio'] * 0.25 +
            metrics['follow_up_ratio'] * 0.25 +
            (metrics['avg_sentiment'] + 1) / 2 * 0.2  # Normalize sentiment to 0-1
        )
        
        return metrics

    def generate_topic_insights(self, df: pd.DataFrame, topic: str) -> TopicInsights:
        """Generate comprehensive insights for a topic."""
        difficulty = self.analyze_topic_difficulty(df, topic)
        confusion_points = self.identify_confusion_patterns(df, topic)
        question_patterns = self.analyze_question_patterns(df, topic)
        time_distribution = self.analyze_time_distribution(df, topic)
        engagement_metrics = self.calculate_engagement_metrics(df, topic)
        
        # Generate recommended focus areas based on insights
        focus_areas = []
        
        if difficulty in (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT):
            focus_areas.append("Fundamental concept reinforcement needed")
        
        if confusion_points:
            focus_areas.append(f"Address common confusion around: {', '.join(confusion_points[:3])}")
        
        if engagement_metrics['confusion_ratio'] > 0.3:
            focus_areas.append("Consider alternative teaching approaches")
        
        if time_distribution.get('time_spent_on_questions', 0) > time_distribution.get('total_time', 0) * 0.5:
            focus_areas.append("More practical examples or demonstrations needed")
        
        return TopicInsights(
            difficulty_level=difficulty,
            common_confusion_points=confusion_points,
            question_patterns=question_patterns,
            time_distribution=time_distribution,
            engagement_metrics=engagement_metrics,
            recommended_focus_areas=focus_areas
        )

    def analyze_student_progress(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze individual student progress and learning patterns."""
        student_progress = {}
        
        for student_id in df['user_id'].unique():
            student_msgs = df[df['user_id'] == student_id]
            
            # Calculate student-specific metrics
            progress = {
                'total_messages': len(student_msgs),
                'questions_asked': student_msgs['is_question'].sum(),
                'confusion_instances': student_msgs['shows_confusion'].sum(),
                'avg_sentiment': student_msgs['sentiment'].mean(),
                'topic_engagement': {},
                'learning_pattern': self._identify_learning_pattern(student_msgs)
            }
            
            # Analyze topic-specific engagement
            topics = self.extract_topic_hierarchies(student_msgs)
            for topic in topics:
                topic_msgs = student_msgs[student_msgs['prompt'].str.contains(topic, case=False, regex=False)]
                progress['topic_engagement'][topic] = {
                    'message_count': len(topic_msgs),
                    'confusion_rate': topic_msgs['shows_confusion'].mean(),
                    # linregress needs at least two points; report a flat trend otherwise
                    'sentiment_trend': stats.linregress(
                        range(len(topic_msgs)),
                        topic_msgs['sentiment']
                    ).slope if len(topic_msgs) >= 2 else 0.0
                }
            
            student_progress[student_id] = progress
        
        return student_progress

    def _identify_learning_pattern(self, student_msgs: pd.DataFrame) -> str:
        """Identify student's learning pattern based on their interaction style."""
        # Calculate key metrics
        question_ratio = student_msgs['is_question'].mean()
        confusion_ratio = student_msgs['shows_confusion'].mean()
        follow_up_ratio = student_msgs['is_followup'].mean()
        # linregress needs at least two points; treat single-message histories as flat
        sentiment_trend = stats.linregress(
            range(len(student_msgs)),
            student_msgs['sentiment']
        ).slope if len(student_msgs) >= 2 else 0.0
        
        # Identify pattern
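        # Heuristic cut-offs, checked in priority order (first match wins)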
        if question_ratio > 0.6:
            return "Inquisitive Learner"
        elif confusion_ratio > 0.4:
            return "Needs Additional Support"
        elif follow_up_ratio > 0.5:
            return "Deep Dive Learner"
        elif sentiment_trend > 0:
            return "Progressive Learner"
        else:
            return "Steady Learner"

    def generate_comprehensive_report(self, chat_history: List[Dict]) -> Dict[str, Any]:
        """Generate a comprehensive analytics report."""
        # Preprocess chat history
        df = self.preprocess_chat_history(chat_history)
        
        # Extract topics
        topics = self.extract_topic_hierarchies(df)
        
        report = {
            'topics': {},
            'student_progress': self.analyze_student_progress(df),
            'overall_metrics': {
                'total_messages': len(df),  # df holds one row per message
                'unique_students': df['user_id'].nunique(),
                'avg_sentiment': df['sentiment'].mean(),
                'most_discussed_topics': Counter(
                    topic for topics_list in topics.values() 
                    for topic in topics_list
                ).most_common(5)
            }
        }
        
        # Generate topic-specific insights
        for main_topic, subtopics in topics.items():
            subtopic_insights = {}
            for subtopic in subtopics:
                subtopic_insights[subtopic] = {
                    'insights': self.generate_topic_insights(df, subtopic),
                    'related_topics': [t for t in subtopics if t != subtopic],
                    'student_engagement': {
                        student_id: self.calculate_engagement_metrics(
                            df[df['user_id'] == student_id], 
                            subtopic
                        )
                        for student_id in df['user_id'].unique()
                    }
                }
            
            report['topics'][main_topic] = {
                'insights': self.generate_topic_insights(df, main_topic),
                'subtopics': subtopic_insights,
                'topic_relationships': {
                    'hierarchy_depth': len(subtopics),
                    'connection_strength': self._calculate_topic_connections(df, main_topic, subtopics),
                    'progression_path': self._identify_topic_progression(df, main_topic, subtopics)
                }
            }
        
        # Add temporal analysis
        report['temporal_analysis'] = {
            # Stringify/normalize group keys: json.dumps rejects datetime.date
            # and numpy-integer dictionary keys, so convert them up front
            'daily_engagement': df.groupby(df['timestamp'].dt.date.astype(str)).agg({
                'user_id': 'count',
                'is_question': 'sum',
                'shows_confusion': 'sum',
                'sentiment': 'mean'
            }).to_dict(),
            'peak_activity_hours': {
                int(hour): count
                for hour, count in df.groupby(df['timestamp'].dt.hour)['user_id'].count().nlargest(3).items()
            },
            'learning_trends': self._analyze_learning_trends(df)
        }
        
        # Add recommendations
        report['recommendations'] = self._generate_recommendations(report)
        
        return report

    def _calculate_topic_connections(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> Dict[str, float]:
        """Calculate connection strength between topics based on co-occurrence."""
        connections = {}
        main_topic_msgs = df[df['prompt'].str.contains(main_topic, case=False, regex=False)]
        
        for subtopic in subtopics:
            # Co-occurrence: main-topic messages that also mention the subtopic
            cooccurrence = main_topic_msgs['prompt'].str.contains(subtopic, case=False, regex=False).sum()
            
            # Fraction of main-topic messages that also mention the subtopic
            connection_strength = cooccurrence / len(main_topic_msgs) if len(main_topic_msgs) > 0 else 0
            connections[subtopic] = connection_strength
        
        return connections

    def _identify_topic_progression(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> List[str]:
        """Identify optimal topic progression path based on student interactions."""
        # Rank difficulties explicitly; sorting the enum's string values would
        # order them alphabetically ("difficult" < "easy"), not by difficulty
        difficulty_rank = {
            TopicDifficulty.EASY: 0,
            TopicDifficulty.MODERATE: 1,
            TopicDifficulty.DIFFICULT: 2,
            TopicDifficulty.VERY_DIFFICULT: 3,
        }
        topic_difficulties = {
            subtopic: difficulty_rank[self.analyze_topic_difficulty(df, subtopic)]
            for subtopic in subtopics
        }
        
        # Sort subtopics from easiest to hardest
        return sorted(subtopics, key=lambda x: topic_difficulties[x])

    def _analyze_learning_trends(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze overall learning trends across the dataset."""
        return {
            'sentiment_trend': stats.linregress(
                range(len(df)),
                df['sentiment']
            )._asdict(),
            'confusion_trend': stats.linregress(
                range(len(df)),
                df['shows_confusion']
            )._asdict(),
            'engagement_progression': self._calculate_engagement_progression(df)
        }

    def _calculate_engagement_progression(self, df: pd.DataFrame) -> Dict[str, float]:
        """Calculate how student engagement changes over time."""
        df = df.copy()  # Avoid mutating the caller's DataFrame
        df['week'] = df['timestamp'].dt.isocalendar().week
        weekly_engagement = df.groupby('week').agg({
            'is_question': 'mean',
            'shows_confusion': 'mean',
            'is_followup': 'mean',
            'sentiment': 'mean'
        })
        
        # With fewer than two weeks of data a regression is undefined; report flat trends
        if len(weekly_engagement) < 2:
            return {k: 0.0 for k in ('question_trend', 'confusion_trend', 'follow_up_trend', 'sentiment_trend')}
        
        return {
            'question_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_question']
            ).slope,
            'confusion_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['shows_confusion']
            ).slope,
            'follow_up_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_followup']
            ).slope,
            'sentiment_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['sentiment']
            ).slope
        }

    def _generate_recommendations(self, report: Dict[str, Any]) -> List[str]:
        """Generate actionable recommendations based on the analysis."""
        recommendations = []
        
        # Analyze difficulty distribution
        difficult_topics = [
            topic for topic, data in report['topics'].items()
            if data['insights'].difficulty_level in 
            (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT)
        ]
        
        if difficult_topics:
            recommendations.append(
                f"Consider providing additional resources for challenging topics: {', '.join(difficult_topics)}"
            )
        
        # Analyze student engagement
        avg_engagement = np.mean([
            progress['questions_asked'] / progress['total_messages']
            for progress in report['student_progress'].values()
        ])
        
        if avg_engagement < 0.3:
            recommendations.append(
                "Implement more interactive elements to increase student engagement"
            )
        
        # Analyze temporal patterns
        peak_hours = list(report['temporal_analysis']['peak_activity_hours'].keys())
        recommendations.append(
            f"Consider scheduling additional support during peak activity hours: {peak_hours}"
        )
        
        # Analyze learning trends. _analyze_learning_trends stores the full
        # linregress result as a dict (via _asdict()), so extract the slope first
        sentiment_trend = report.get('temporal_analysis', {}).get('learning_trends', {}).get('sentiment_trend')
        if isinstance(sentiment_trend, dict):
            sentiment_trend = sentiment_trend.get('slope')
        if isinstance(sentiment_trend, (int, float)):
            if sentiment_trend < 0:
                recommendations.append(
                    "Review teaching approach to address declining student satisfaction"
                )
        else:
            logger.warning("Unexpected type for sentiment_trend: %s", type(sentiment_trend))
        
        return recommendations

class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, TopicDifficulty):
            return obj.value
        if isinstance(obj, TopicInsights):
            return obj.to_dict()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)

def convert_insights_to_dict(report):
    for main_topic, data in report['topics'].items():
        if isinstance(data['insights'], TopicInsights):
            data['insights'] = data['insights'].to_dict()
        for subtopic, subdata in data['subtopics'].items():
            if isinstance(subdata['insights'], TopicInsights):
                subdata['insights'] = subdata['insights'].to_dict()

if __name__ == "__main__":
    # Load chat history data
    with open('sample_files/chat_history_corpus.json', 'r', encoding="utf-8") as file:
        chat_history = json.load(file)
    
    # Initialize analytics system
    analytics = PreClassAnalytics()
    
    # Generate comprehensive report
    report = analytics.generate_comprehensive_report(chat_history)
    
    # CustomJSONEncoder already handles TopicInsights, enums, numpy types and
    # datetimes, so convert_insights_to_dict(report) is not needed before dumping
    print(json.dumps(report, indent=4, cls=CustomJSONEncoder))