import pandas as pd
import numpy as np
from datetime import datetime
from typing import List, Dict, Any, Tuple
import spacy
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import networkx as nx
from scipy import stats
import logging
import json
from dataclasses import dataclass
from enum import Enum

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TopicDifficulty(Enum):
    EASY = "easy"
    MODERATE = "moderate"
    DIFFICULT = "difficult"
    VERY_DIFFICULT = "very_difficult"


@dataclass
class QuestionMetrics:
    complexity_score: float
    follow_up_count: int
    clarification_count: int
    time_spent: float
    sentiment_score: float


@dataclass
class TopicInsights:
    difficulty_level: TopicDifficulty
    common_confusion_points: List[str]
    question_patterns: List[str]
    time_distribution: Dict[str, float]
    engagement_metrics: Dict[str, float]
    recommended_focus_areas: List[str]

    def to_dict(self):
        return {
            "difficulty_level": self.difficulty_level.value,  # Convert enum to its value
            "common_confusion_points": self.common_confusion_points,
            "question_patterns": self.question_patterns,
            "time_distribution": {str(k): v for k, v in self.time_distribution.items()},
            "engagement_metrics": self.engagement_metrics,
            "recommended_focus_areas": self.recommended_focus_areas,
        }


class PreClassAnalytics:
    def __init__(self, nlp_model: str = "en_core_web_lg"):
        """Initialize the analytics system with necessary components."""
        self.nlp = spacy.load(nlp_model)
        self.question_indicators = {
            "what", "why", "how", "when", "where", "which", "who",
            "whose", "whom", "can", "could", "would", "will", "explain"
        }
        self.confusion_indicators = {
            "confused", "don't understand", "unclear", "not clear", "stuck",
            "difficult", "hard", "help", "explain again"
        }
        self.follow_up_indicators = {
            "also", "another", "additionally", "furthermore", "moreover",
            "besides", "related", "similarly", "again"
        }

    def preprocess_chat_history(self, chat_history: List[Dict]) -> pd.DataFrame:
        """Convert chat history to a DataFrame with enhanced features."""
        messages = []
        for chat in chat_history:
            user_id = chat['user_id']['$oid']
            for msg in chat['messages']:
                try:
                    # Ensure the timestamp is in the correct format
                    if isinstance(msg['timestamp'], dict) and '$date' in msg['timestamp']:
                        timestamp = pd.to_datetime(msg['timestamp']['$date'])
                    elif isinstance(msg['timestamp'], str):
                        timestamp = pd.to_datetime(msg['timestamp'])
                    else:
                        raise ValueError("Invalid timestamp format")
                except Exception as e:
                    logger.warning(f"Error parsing timestamp: {msg['timestamp']}, error: {e}")
                    timestamp = pd.NaT  # Use NaT (Not a Time) for invalid timestamps

                messages.append({
                    'user_id': user_id,
                    'timestamp': timestamp,
                    'prompt': msg['prompt'],
                    'response': msg['response'],
                    'is_question': any(q in msg['prompt'].lower() for q in self.question_indicators),
                    'shows_confusion': any(c in msg['prompt'].lower() for c in self.confusion_indicators),
                    'is_followup': any(f in msg['prompt'].lower() for f in self.follow_up_indicators)
                })

        df = pd.DataFrame(messages)
        df['sentiment'] = df['prompt'].apply(lambda x: TextBlob(x).sentiment.polarity)
        return df

    def extract_topic_hierarchies(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """Extract hierarchical topic relationships from conversations."""
        topic_hierarchy = defaultdict(list)

        for _, row in df.iterrows():
            doc = self.nlp(row['prompt'])

            # Extract main topics and subtopics using noun chunks and dependencies
            main_topics = []
            subtopics = []

            for chunk in doc.noun_chunks:
                if chunk.root.dep_ in ('nsubj', 'dobj'):
                    main_topics.append(chunk.text.lower())
                else:
                    subtopics.append(chunk.text.lower())

            # Build hierarchy
            for main_topic in main_topics:
                topic_hierarchy[main_topic].extend(subtopics)

        # Clean and deduplicate
        return {k: list(set(v)) for k, v in topic_hierarchy.items()}

    def analyze_topic_difficulty(self, df: pd.DataFrame, topic: str) -> TopicDifficulty:
        """Determine topic difficulty based on various metrics."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False)]

        # Calculate difficulty indicators
        confusion_rate = topic_msgs['shows_confusion'].mean()
        question_rate = topic_msgs['is_question'].mean()
        follow_up_rate = topic_msgs['is_followup'].mean()
        avg_sentiment = topic_msgs['sentiment'].mean()

        # Calculate composite difficulty score
        difficulty_score = (
            confusion_rate * 0.4 +
            question_rate * 0.3 +
            follow_up_rate * 0.2 +
            (1 - (avg_sentiment + 1) / 2) * 0.1
        )

        # Map score to difficulty level
        if difficulty_score < 0.3:
            return TopicDifficulty.EASY
        elif difficulty_score < 0.5:
            return TopicDifficulty.MODERATE
        elif difficulty_score < 0.7:
            return TopicDifficulty.DIFFICULT
        else:
            return TopicDifficulty.VERY_DIFFICULT

    def identify_confusion_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Identify common patterns in student confusion."""
        confused_msgs = df[
            (df['prompt'].str.contains(topic, case=False)) &
            (df['shows_confusion'])
        ]['prompt']

        patterns = []
        for msg in confused_msgs:
            doc = self.nlp(msg)
            # Extract key phrases around confusion indicators
            for sent in doc.sents:
                for token in sent:
                    if token.text.lower() in self.confusion_indicators:
                        # Get context window around confusion indicator
                        context = sent.text
                        patterns.append(context)

        # Group similar patterns
        if patterns:
            vectorizer = TfidfVectorizer(ngram_range=(1, 3))
            tfidf_matrix = vectorizer.fit_transform(patterns)
            similarity_matrix = cosine_similarity(tfidf_matrix)

            # Cluster similar patterns
            G = nx.Graph()
            for i in range(len(patterns)):
                for j in range(i + 1, len(patterns)):
                    if similarity_matrix[i][j] > 0.5:  # Similarity threshold
                        G.add_edge(i, j)

            # Extract representative patterns from each cluster
            clusters = list(nx.connected_components(G))
            return [patterns[min(cluster)] for cluster in clusters]

        return []

    def analyze_question_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Analyze patterns in student questions about the topic."""
        topic_questions = df[
            (df['prompt'].str.contains(topic, case=False)) &
            (df['is_question'])
        ]['prompt']

        question_types = defaultdict(list)
        for question in topic_questions:
            doc = self.nlp(question)

            # Categorize questions
            if any(token.text.lower() in {"what", "define", "explain"} for token in doc):
                question_types["conceptual"].append(question)
            elif any(token.text.lower() in {"how", "steps", "process"} for token in doc):
                question_types["procedural"].append(question)
            elif any(token.text.lower() in {"why", "reason", "because"} for token in doc):
                question_types["reasoning"].append(question)
            else:
                question_types["other"].append(question)

        # Extract patterns from each category
        patterns = []
        for category, questions in question_types.items():
            if questions:
                vectorizer = TfidfVectorizer(ngram_range=(1, 3))
                tfidf_matrix = vectorizer.fit_transform(questions)

                # Get most representative questions
                feature_array = np.mean(tfidf_matrix.toarray(), axis=0)
                tfidf_sorting = np.argsort(feature_array)[::-1]
                features = vectorizer.get_feature_names_out()
                patterns.append(f"{category}: {' '.join(features[tfidf_sorting[:3]])}")
        return patterns

    def analyze_time_distribution(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Analyze time spent on different aspects of the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False)].copy()

        if len(topic_msgs) < 2:
            return {}

        topic_msgs['time_diff'] = topic_msgs['timestamp'].diff()

        # Calculate time distribution
        distribution = {
            'total_time': topic_msgs['time_diff'].sum().total_seconds() / 60,
            'avg_time_per_message': topic_msgs['time_diff'].mean().total_seconds() / 60,
            'max_time_gap': topic_msgs['time_diff'].max().total_seconds() / 60,
            'time_spent_on_questions': topic_msgs[topic_msgs['is_question']]['time_diff'].sum().total_seconds() / 60,
            'time_spent_on_confusion': topic_msgs[topic_msgs['shows_confusion']]['time_diff'].sum().total_seconds() / 60
        }

        return distribution

    def calculate_engagement_metrics(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Calculate student engagement metrics for the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False)]

        metrics = {
            'message_count': len(topic_msgs),
            'question_ratio': topic_msgs['is_question'].mean(),
            'confusion_ratio': topic_msgs['shows_confusion'].mean(),
            'follow_up_ratio': topic_msgs['is_followup'].mean(),
            'avg_sentiment': topic_msgs['sentiment'].mean(),
            'engagement_score': 0.0  # Will be calculated below
        }

        # Calculate engagement score
        metrics['engagement_score'] = (
            metrics['message_count'] * 0.3 +
            metrics['question_ratio'] * 0.25 +
            metrics['follow_up_ratio'] * 0.25 +
            (metrics['avg_sentiment'] + 1) / 2 * 0.2  # Normalize sentiment to 0-1
        )

        return metrics

    def generate_topic_insights(self, df: pd.DataFrame, topic: str) -> TopicInsights:
        """Generate comprehensive insights for a topic."""
        difficulty = self.analyze_topic_difficulty(df, topic)
        confusion_points = self.identify_confusion_patterns(df, topic)
        question_patterns = self.analyze_question_patterns(df, topic)
        time_distribution = self.analyze_time_distribution(df, topic)
        engagement_metrics = self.calculate_engagement_metrics(df, topic)

        # Generate recommended focus areas based on insights
        focus_areas = []

        if difficulty in (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT):
            focus_areas.append("Fundamental concept reinforcement needed")

        if confusion_points:
            focus_areas.append(f"Address common confusion around: {', '.join(confusion_points[:3])}")

        if engagement_metrics['confusion_ratio'] > 0.3:
            focus_areas.append("Consider alternative teaching approaches")

        if time_distribution.get('time_spent_on_questions', 0) > time_distribution.get('total_time', 0) * 0.5:
            focus_areas.append("More practical examples or demonstrations needed")

        return TopicInsights(
            difficulty_level=difficulty,
            common_confusion_points=confusion_points,
            question_patterns=question_patterns,
            time_distribution=time_distribution,
            engagement_metrics=engagement_metrics,
            recommended_focus_areas=focus_areas
        )

    def analyze_student_progress(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze individual student progress and learning patterns."""
        student_progress = {}

        for student_id in df['user_id'].unique():
            student_msgs = df[df['user_id'] == student_id]

            # Calculate student-specific metrics
            progress = {
                'total_messages': len(student_msgs),
                'questions_asked': student_msgs['is_question'].sum(),
                'confusion_instances': student_msgs['shows_confusion'].sum(),
                'avg_sentiment': student_msgs['sentiment'].mean(),
                'topic_engagement': {},
                'learning_pattern': self._identify_learning_pattern(student_msgs)
            }

            # Analyze topic-specific engagement
            topics = self.extract_topic_hierarchies(student_msgs)
            for topic in topics:
                topic_msgs = student_msgs[student_msgs['prompt'].str.contains(topic, case=False)]
                progress['topic_engagement'][topic] = {
                    'message_count': len(topic_msgs),
                    'confusion_rate': topic_msgs['shows_confusion'].mean(),
                    # linregress needs at least two points; fall back to a flat trend otherwise
                    'sentiment_trend': (
                        stats.linregress(
                            range(len(topic_msgs)),
                            topic_msgs['sentiment']
                        ).slope
                        if len(topic_msgs) > 1 else 0.0
                    )
                }

            student_progress[student_id] = progress

        return student_progress

    def _identify_learning_pattern(self, student_msgs: pd.DataFrame) -> str:
        """Identify a student's learning pattern based on their interaction style."""
        # Calculate key metrics
        question_ratio = student_msgs['is_question'].mean()
        confusion_ratio = student_msgs['shows_confusion'].mean()
        follow_up_ratio = student_msgs['is_followup'].mean()
        sentiment_trend = (
            stats.linregress(
                range(len(student_msgs)),
                student_msgs['sentiment']
            ).slope
            if len(student_msgs) > 1 else 0.0
        )

        # Identify pattern
        if question_ratio > 0.6:
            return "Inquisitive Learner"
        elif confusion_ratio > 0.4:
            return "Needs Additional Support"
        elif follow_up_ratio > 0.5:
            return "Deep Dive Learner"
        elif sentiment_trend > 0:
            return "Progressive Learner"
        else:
            return "Steady Learner"

    def generate_comprehensive_report(self, chat_history: List[Dict]) -> Dict[str, Any]:
        """Generate a comprehensive analytics report."""
        # Preprocess chat history
        df = self.preprocess_chat_history(chat_history)

        # Extract topics
        topics = self.extract_topic_hierarchies(df)

        report = {
            'topics': {},
            'student_progress': self.analyze_student_progress(df),
            'overall_metrics': {
                'total_conversations': len(df),
                'unique_students': df['user_id'].nunique(),
                'avg_sentiment': df['sentiment'].mean(),
                'most_discussed_topics': Counter(
                    topic for topics_list in topics.values()
                    for topic in topics_list
                ).most_common(5)
            }
        }

        # Generate topic-specific insights
        for main_topic, subtopics in topics.items():
            subtopic_insights = {}
            for subtopic in subtopics:
                subtopic_insights[subtopic] = {
                    'insights': self.generate_topic_insights(df, subtopic),
                    'related_topics': [t for t in subtopics if t != subtopic],
                    'student_engagement': {
                        student_id: self.calculate_engagement_metrics(
                            df[df['user_id'] == student_id],
                            subtopic
                        )
                        for student_id in df['user_id'].unique()
                    }
                }

            report['topics'][main_topic] = {
                'insights': self.generate_topic_insights(df, main_topic),
                'subtopics': subtopic_insights,
                'topic_relationships': {
                    'hierarchy_depth': len(subtopics),
                    'connection_strength': self._calculate_topic_connections(df, main_topic, subtopics),
                    'progression_path': self._identify_topic_progression(df, main_topic, subtopics)
                }
            }

        # Add temporal analysis (group keys are converted to plain str/int so the
        # final report can be serialized with json.dumps)
        report['temporal_analysis'] = {
            'daily_engagement': df.groupby(df['timestamp'].dt.strftime('%Y-%m-%d')).agg({
                'user_id': 'count',
                'is_question': 'sum',
                'shows_confusion': 'sum',
                'sentiment': 'mean'
            }).to_dict(),
            'peak_activity_hours': {
                int(hour): int(count)
                for hour, count in df.groupby(df['timestamp'].dt.hour)['user_id'].count().nlargest(3).items()
            },
            'learning_trends': self._analyze_learning_trends(df)
        }

        # Add recommendations
        report['recommendations'] = self._generate_recommendations(report)

        return report

    def _calculate_topic_connections(self, df: pd.DataFrame, main_topic: str,
                                     subtopics: List[str]) -> Dict[str, float]:
        """Calculate connection strength between topics based on co-occurrence."""
        connections = {}
        main_topic_msgs = df[df['prompt'].str.contains(main_topic, case=False)]

        for subtopic in subtopics:
            cooccurrence = df[
                df['prompt'].str.contains(main_topic, case=False) &
                df['prompt'].str.contains(subtopic, case=False)
            ].shape[0]

            connection_strength = cooccurrence / len(main_topic_msgs) if len(main_topic_msgs) > 0 else 0
            connections[subtopic] = connection_strength

        return connections

    def _identify_topic_progression(self, df: pd.DataFrame, main_topic: str,
                                    subtopics: List[str]) -> List[str]:
        """Identify an optimal topic progression path based on student interactions."""
        # Rank difficulty levels numerically; sorting the enum's string values
        # would order them alphabetically rather than by difficulty.
        difficulty_order = {
            TopicDifficulty.EASY: 0,
            TopicDifficulty.MODERATE: 1,
            TopicDifficulty.DIFFICULT: 2,
            TopicDifficulty.VERY_DIFFICULT: 3
        }

        topic_difficulties = {}
        for subtopic in subtopics:
            difficulty = self.analyze_topic_difficulty(df, subtopic)
            topic_difficulties[subtopic] = difficulty_order[difficulty]

        # Sort subtopics from easiest to hardest
        return sorted(subtopics, key=lambda x: topic_difficulties[x])

    def _analyze_learning_trends(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze overall learning trends across the dataset."""
        return {
            'sentiment_trend': stats.linregress(
                range(len(df)),
                df['sentiment']
            )._asdict(),
            'confusion_trend': stats.linregress(
                range(len(df)),
                df['shows_confusion']
            )._asdict(),
            'engagement_progression': self._calculate_engagement_progression(df)
        }

    def _calculate_engagement_progression(self, df: pd.DataFrame) -> Dict[str, float]:
        """Calculate how student engagement changes over time."""
        df['week'] = df['timestamp'].dt.isocalendar().week

        weekly_engagement = df.groupby('week').agg({
            'is_question': 'mean',
            'shows_confusion': 'mean',
            'is_followup': 'mean',
            'sentiment': 'mean'
        })

        return {
            'question_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_question']
            ).slope,
            'confusion_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['shows_confusion']
            ).slope,
            'follow_up_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_followup']
            ).slope,
            'sentiment_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['sentiment']
            ).slope
        }

    def _generate_recommendations(self, report: Dict[str, Any]) -> List[str]:
        """Generate actionable recommendations based on the analysis."""
        recommendations = []

        # Analyze difficulty distribution
        difficult_topics = [
            topic for topic, data in report['topics'].items()
            if data['insights'].difficulty_level in (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT)
        ]
        if difficult_topics:
            recommendations.append(
                f"Consider providing additional resources for challenging topics: {', '.join(difficult_topics)}"
            )

        # Analyze student engagement
        avg_engagement = np.mean([
            progress['questions_asked'] / progress['total_messages']
            for progress in report['student_progress'].values()
        ])
        if avg_engagement < 0.3:
            recommendations.append(
                "Implement more interactive elements to increase student engagement"
            )

        # Analyze temporal patterns
        peak_hours = list(report['temporal_analysis']['peak_activity_hours'].keys())
        recommendations.append(
            f"Consider scheduling additional support during peak activity hours: {peak_hours}"
        )

        # Analyze learning trends: sentiment_trend is the full linregress result
        # (a dict produced by _asdict()), so extract its slope before comparing.
        sentiment_trend = report.get('temporal_analysis', {}).get('learning_trends', {}).get('sentiment_trend', None)
        if isinstance(sentiment_trend, dict):
            sentiment_trend = sentiment_trend.get('slope')
        if isinstance(sentiment_trend, (int, float)):
            if sentiment_trend < 0:
                recommendations.append(
                    "Review teaching approach to address declining student satisfaction"
                )
        elif sentiment_trend is not None:
            logger.warning(f"Unexpected type for sentiment_trend: {type(sentiment_trend)}")

        return recommendations


class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, TopicDifficulty):
            return obj.value
        if isinstance(obj, TopicInsights):
            return obj.to_dict()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)


def convert_insights_to_dict(report):
    for main_topic, data in report['topics'].items():
        if isinstance(data['insights'], TopicInsights):
            data['insights'] = data['insights'].to_dict()
        for subtopic, subdata in data['subtopics'].items():
            if isinstance(subdata['insights'], TopicInsights):
                subdata['insights'] = subdata['insights'].to_dict()


if __name__ == "__main__":
    # Load chat history data
    chat_history = None
    with open('sample_files/chat_history_corpus.json', 'r', encoding="utf-8") as file:
        chat_history = json.load(file)

    # Initialize analytics system
    analytics = PreClassAnalytics()

    # Generate comprehensive report
    report = analytics.generate_comprehensive_report(chat_history)

    # Convert insights to dictionary
    # convert_insights_to_dict(report)

    print(json.dumps(report, indent=4, cls=CustomJSONEncoder))
    # print(report)
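
# Illustrative sketch of the input shape that preprocess_chat_history() expects,
# inferred from the parsing logic above (MongoDB-style export with "$oid" and
# "$date" wrappers). The field values are hypothetical; the actual
# chat_history_corpus.json may differ in detail.
#
# [
#     {
#         "user_id": {"$oid": "64a1f0c2e4b0a1b2c3d4e5f6"},
#         "messages": [
#             {
#                 "timestamp": {"$date": "2024-01-15T10:30:00Z"},
#                 "prompt": "Why does gradient descent get stuck in local minima?",
#                 "response": "Gradient descent follows the local slope, so ..."
#             }
#         ]
#     }
# ]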