import pandas as pd
import numpy as np
from datetime import datetime
from typing import List, Dict, Any, Tuple
import spacy
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import networkx as nx
from scipy import stats
import logging
import json
from dataclasses import dataclass
from enum import Enum

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TopicDifficulty(Enum):
    EASY = "easy"
    MODERATE = "moderate"
    DIFFICULT = "difficult"
    VERY_DIFFICULT = "very_difficult"

@dataclass
class QuestionMetrics:
    complexity_score: float
    follow_up_count: int
    clarification_count: int
    time_spent: float
    sentiment_score: float


@dataclass
class TopicInsights:
    difficulty_level: TopicDifficulty
    common_confusion_points: List[str]
    question_patterns: List[str]
    time_distribution: Dict[str, float]
    engagement_metrics: Dict[str, float]
    recommended_focus_areas: List[str]

    def to_dict(self):
        return {
            "difficulty_level": self.difficulty_level.value,  # Convert enum to its value
            "common_confusion_points": self.common_confusion_points,
            "question_patterns": self.question_patterns,
            "time_distribution": {str(k): v for k, v in self.time_distribution.items()},
            "engagement_metrics": self.engagement_metrics,
            "recommended_focus_areas": self.recommended_focus_areas,
        }
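
# Illustrative shape of the chat_history input, inferred from preprocess_chat_history
# below (field names follow the Mongo-style export it parses; the values are made up):
#
#   [
#       {
#           "user_id": {"$oid": "64f1c0ffee..."},
#           "messages": [
#               {
#                   "timestamp": {"$date": "2024-01-15T10:30:00Z"},
#                   "prompt": "Can you explain backpropagation?",
#                   "response": "Backpropagation is ..."
#               }
#           ]
#       }
#   ]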

class PreClassAnalytics:
    def __init__(self, nlp_model: str = "en_core_web_lg"):
        """Initialize the analytics system with necessary components."""
        self.nlp = spacy.load(nlp_model)
        self.question_indicators = {
            "what", "why", "how", "when", "where", "which", "who",
            "whose", "whom", "can", "could", "would", "will", "explain"
        }
        self.confusion_indicators = {
            "confused", "don't understand", "unclear", "not clear",
            "stuck", "difficult", "hard", "help", "explain again"
        }
        self.follow_up_indicators = {
            "also", "another", "additionally", "furthermore", "moreover",
            "besides", "related", "similarly", "again"
        }

    def preprocess_chat_history(self, chat_history: List[Dict]) -> pd.DataFrame:
        """Convert chat history to DataFrame with enhanced features."""
        messages = []
        for chat in chat_history:
            user_id = chat['user_id']['$oid']
            for msg in chat['messages']:
                try:
                    # Ensure the timestamp is in the correct format
                    if isinstance(msg['timestamp'], dict) and '$date' in msg['timestamp']:
                        timestamp = pd.to_datetime(msg['timestamp']['$date'])
                    elif isinstance(msg['timestamp'], str):
                        timestamp = pd.to_datetime(msg['timestamp'])
                    else:
                        raise ValueError("Invalid timestamp format")
                except Exception as e:
                    logger.warning(f"Error parsing timestamp: {msg['timestamp']}, error: {e}")
                    timestamp = pd.NaT  # Use NaT (Not a Time) for invalid timestamps
                messages.append({
                    'user_id': user_id,
                    'timestamp': timestamp,
                    'prompt': msg['prompt'],
                    'response': msg['response'],
                    'is_question': any(q in msg['prompt'].lower() for q in self.question_indicators),
                    'shows_confusion': any(c in msg['prompt'].lower() for c in self.confusion_indicators),
                    'is_followup': any(f in msg['prompt'].lower() for f in self.follow_up_indicators)
                })
        df = pd.DataFrame(messages)
        df['sentiment'] = df['prompt'].apply(lambda x: TextBlob(x).sentiment.polarity)
        return df

    def extract_topic_hierarchies(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """Extract hierarchical topic relationships from conversations."""
        topic_hierarchy = defaultdict(list)
        for _, row in df.iterrows():
            doc = self.nlp(row['prompt'])
            # Extract main topics and subtopics using noun chunks and dependencies
            main_topics = []
            subtopics = []
            for chunk in doc.noun_chunks:
                if chunk.root.dep_ in ('nsubj', 'dobj'):
                    main_topics.append(chunk.text.lower())
                else:
                    subtopics.append(chunk.text.lower())
            # Build hierarchy
            for main_topic in main_topics:
                topic_hierarchy[main_topic].extend(subtopics)
        # Clean and deduplicate
        return {k: list(set(v)) for k, v in topic_hierarchy.items()}
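
    # Illustrative example (output depends on the spaCy parse): for the prompt
    # "Can you explain backpropagation in neural networks?", "backpropagation" is a
    # direct-object chunk and "neural networks" a prepositional one, so the hierarchy
    # contains roughly {"backpropagation": ["neural networks"]} (pronoun chunks such
    # as "you" also land in the main-topic keys, since nsubj chunks are not filtered).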

    def analyze_topic_difficulty(self, df: pd.DataFrame, topic: str) -> TopicDifficulty:
        """Determine topic difficulty based on various metrics."""
        # Match topics literally (regex=False) so chunks with special characters don't break the filter
        topic_msgs = df[df['prompt'].str.contains(topic, case=False, regex=False)]
        # Calculate difficulty indicators
        confusion_rate = topic_msgs['shows_confusion'].mean()
        question_rate = topic_msgs['is_question'].mean()
        follow_up_rate = topic_msgs['is_followup'].mean()
        avg_sentiment = topic_msgs['sentiment'].mean()
        # Calculate composite difficulty score
        difficulty_score = (
            confusion_rate * 0.4 +
            question_rate * 0.3 +
            follow_up_rate * 0.2 +
            (1 - (avg_sentiment + 1) / 2) * 0.1
        )
        # Map score to difficulty level
        if difficulty_score < 0.3:
            return TopicDifficulty.EASY
        elif difficulty_score < 0.5:
            return TopicDifficulty.MODERATE
        elif difficulty_score < 0.7:
            return TopicDifficulty.DIFFICULT
        else:
            return TopicDifficulty.VERY_DIFFICULT
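
    # Worked example of the scoring above: with confusion_rate=0.5, question_rate=0.4,
    # follow_up_rate=0.2 and avg_sentiment=-0.1, the score is
    # 0.5*0.4 + 0.4*0.3 + 0.2*0.2 + (1 - 0.45)*0.1 = 0.415, which maps to MODERATE.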

    def identify_confusion_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Identify common patterns in student confusion."""
        confused_msgs = df[
            (df['prompt'].str.contains(topic, case=False, regex=False)) &
            (df['shows_confusion'])
        ]['prompt']
        patterns = []
        for msg in confused_msgs:
            doc = self.nlp(msg)
            # Extract key phrases around confusion indicators
            for sent in doc.sents:
                for token in sent:
                    if token.text.lower() in self.confusion_indicators:
                        # Get context window around confusion indicator
                        context = sent.text
                        patterns.append(context)
        # Group similar patterns
        if patterns:
            vectorizer = TfidfVectorizer(ngram_range=(1, 3))
            tfidf_matrix = vectorizer.fit_transform(patterns)
            similarity_matrix = cosine_similarity(tfidf_matrix)
            # Cluster similar patterns
            G = nx.Graph()
            for i in range(len(patterns)):
                for j in range(i + 1, len(patterns)):
                    if similarity_matrix[i][j] > 0.5:  # Similarity threshold
                        G.add_edge(i, j)
            # Extract representative patterns from each cluster
            clusters = list(nx.connected_components(G))
            return [patterns[min(cluster)] for cluster in clusters]
        return []

    def analyze_question_patterns(self, df: pd.DataFrame, topic: str) -> List[str]:
        """Analyze patterns in student questions about the topic."""
        topic_questions = df[
            (df['prompt'].str.contains(topic, case=False, regex=False)) &
            (df['is_question'])
        ]['prompt']
        question_types = defaultdict(list)
        for question in topic_questions:
            doc = self.nlp(question)
            # Categorize questions
            if any(token.text.lower() in {"what", "define", "explain"} for token in doc):
                question_types["conceptual"].append(question)
            elif any(token.text.lower() in {"how", "steps", "process"} for token in doc):
                question_types["procedural"].append(question)
            elif any(token.text.lower() in {"why", "reason", "because"} for token in doc):
                question_types["reasoning"].append(question)
            else:
                question_types["other"].append(question)
        # Extract patterns from each category
        patterns = []
        for category, questions in question_types.items():
            if questions:
                vectorizer = TfidfVectorizer(ngram_range=(1, 3))
                tfidf_matrix = vectorizer.fit_transform(questions)
                # Get most representative questions
                feature_array = np.mean(tfidf_matrix.toarray(), axis=0)
                tfidf_sorting = np.argsort(feature_array)[::-1]
                features = vectorizer.get_feature_names_out()
                patterns.append(f"{category}: {' '.join(features[tfidf_sorting[:3]])}")
        return patterns
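
    # Each returned entry is a category label followed by its three highest-weight
    # tf-idf n-grams, e.g. "conceptual: explain backpropagation what" (illustrative only).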

    def analyze_time_distribution(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Analyze time spent on different aspects of the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False, regex=False)].copy()
        if len(topic_msgs) < 2:
            return {}
        topic_msgs['time_diff'] = topic_msgs['timestamp'].diff()
        # Calculate time distribution (all values in minutes)
        distribution = {
            'total_time': topic_msgs['time_diff'].sum().total_seconds() / 60,
            'avg_time_per_message': topic_msgs['time_diff'].mean().total_seconds() / 60,
            'max_time_gap': topic_msgs['time_diff'].max().total_seconds() / 60,
            'time_spent_on_questions': topic_msgs[topic_msgs['is_question']]['time_diff'].sum().total_seconds() / 60,
            'time_spent_on_confusion': topic_msgs[topic_msgs['shows_confusion']]['time_diff'].sum().total_seconds() / 60
        }
        return distribution

    def calculate_engagement_metrics(self, df: pd.DataFrame, topic: str) -> Dict[str, float]:
        """Calculate student engagement metrics for the topic."""
        topic_msgs = df[df['prompt'].str.contains(topic, case=False, regex=False)]
        metrics = {
            'message_count': len(topic_msgs),
            'question_ratio': topic_msgs['is_question'].mean(),
            'confusion_ratio': topic_msgs['shows_confusion'].mean(),
            'follow_up_ratio': topic_msgs['is_followup'].mean(),
            'avg_sentiment': topic_msgs['sentiment'].mean(),
            'engagement_score': 0.0  # Will be calculated below
        }
        # Calculate engagement score
        metrics['engagement_score'] = (
            metrics['message_count'] * 0.3 +
            metrics['question_ratio'] * 0.25 +
            metrics['follow_up_ratio'] * 0.25 +
            (metrics['avg_sentiment'] + 1) / 2 * 0.2  # Normalize sentiment to 0-1
        )
        return metrics
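
    # Worked example of the engagement score: message_count=10, question_ratio=0.4,
    # follow_up_ratio=0.2, avg_sentiment=0.1 gives 10*0.3 + 0.4*0.25 + 0.2*0.25 +
    # (1.1/2)*0.2 = 3.26. Note that the raw message count is not normalized, so it
    # dominates the weighted sum for heavily discussed topics.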

    def generate_topic_insights(self, df: pd.DataFrame, topic: str) -> TopicInsights:
        """Generate comprehensive insights for a topic."""
        difficulty = self.analyze_topic_difficulty(df, topic)
        confusion_points = self.identify_confusion_patterns(df, topic)
        question_patterns = self.analyze_question_patterns(df, topic)
        time_distribution = self.analyze_time_distribution(df, topic)
        engagement_metrics = self.calculate_engagement_metrics(df, topic)
        # Generate recommended focus areas based on insights
        focus_areas = []
        if difficulty in (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT):
            focus_areas.append("Fundamental concept reinforcement needed")
        if confusion_points:
            focus_areas.append(f"Address common confusion around: {', '.join(confusion_points[:3])}")
        if engagement_metrics['confusion_ratio'] > 0.3:
            focus_areas.append("Consider alternative teaching approaches")
        if time_distribution.get('time_spent_on_questions', 0) > time_distribution.get('total_time', 0) * 0.5:
            focus_areas.append("More practical examples or demonstrations needed")
        return TopicInsights(
            difficulty_level=difficulty,
            common_confusion_points=confusion_points,
            question_patterns=question_patterns,
            time_distribution=time_distribution,
            engagement_metrics=engagement_metrics,
            recommended_focus_areas=focus_areas
        )

    def analyze_student_progress(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze individual student progress and learning patterns."""
        student_progress = {}
        for student_id in df['user_id'].unique():
            student_msgs = df[df['user_id'] == student_id]
            # Calculate student-specific metrics
            progress = {
                'total_messages': len(student_msgs),
                'questions_asked': student_msgs['is_question'].sum(),
                'confusion_instances': student_msgs['shows_confusion'].sum(),
                'avg_sentiment': student_msgs['sentiment'].mean(),
                'topic_engagement': {},
                'learning_pattern': self._identify_learning_pattern(student_msgs)
            }
            # Analyze topic-specific engagement
            topics = self.extract_topic_hierarchies(student_msgs)
            for topic in topics:
                topic_msgs = student_msgs[student_msgs['prompt'].str.contains(topic, case=False, regex=False)]
                progress['topic_engagement'][topic] = {
                    'message_count': len(topic_msgs),
                    'confusion_rate': topic_msgs['shows_confusion'].mean(),
                    # linregress needs at least two points; fall back to 0.0 for single-message topics
                    'sentiment_trend': stats.linregress(
                        range(len(topic_msgs)),
                        topic_msgs['sentiment']
                    ).slope if len(topic_msgs) > 1 else 0.0
                }
            student_progress[student_id] = progress
        return student_progress

    def _identify_learning_pattern(self, student_msgs: pd.DataFrame) -> str:
        """Identify student's learning pattern based on their interaction style."""
        # Calculate key metrics
        question_ratio = student_msgs['is_question'].mean()
        confusion_ratio = student_msgs['shows_confusion'].mean()
        follow_up_ratio = student_msgs['is_followup'].mean()
        # linregress needs at least two observations to fit a slope
        sentiment_trend = stats.linregress(
            range(len(student_msgs)),
            student_msgs['sentiment']
        ).slope if len(student_msgs) > 1 else 0.0
        # Identify pattern
        if question_ratio > 0.6:
            return "Inquisitive Learner"
        elif confusion_ratio > 0.4:
            return "Needs Additional Support"
        elif follow_up_ratio > 0.5:
            return "Deep Dive Learner"
        elif sentiment_trend > 0:
            return "Progressive Learner"
        else:
            return "Steady Learner"

    def generate_comprehensive_report(self, chat_history: List[Dict]) -> Dict[str, Any]:
        """Generate a comprehensive analytics report."""
        # Preprocess chat history
        df = self.preprocess_chat_history(chat_history)
        # Extract topics
        topics = self.extract_topic_hierarchies(df)
        report = {
            'topics': {},
            'student_progress': self.analyze_student_progress(df),
            'overall_metrics': {
                'total_conversations': len(df),
                'unique_students': df['user_id'].nunique(),
                'avg_sentiment': df['sentiment'].mean(),
                'most_discussed_topics': Counter(
                    topic for topics_list in topics.values()
                    for topic in topics_list
                ).most_common(5)
            }
        }
        # Generate topic-specific insights
        for main_topic, subtopics in topics.items():
            subtopic_insights = {}
            for subtopic in subtopics:
                subtopic_insights[subtopic] = {
                    'insights': self.generate_topic_insights(df, subtopic),
                    'related_topics': [t for t in subtopics if t != subtopic],
                    'student_engagement': {
                        student_id: self.calculate_engagement_metrics(
                            df[df['user_id'] == student_id],
                            subtopic
                        )
                        for student_id in df['user_id'].unique()
                    }
                }
            report['topics'][main_topic] = {
                'insights': self.generate_topic_insights(df, main_topic),
                'subtopics': subtopic_insights,
                'topic_relationships': {
                    'hierarchy_depth': len(subtopics),
                    'connection_strength': self._calculate_topic_connections(df, main_topic, subtopics),
                    'progression_path': self._identify_topic_progression(df, main_topic, subtopics)
                }
            }
        # Add temporal analysis (date and hour keys are converted to str/int so the
        # report stays JSON-serializable)
        daily_engagement = df.groupby(df['timestamp'].dt.date).agg({
            'user_id': 'count',
            'is_question': 'sum',
            'shows_confusion': 'sum',
            'sentiment': 'mean'
        })
        report['temporal_analysis'] = {
            'daily_engagement': {
                column: {str(day): value for day, value in per_day.items()}
                for column, per_day in daily_engagement.to_dict().items()
            },
            'peak_activity_hours': {
                int(hour): int(count)
                for hour, count in df.groupby(df['timestamp'].dt.hour)['user_id'].count().nlargest(3).items()
            },
            'learning_trends': self._analyze_learning_trends(df)
        }
        # Add recommendations
        report['recommendations'] = self._generate_recommendations(report)
        return report

    def _calculate_topic_connections(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> Dict[str, float]:
        """Calculate connection strength between topics based on co-occurrence."""
        connections = {}
        main_topic_msgs = df[df['prompt'].str.contains(main_topic, case=False, regex=False)]
        for subtopic in subtopics:
            cooccurrence = df[
                df['prompt'].str.contains(main_topic, case=False, regex=False) &
                df['prompt'].str.contains(subtopic, case=False, regex=False)
            ].shape[0]
            connection_strength = cooccurrence / len(main_topic_msgs) if len(main_topic_msgs) > 0 else 0
            connections[subtopic] = connection_strength
        return connections

    def _identify_topic_progression(self, df: pd.DataFrame, main_topic: str, subtopics: List[str]) -> List[str]:
        """Identify optimal topic progression path based on student interactions."""
        # Rank difficulties by enum declaration order (EASY -> VERY_DIFFICULT) rather than
        # by the string value, which would sort alphabetically
        difficulty_order = list(TopicDifficulty)
        topic_difficulties = {}
        for subtopic in subtopics:
            difficulty = self.analyze_topic_difficulty(df, subtopic)
            topic_difficulties[subtopic] = difficulty_order.index(difficulty)
        # Sort subtopics from easiest to hardest
        return sorted(subtopics, key=lambda x: topic_difficulties[x])

    def _analyze_learning_trends(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze overall learning trends across the dataset."""
        return {
            'sentiment_trend': stats.linregress(
                range(len(df)),
                df['sentiment']
            )._asdict(),
            'confusion_trend': stats.linregress(
                range(len(df)),
                df['shows_confusion']
            )._asdict(),
            'engagement_progression': self._calculate_engagement_progression(df)
        }

    def _calculate_engagement_progression(self, df: pd.DataFrame) -> Dict[str, float]:
        """Calculate how student engagement changes over time."""
        df['week'] = df['timestamp'].dt.isocalendar().week
        weekly_engagement = df.groupby('week').agg({
            'is_question': 'mean',
            'shows_confusion': 'mean',
            'is_followup': 'mean',
            'sentiment': 'mean'
        })
        # linregress needs at least two weekly points to fit a trend
        if len(weekly_engagement) < 2:
            return {}
        return {
            'question_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_question']
            ).slope,
            'confusion_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['shows_confusion']
            ).slope,
            'follow_up_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['is_followup']
            ).slope,
            'sentiment_trend': stats.linregress(
                range(len(weekly_engagement)),
                weekly_engagement['sentiment']
            ).slope
        }

    def _generate_recommendations(self, report: Dict[str, Any]) -> List[str]:
        """Generate actionable recommendations based on the analysis."""
        recommendations = []
        # Analyze difficulty distribution
        difficult_topics = [
            topic for topic, data in report['topics'].items()
            if data['insights'].difficulty_level in
            (TopicDifficulty.DIFFICULT, TopicDifficulty.VERY_DIFFICULT)
        ]
        if difficult_topics:
            recommendations.append(
                f"Consider providing additional resources for challenging topics: {', '.join(difficult_topics)}"
            )
        # Analyze student engagement
        avg_engagement = np.mean([
            progress['questions_asked'] / progress['total_messages']
            for progress in report['student_progress'].values()
        ])
        if avg_engagement < 0.3:
            recommendations.append(
                "Implement more interactive elements to increase student engagement"
            )
        # Analyze temporal patterns
        peak_hours = list(report['temporal_analysis']['peak_activity_hours'].keys())
        recommendations.append(
            f"Consider scheduling additional support during peak activity hours: {peak_hours}"
        )
        # Analyze learning trends
        sentiment_trend = report.get('temporal_analysis', {}).get('learning_trends', {}).get('sentiment_trend')
        # _analyze_learning_trends stores linregress()._asdict(), so extract the slope before comparing
        if isinstance(sentiment_trend, dict):
            sentiment_trend = sentiment_trend.get('slope')
        if isinstance(sentiment_trend, (int, float)):
            if sentiment_trend < 0:
                recommendations.append(
                    "Review teaching approach to address declining student satisfaction"
                )
        elif sentiment_trend is not None:
            logger.warning(f"Unexpected type for sentiment_trend: {type(sentiment_trend)}")
        return recommendations


class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, TopicDifficulty):
            return obj.value
        if isinstance(obj, TopicInsights):
            return obj.to_dict()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)


def convert_insights_to_dict(report):
    for main_topic, data in report['topics'].items():
        if isinstance(data['insights'], TopicInsights):
            data['insights'] = data['insights'].to_dict()
        for subtopic, subdata in data['subtopics'].items():
            if isinstance(subdata['insights'], TopicInsights):
                subdata['insights'] = subdata['insights'].to_dict()


if __name__ == "__main__":
    # Load chat history data
    chat_history = None
    with open('sample_files/chat_history_corpus.json', 'r', encoding="utf-8") as file:
        chat_history = json.load(file)
    # Initialize analytics system
    analytics = PreClassAnalytics()
    # Generate comprehensive report
    report = analytics.generate_comprehensive_report(chat_history)
    # Convert insights to dictionary
    # convert_insights_to_dict(report)
    print(json.dumps(report, indent=4, cls=CustomJSONEncoder))
    # print(report)
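
    # Optional sketch (not part of the original flow): persist the report to disk as
    # well as printing it; the 'report.json' path here is an arbitrary example.
    # with open('report.json', 'w', encoding='utf-8') as out_file:
    #     json.dump(report, out_file, indent=4, cls=CustomJSONEncoder)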