ProfessorLeVesseur committed on
Commit
fc8abe1
·
verified ·
1 Parent(s): 14e49ed

Update data_processor.py

Browse files
Files changed (1) hide show
  1. data_processor.py +218 -31
data_processor.py CHANGED
@@ -1,14 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
  import os
3
- import re
4
  from huggingface_hub import InferenceClient
5
- # from graphviz import Digraph
6
 
7
  class DataProcessor:
8
  INTERVENTION_COLUMN = 'Did the intervention happen today?'
9
- ENGAGED_STR = 'Engaged (Respect, Responsibility, Effort)'
10
- PARTIALLY_ENGAGED_STR = 'Partially Engaged (about 50%)'
11
- NOT_ENGAGED_STR = 'Not Engaged (less than 50%)'
12
 
13
  def __init__(self, student_metrics_df=None):
14
  self.hf_api_key = os.getenv('HF_API_KEY')
@@ -17,6 +203,7 @@ class DataProcessor:
17
  self.client = InferenceClient(api_key=self.hf_api_key)
18
  self.student_metrics_df = student_metrics_df
19
 
 
20
  def read_excel(self, uploaded_file):
21
  return pd.read_excel(uploaded_file)
22
 
@@ -32,13 +219,6 @@ class DataProcessor:
32
  df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
33
  df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
34
  return df
35
-
36
- # def format_session_data(self, df):
37
- # df['Date of Session'] = pd.to_datetime(df['Date of Session'], errors='coerce').dt.date
38
- # df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
39
- # df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
40
- # df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
41
- # return df
42
 
43
  def safe_convert_to_time(self, series, format_str='%I:%M %p'):
44
  try:
@@ -87,6 +267,17 @@ class DataProcessor:
87
  'Total Number of Days Available': [total_days]
88
  })
89
 
 
 
 
 
 
 
 
 
 
 
 
90
  def compute_student_metrics(self, df):
91
  intervention_df = df[df[self.INTERVENTION_COLUMN].str.strip().str.lower() == 'yes']
92
  intervention_sessions_held = len(intervention_df)
@@ -98,7 +289,7 @@ class DataProcessor:
98
  student_data = intervention_df[[col]].copy()
99
  student_data[col] = student_data[col].fillna('Absent')
100
 
101
- attendance_values = student_data[col].apply(lambda x: 1 if x in [
102
  self.ENGAGED_STR,
103
  self.PARTIALLY_ENGAGED_STR,
104
  self.NOT_ENGAGED_STR
@@ -109,19 +300,16 @@ class DataProcessor:
109
  attendance_pct = round(attendance_pct)
110
 
111
  engagement_counts = {
112
- 'Engaged': 0,
113
- 'Partially Engaged': 0,
114
- 'Not Engaged': 0,
115
  'Absent': 0
116
  }
117
 
118
  for x in student_data[col]:
119
- if x == self.ENGAGED_STR:
120
- engagement_counts['Engaged'] += 1
121
- elif x == self.PARTIALLY_ENGAGED_STR:
122
- engagement_counts['Partially Engaged'] += 1
123
- elif x == self.NOT_ENGAGED_STR:
124
- engagement_counts['Not Engaged'] += 1
125
  else:
126
  engagement_counts['Absent'] += 1 # Count as Absent if not engaged
127
 
@@ -129,16 +317,16 @@ class DataProcessor:
129
  total_sessions = sum(engagement_counts.values())
130
 
131
  # Engagement (%)
132
- engagement_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
133
  engagement_pct = round(engagement_pct)
134
 
135
- engaged_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
136
  engaged_pct = round(engaged_pct)
137
 
138
- partially_engaged_pct = (engagement_counts['Partially Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
139
  partially_engaged_pct = round(partially_engaged_pct)
140
 
141
- not_engaged_pct = (engagement_counts['Not Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
142
  not_engaged_pct = round(not_engaged_pct)
143
 
144
  absent_pct = (engagement_counts['Absent'] / total_sessions * 100) if total_sessions > 0 else 0
@@ -155,11 +343,10 @@ class DataProcessor:
155
  'Attended ≥ 90%': attended_90,
156
  'Engagement ≥ 80%': engaged_80,
157
  'Attendance (%)': attendance_pct,
158
- # 'Attendance #': sessions_attended,
159
  'Engagement (%)': engagement_pct,
160
- 'Engaged (%)': engaged_pct,
161
- 'Partially Engaged (%)': partially_engaged_pct,
162
- 'Not Engaged (%)': not_engaged_pct,
163
  'Absent (%)': absent_pct
164
  }
165
 
@@ -167,7 +354,7 @@ class DataProcessor:
167
  student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
168
  student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
169
  return student_metrics_df
170
-
171
  def compute_average_metrics(self, student_metrics_df):
172
  # Calculate the attendance and engagement average percentages across students
173
  attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Calculate the average attendance percentage
 
1
+ # import pandas as pd
2
+ # import os
3
+ # import re
4
+ # from huggingface_hub import InferenceClient
5
+ # # from graphviz import Digraph
6
+
7
+ # class DataProcessor:
8
+ # INTERVENTION_COLUMN = 'Did the intervention happen today?'
9
+ # ENGAGED_STR = 'Engaged (Respect, Responsibility, Effort)'
10
+ # PARTIALLY_ENGAGED_STR = 'Partially Engaged (about 50%)'
11
+ # NOT_ENGAGED_STR = 'Not Engaged (less than 50%)'
12
+
13
+ # def __init__(self, student_metrics_df=None):
14
+ # self.hf_api_key = os.getenv('HF_API_KEY')
15
+ # if not self.hf_api_key:
16
+ # raise ValueError("HF_API_KEY not set in environment variables")
17
+ # self.client = InferenceClient(api_key=self.hf_api_key)
18
+ # self.student_metrics_df = student_metrics_df
19
+
20
+ # def read_excel(self, uploaded_file):
21
+ # return pd.read_excel(uploaded_file)
22
+
23
+ # def format_session_data(self, df):
24
+ # # Look for "Date of Session" or "Date" column
25
+ # date_column = next((col for col in df.columns if col in ["Date of Session", "Date"]), None)
26
+ # if date_column:
27
+ # df[date_column] = pd.to_datetime(df[date_column], errors='coerce').dt.date
28
+ # else:
29
+ # print("Warning: Neither 'Date of Session' nor 'Date' column found in the dataframe.")
30
+
31
+ # df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
32
+ # df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
33
+ # df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
34
+ # return df
35
+
36
+ # def safe_convert_to_time(self, series, format_str='%I:%M %p'):
37
+ # try:
38
+ # converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
39
+ # if format_str:
40
+ # return converted.dt.strftime(format_str)
41
+ # return converted
42
+ # except Exception as e:
43
+ # print(f"Error converting series to time: {e}")
44
+ # return series
45
+
46
+ # def safe_convert_to_datetime(self, series, format_str=None):
47
+ # try:
48
+ # converted = pd.to_datetime(series, errors='coerce')
49
+ # if format_str:
50
+ # return converted.dt.strftime(format_str)
51
+ # return converted
52
+ # except Exception as e:
53
+ # print(f"Error converting series to datetime: {e}")
54
+ # return series
55
+
56
+ # def replace_student_names_with_initials(self, df):
57
+ # updated_columns = []
58
+ # for col in df.columns:
59
+ # if col.startswith('Student Attendance'):
60
+ # match = re.match(r'Student Attendance \[(.+?)\]', col)
61
+ # if match:
62
+ # name = match.group(1)
63
+ # initials = ''.join([part[0] for part in name.split()])
64
+ # updated_columns.append(f'Student Attendance [{initials}]')
65
+ # else:
66
+ # updated_columns.append(col)
67
+ # else:
68
+ # updated_columns.append(col)
69
+ # df.columns = updated_columns
70
+ # return df
71
+
72
+ # def compute_intervention_statistics(self, df):
73
+ # total_days = len(df)
74
+ # sessions_held = df[self.INTERVENTION_COLUMN].str.strip().str.lower().eq('yes').sum()
75
+ # intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
76
+ # return pd.DataFrame({
77
+ # 'Intervention Dosage (%)': [round(intervention_frequency, 0)],
78
+ # 'Intervention Sessions Held': [sessions_held],
79
+ # 'Intervention Sessions Not Held': [total_days - sessions_held],
80
+ # 'Total Number of Days Available': [total_days]
81
+ # })
82
+
83
+ # def compute_student_metrics(self, df):
84
+ # intervention_df = df[df[self.INTERVENTION_COLUMN].str.strip().str.lower() == 'yes']
85
+ # intervention_sessions_held = len(intervention_df)
86
+ # student_columns = [col for col in df.columns if col.startswith('Student Attendance')]
87
+
88
+ # student_metrics = {}
89
+ # for col in student_columns:
90
+ # student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
91
+ # student_data = intervention_df[[col]].copy()
92
+ # student_data[col] = student_data[col].fillna('Absent')
93
+
94
+ # attendance_values = student_data[col].apply(lambda x: 1 if x in [
95
+ # self.ENGAGED_STR,
96
+ # self.PARTIALLY_ENGAGED_STR,
97
+ # self.NOT_ENGAGED_STR
98
+ # ] else 0)
99
+
100
+ # sessions_attended = attendance_values.sum()
101
+ # attendance_pct = (sessions_attended / intervention_sessions_held) * 100 if intervention_sessions_held > 0 else 0
102
+ # attendance_pct = round(attendance_pct)
103
+
104
+ # engagement_counts = {
105
+ # 'Engaged': 0,
106
+ # 'Partially Engaged': 0,
107
+ # 'Not Engaged': 0,
108
+ # 'Absent': 0
109
+ # }
110
+
111
+ # for x in student_data[col]:
112
+ # if x == self.ENGAGED_STR:
113
+ # engagement_counts['Engaged'] += 1
114
+ # elif x == self.PARTIALLY_ENGAGED_STR:
115
+ # engagement_counts['Partially Engaged'] += 1
116
+ # elif x == self.NOT_ENGAGED_STR:
117
+ # engagement_counts['Not Engaged'] += 1
118
+ # else:
119
+ # engagement_counts['Absent'] += 1 # Count as Absent if not engaged
120
+
121
+ # # Calculate percentages for engagement states
122
+ # total_sessions = sum(engagement_counts.values())
123
+
124
+ # # Engagement (%)
125
+ # engagement_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
126
+ # engagement_pct = round(engagement_pct)
127
+
128
+ # engaged_pct = (engagement_counts['Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
129
+ # engaged_pct = round(engaged_pct)
130
+
131
+ # partially_engaged_pct = (engagement_counts['Partially Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
132
+ # partially_engaged_pct = round(partially_engaged_pct)
133
+
134
+ # not_engaged_pct = (engagement_counts['Not Engaged'] / total_sessions * 100) if total_sessions > 0 else 0
135
+ # not_engaged_pct = round(not_engaged_pct)
136
+
137
+ # absent_pct = (engagement_counts['Absent'] / total_sessions * 100) if total_sessions > 0 else 0
138
+ # absent_pct = round(absent_pct)
139
+
140
+ # # Determine if the student attended ≥ 90% of sessions
141
+ # attended_90 = "Yes" if attendance_pct >= 90 else "No"
142
+
143
+ # # Determine if the student was engaged ≥ 80% of the time
144
+ # engaged_80 = "Yes" if engaged_pct >= 80 else "No"
145
+
146
+ # # Store metrics in the required order
147
+ # student_metrics[student_name] = {
148
+ # 'Attended ≥ 90%': attended_90,
149
+ # 'Engagement ≥ 80%': engaged_80,
150
+ # 'Attendance (%)': attendance_pct,
151
+ # # 'Attendance #': sessions_attended,
152
+ # 'Engagement (%)': engagement_pct,
153
+ # 'Engaged (%)': engaged_pct,
154
+ # 'Partially Engaged (%)': partially_engaged_pct,
155
+ # 'Not Engaged (%)': not_engaged_pct,
156
+ # 'Absent (%)': absent_pct
157
+ # }
158
+
159
+ # # Create a DataFrame from student_metrics
160
+ # student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
161
+ # student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
162
+ # return student_metrics_df
163
+
164
+ # def compute_average_metrics(self, student_metrics_df):
165
+ # # Calculate the attendance and engagement average percentages across students
166
+ # attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Calculate the average attendance percentage
167
+ # engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Calculate the average engagement percentage
168
+
169
+ # # Round the averages to make them whole numbers
170
+ # attendance_avg_stats = round(attendance_avg_stats)
171
+ # engagement_avg_stats = round(engagement_avg_stats)
172
+
173
+ # return attendance_avg_stats, engagement_avg_stats
174
+
175
+ # def evaluate_student(self, row, attendance_threshold=90, engagement_threshold=80):
176
+ # if row["Attended ≥ 90%"] == "No":
177
+ # return "Address Attendance"
178
+ # elif row["Engagement ≥ 80%"] == "No":
179
+ # return "Address Engagement"
180
+ # return "Consider barriers, fidelity, and progress monitoring"
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+ import re
189
  import pandas as pd
190
  import os
 
191
  from huggingface_hub import InferenceClient
 
192
 
193
  class DataProcessor:
194
  INTERVENTION_COLUMN = 'Did the intervention happen today?'
195
+ ENGAGED_STR = 'Engaged'
196
+ PARTIALLY_ENGAGED_STR = 'Partially Engaged'
197
+ NOT_ENGAGED_STR = 'Not Engaged'
198
 
199
  def __init__(self, student_metrics_df=None):
200
  self.hf_api_key = os.getenv('HF_API_KEY')
 
203
  self.client = InferenceClient(api_key=self.hf_api_key)
204
  self.student_metrics_df = student_metrics_df
205
 
206
+
207
  def read_excel(self, uploaded_file):
208
  return pd.read_excel(uploaded_file)
209
 
 
219
  df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
220
  df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
221
  return df
 
 
 
 
 
 
 
222
 
223
  def safe_convert_to_time(self, series, format_str='%I:%M %p'):
224
  try:
 
267
  'Total Number of Days Available': [total_days]
268
  })
269
 
270
def classify_engagement(self, engagement_str):
    """Map a raw engagement cell value onto one of the canonical labels.

    The comparison is case-insensitive and prefix-based, so a long form
    such as ``'Engaged (Respect, Responsibility, Effort)'`` maps to
    ``self.ENGAGED_STR``.

    Args:
        engagement_str: Raw cell value. Normally a string, but any object
            is accepted.

    Returns:
        ``self.ENGAGED_STR``, ``self.PARTIALLY_ENGAGED_STR`` or
        ``self.NOT_ENGAGED_STR`` when the value starts with that label,
        otherwise ``'Unknown'``.
    """
    # Spreadsheet cells may hold numbers, timestamps or None; those have no
    # .lower() and cannot carry an engagement label, so classify them as
    # unknown instead of raising AttributeError.
    if not isinstance(engagement_str, str):
        return 'Unknown'

    # Strip as well as lower-case so stray leading whitespace in a cell
    # does not defeat the prefix match.
    normalized = engagement_str.strip().lower()

    # Same precedence as the original if/elif chain.
    for label in (
        self.ENGAGED_STR,
        self.PARTIALLY_ENGAGED_STR,
        self.NOT_ENGAGED_STR,
    ):
        if normalized.startswith(label.lower()):
            return label
    return 'Unknown'
280
+
281
  def compute_student_metrics(self, df):
282
  intervention_df = df[df[self.INTERVENTION_COLUMN].str.strip().str.lower() == 'yes']
283
  intervention_sessions_held = len(intervention_df)
 
289
  student_data = intervention_df[[col]].copy()
290
  student_data[col] = student_data[col].fillna('Absent')
291
 
292
+ attendance_values = student_data[col].apply(lambda x: 1 if self.classify_engagement(x) in [
293
  self.ENGAGED_STR,
294
  self.PARTIALLY_ENGAGED_STR,
295
  self.NOT_ENGAGED_STR
 
300
  attendance_pct = round(attendance_pct)
301
 
302
  engagement_counts = {
303
+ self.ENGAGED_STR: 0,
304
+ self.PARTIALLY_ENGAGED_STR: 0,
305
+ self.NOT_ENGAGED_STR: 0,
306
  'Absent': 0
307
  }
308
 
309
  for x in student_data[col]:
310
+ classified_engagement = self.classify_engagement(x)
311
+ if classified_engagement in engagement_counts:
312
+ engagement_counts[classified_engagement] += 1
 
 
 
313
  else:
314
  engagement_counts['Absent'] += 1 # Count as Absent if not engaged
315
 
 
317
  total_sessions = sum(engagement_counts.values())
318
 
319
  # Engagement (%)
320
+ engagement_pct = (engagement_counts[self.ENGAGED_STR] / total_sessions * 100) if total_sessions > 0 else 0
321
  engagement_pct = round(engagement_pct)
322
 
323
+ engaged_pct = (engagement_counts[self.ENGAGED_STR] / total_sessions * 100) if total_sessions > 0 else 0
324
  engaged_pct = round(engaged_pct)
325
 
326
+ partially_engaged_pct = (engagement_counts[self.PARTIALLY_ENGAGED_STR] / total_sessions * 100) if total_sessions > 0 else 0
327
  partially_engaged_pct = round(partially_engaged_pct)
328
 
329
+ not_engaged_pct = (engagement_counts[self.NOT_ENGAGED_STR] / total_sessions * 100) if total_sessions > 0 else 0
330
  not_engaged_pct = round(not_engaged_pct)
331
 
332
  absent_pct = (engagement_counts['Absent'] / total_sessions * 100) if total_sessions > 0 else 0
 
343
  'Attended ≥ 90%': attended_90,
344
  'Engagement ≥ 80%': engaged_80,
345
  'Attendance (%)': attendance_pct,
 
346
  'Engagement (%)': engagement_pct,
347
+ f'{self.ENGAGED_STR} (%)': engaged_pct,
348
+ f'{self.PARTIALLY_ENGAGED_STR} (%)': partially_engaged_pct,
349
+ f'{self.NOT_ENGAGED_STR} (%)': not_engaged_pct,
350
  'Absent (%)': absent_pct
351
  }
352
 
 
354
  student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
355
  student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
356
  return student_metrics_df
357
+
358
  def compute_average_metrics(self, student_metrics_df):
359
  # Calculate the attendance and engagement average percentages across students
360
  attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Calculate the average attendance percentage