ProfessorLeVesseur committed on
Commit
ff67651
·
verified ·
1 Parent(s): 1ee1fe7

Update data_processor.py

Browse files
Files changed (1) hide show
  1. data_processor.py +20 -234
data_processor.py CHANGED
@@ -1,235 +1,3 @@
1
- # import re
2
- # import pandas as pd
3
- # import os
4
- # from huggingface_hub import InferenceClient
5
-
6
- # class DataProcessor:
7
- # INTERVENTION_COLUMN_OPTIONS = [
8
- # 'Did the intervention happen today?',
9
- # 'Did the intervention take place today?'
10
- # ]
11
- # YES_RESPONSES = ['yes', 'assessment day'] # Added this line
12
- # ENGAGED_STR = 'Engaged'
13
- # PARTIALLY_ENGAGED_STR = 'Partially Engaged'
14
- # NOT_ENGAGED_STR = 'Not Engaged'
15
-
16
- # def __init__(self, student_metrics_df=None):
17
- # self.hf_api_key = os.getenv('HF_API_KEY')
18
- # if not self.hf_api_key:
19
- # raise ValueError("HF_API_KEY not set in environment variables")
20
- # self.client = InferenceClient(api_key=self.hf_api_key)
21
- # self.student_metrics_df = student_metrics_df
22
- # self.intervention_column = None # Will be set when processing data
23
-
24
- # def read_excel(self, uploaded_file):
25
- # return pd.read_excel(uploaded_file)
26
-
27
- # def format_session_data(self, df):
28
- # date_column = next((col for col in df.columns if col in ["Date of Session", "Date"]), None)
29
- # if date_column:
30
- # df[date_column] = pd.to_datetime(df[date_column], errors='coerce').dt.date
31
- # else:
32
- # print("Warning: Neither 'Date of Session' nor 'Date' column found in the dataframe.")
33
-
34
- # df['Timestamp'] = self.safe_convert_to_datetime(df['Timestamp'], '%I:%M %p')
35
- # df['Session Start Time'] = self.safe_convert_to_time(df['Session Start Time'], '%I:%M %p')
36
- # df['Session End Time'] = self.safe_convert_to_time(df['Session End Time'], '%I:%M %p')
37
- # return df
38
-
39
- # def safe_convert_to_time(self, series, format_str='%I:%M %p'):
40
- # try:
41
- # converted = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
42
- # if format_str:
43
- # return converted.dt.strftime(format_str)
44
- # return converted
45
- # except Exception as e:
46
- # print(f"Error converting series to time: {e}")
47
- # return series
48
-
49
- # def safe_convert_to_datetime(self, series, format_str=None):
50
- # try:
51
- # converted = pd.to_datetime(series, errors='coerce')
52
- # if format_str:
53
- # return converted.dt.strftime(format_str)
54
- # return converted
55
- # except Exception as e:
56
- # print(f"Error converting series to datetime: {e}")
57
- # return series
58
-
59
- # def replace_student_names_with_initials(self, df):
60
- # updated_columns = []
61
- # for col in df.columns:
62
- # if 'Student Attendance' in col:
63
- # # Search for the last occurrence of text within square brackets at the end of the string
64
- # match = re.search(r'\[(.+?)\]$', col)
65
- # if not match:
66
- # # Handle cases where the closing bracket might be missing
67
- # match = re.search(r'\[(.+)$', col)
68
- # if match:
69
- # name = match.group(1).strip()
70
- # # Remove any trailing closing bracket if it wasn't matched earlier
71
- # name = name.rstrip(']')
72
- # # Get initials
73
- # initials = ''.join([part[0] for part in name.strip().split()])
74
- # updated_col = f'Student Attendance [{initials}]'
75
- # updated_columns.append(updated_col)
76
- # else:
77
- # # If no match is found, keep the column name as is
78
- # updated_columns.append(col)
79
- # else:
80
- # updated_columns.append(col)
81
- # df.columns = updated_columns
82
- # return df
83
-
84
-
85
- # def find_intervention_column(self, df):
86
- # for column in self.INTERVENTION_COLUMN_OPTIONS:
87
- # if column in df.columns:
88
- # self.intervention_column = column
89
- # return column
90
- # raise ValueError("No intervention column found in the dataframe.")
91
-
92
- # def get_intervention_column(self, df):
93
- # if self.intervention_column is None:
94
- # self.intervention_column = self.find_intervention_column(df)
95
- # return self.intervention_column
96
-
97
- # def compute_intervention_statistics(self, df):
98
- # intervention_column = self.get_intervention_column(df)
99
- # total_days = len(df)
100
- # sessions_held = df[intervention_column].str.strip().str.lower().isin(self.YES_RESPONSES).sum() # Modified line
101
- # intervention_frequency = (sessions_held / total_days) * 100 if total_days > 0 else 0
102
- # return pd.DataFrame({
103
- # 'Intervention Dosage (%)': [round(intervention_frequency, 0)],
104
- # 'Intervention Sessions Held': [sessions_held],
105
- # 'Intervention Sessions Not Held': [total_days - sessions_held],
106
- # 'Total Number of Days Available': [total_days]
107
- # })
108
-
109
- # def classify_engagement(self, engagement_str):
110
- # engagement_str = str(engagement_str).lower()
111
- # if engagement_str.startswith(self.ENGAGED_STR.lower()):
112
- # return self.ENGAGED_STR
113
- # elif engagement_str.startswith(self.PARTIALLY_ENGAGED_STR.lower()):
114
- # return self.PARTIALLY_ENGAGED_STR
115
- # elif engagement_str.startswith(self.NOT_ENGAGED_STR.lower()):
116
- # return self.NOT_ENGAGED_STR
117
- # else:
118
- # return 'Unknown'
119
-
120
- # def compute_student_metrics(self, df):
121
- # intervention_column = self.get_intervention_column(df)
122
- # intervention_df = df[df[intervention_column].str.strip().str.lower().isin(self.YES_RESPONSES)]
123
- # intervention_sessions_held = len(intervention_df)
124
- # student_columns = [col for col in df.columns if col.startswith('Student Attendance')]
125
-
126
- # student_metrics = {}
127
- # for col in student_columns:
128
- # student_name = col.replace('Student Attendance [', '').replace(']', '').strip()
129
- # student_data = intervention_df[[col]].copy()
130
- # student_data[col] = student_data[col].fillna('Absent')
131
-
132
- # # Classify each entry
133
- # student_data['Engagement'] = student_data[col].apply(self.classify_engagement)
134
-
135
- # # Calculate attendance
136
- # attendance_values = student_data['Engagement'].apply(
137
- # lambda x: 1 if x in [self.ENGAGED_STR, self.PARTIALLY_ENGAGED_STR, self.NOT_ENGAGED_STR] else 0
138
- # )
139
-
140
- # sessions_attended = attendance_values.sum()
141
- # attendance_pct = (sessions_attended / intervention_sessions_held * 100) if intervention_sessions_held > 0 else 0
142
- # attendance_pct = round(attendance_pct)
143
-
144
- # # Engagement counts (excluding 'Absent')
145
- # engagement_counts = {
146
- # self.ENGAGED_STR: 0,
147
- # self.PARTIALLY_ENGAGED_STR: 0,
148
- # self.NOT_ENGAGED_STR: 0
149
- # }
150
-
151
- # # Count the engagement types, excluding 'Absent'
152
- # for x in student_data['Engagement']:
153
- # if x in engagement_counts:
154
- # engagement_counts[x] += 1
155
- # # 'Absent' is not counted in engagement_counts
156
-
157
- # total_present_sessions = sum(engagement_counts.values())
158
-
159
- # engaged_pct = (
160
- # (engagement_counts[self.ENGAGED_STR] / total_present_sessions * 100)
161
- # if total_present_sessions > 0 else 0
162
- # )
163
- # engaged_pct = round(engaged_pct)
164
-
165
- # partially_engaged_pct = (
166
- # (engagement_counts[self.PARTIALLY_ENGAGED_STR] / total_present_sessions * 100)
167
- # if total_present_sessions > 0 else 0
168
- # )
169
- # partially_engaged_pct = round(partially_engaged_pct)
170
-
171
- # not_engaged_pct = (
172
- # (engagement_counts[self.NOT_ENGAGED_STR] / total_present_sessions * 100)
173
- # if total_present_sessions > 0 else 0
174
- # )
175
- # not_engaged_pct = round(not_engaged_pct)
176
-
177
- # # Engagement percentage is based on Engaged and Partially Engaged sessions
178
- # engagement_pct = (
179
- # ((engagement_counts[self.ENGAGED_STR] + engagement_counts[self.PARTIALLY_ENGAGED_STR]) / total_present_sessions * 100)
180
- # if total_present_sessions > 0 else 0
181
- # )
182
- # engagement_pct = round(engagement_pct)
183
-
184
- # # Absent percentage (for reference, not used in engagement calculation)
185
- # absent_sessions = student_data['Engagement'].value_counts().get('Absent', 0)
186
- # absent_pct = (absent_sessions / intervention_sessions_held * 100) if intervention_sessions_held > 0 else 0
187
- # absent_pct = round(absent_pct)
188
-
189
- # # Determine if the student attended ≥ 90% of sessions
190
- # attended_90 = "Yes" if attendance_pct >= 90 else "No"
191
-
192
- # # Determine if the student was engaged ≥ 80% of the time
193
- # engaged_80 = "Yes" if engagement_pct >= 80 else "No"
194
-
195
- # # Store metrics
196
- # student_metrics[student_name] = {
197
- # 'Attended ≥ 90%': attended_90,
198
- # 'Engagement ≥ 80%': engaged_80,
199
- # 'Attendance (%)': attendance_pct,
200
- # 'Engagement (%)': engagement_pct,
201
- # f'{self.ENGAGED_STR} (%)': engaged_pct,
202
- # f'{self.PARTIALLY_ENGAGED_STR} (%)': partially_engaged_pct,
203
- # f'{self.NOT_ENGAGED_STR} (%)': not_engaged_pct,
204
- # 'Absent (%)': absent_pct
205
- # }
206
-
207
- # # Create a DataFrame from student_metrics
208
- # student_metrics_df = pd.DataFrame.from_dict(student_metrics, orient='index').reset_index()
209
- # student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
210
- # return student_metrics_df
211
-
212
- # def compute_average_metrics(self, student_metrics_df):
213
- # # Calculate the attendance and engagement average percentages across students
214
- # attendance_avg_stats = student_metrics_df['Attendance (%)'].mean() # Average attendance percentage
215
- # engagement_avg_stats = student_metrics_df['Engagement (%)'].mean() # Average engagement percentage
216
-
217
- # # Round the averages to whole numbers
218
- # attendance_avg_stats = round(attendance_avg_stats)
219
- # engagement_avg_stats = round(engagement_avg_stats)
220
-
221
- # return attendance_avg_stats, engagement_avg_stats
222
-
223
- # def evaluate_student(self, row, attendance_threshold=90, engagement_threshold=80):
224
- # if row["Attended ≥ 90%"] == "No":
225
- # return "Address Attendance"
226
- # elif row["Engagement ≥ 80%"] == "No":
227
- # return "Address Engagement"
228
- # else:
229
- # return "Consider barriers, fidelity, and progress monitoring"
230
-
231
-
232
-
233
  import re
234
  import pandas as pd
235
  import os
@@ -457,14 +225,32 @@ class DataProcessor:
457
  student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
458
  return student_metrics_df
459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  def compute_average_metrics(self, student_metrics_df):
461
  # Filter out rows with NaN values (inactive students)
462
  active_students_df = student_metrics_df.dropna()
463
 
464
- # Calculate the attendance and engagement average percentages across active students
465
  attendance_avg_stats = active_students_df['Attendance (%)'].mean()
466
- engagement_avg_stats = active_students_df[f'{self.ENGAGED_STR} (%)'].mean()
467
 
 
 
 
 
 
468
  # Round the averages to whole numbers
469
  attendance_avg_stats = round(attendance_avg_stats)
470
  engagement_avg_stats = round(engagement_avg_stats)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  import pandas as pd
3
  import os
 
225
  student_metrics_df.rename(columns={'index': 'Student'}, inplace=True)
226
  return student_metrics_df
227
 
228
+ # def compute_average_metrics(self, student_metrics_df):
229
+ # # Filter out rows with NaN values (inactive students)
230
+ # active_students_df = student_metrics_df.dropna()
231
+
232
+ # # Calculate the attendance and engagement average percentages across active students
233
+ # attendance_avg_stats = active_students_df['Attendance (%)'].mean()
234
+ # engagement_avg_stats = active_students_df[f'{self.ENGAGED_STR} (%)'].mean()
235
+
236
+ # # Round the averages to whole numbers
237
+ # attendance_avg_stats = round(attendance_avg_stats)
238
+ # engagement_avg_stats = round(engagement_avg_stats)
239
+
240
+ # return attendance_avg_stats, engagement_avg_stats
241
+
242
  def compute_average_metrics(self, student_metrics_df):
243
  # Filter out rows with NaN values (inactive students)
244
  active_students_df = student_metrics_df.dropna()
245
 
246
+ # Calculate the attendance average percentage across active students
247
  attendance_avg_stats = active_students_df['Attendance (%)'].mean()
 
248
 
249
+ # Calculate the engagement average percentage across active students
250
+ # Only consider 'Engaged' and 'Partially Engaged' percentages, exclude 'Absent'
251
+ total_engagement = active_students_df[f'{self.ENGAGED_STR} (%)'] + active_students_df[f'{self.PARTIALLY_ENGAGED_STR} (%)']
252
+ engagement_avg_stats = total_engagement.mean()
253
+
254
  # Round the averages to whole numbers
255
  attendance_avg_stats = round(attendance_avg_stats)
256
  engagement_avg_stats = round(engagement_avg_stats)