davanstrien HF Staff committed on
Commit
756e837
·
1 Parent(s): ff54ca8

refactor: optimize database setup logging and limit sample sizes for clarity

Browse files
Files changed (1) hide show
  1. main.py +45 -38
main.py CHANGED
@@ -148,12 +148,10 @@ def setup_database():
148
  sample_df = (
149
  df.select(["datasetId", "last_modified"])
150
  .sort("last_modified", descending=True)
151
- .limit(10)
152
  .collect()
153
  )
154
- logger.info("Sample of most recent incoming records:")
155
- for row in sample_df.iter_rows():
156
- logger.info(f" {row[0]}: {row[1]}")
157
 
158
  if latest_update:
159
  logger.info(f"Filtering records newer than {latest_update}")
@@ -183,13 +181,11 @@ def setup_database():
183
  df.select(["datasetId", "last_modified"])
184
  .filter(pl.col("last_modified") <= latest_update)
185
  .sort("last_modified", descending=True)
186
- .limit(5)
187
  .collect()
188
  )
189
  if len(just_before) > 0:
190
- logger.info("Records just before cutoff:")
191
- for row in just_before.iter_rows():
192
- logger.info(f" {row[0]}: {row[1]}")
193
 
194
  df = df.collect()
195
  total_rows = len(df)
@@ -210,20 +206,18 @@ def setup_database():
210
 
211
  ids_to_upsert = batch_df.select(["datasetId"]).to_series().to_list()
212
 
213
- # Log first few IDs being upserted
214
- logger.info(f"Upserting IDs (first 5): {ids_to_upsert[:5]}")
 
215
 
216
- # Check if any of these already exist
217
- existing_check = dataset_collection.get(
218
- ids=ids_to_upsert[:5], include=["metadatas"]
219
- )
220
- if existing_check["ids"]:
221
- logger.info(
222
- f"Found {len(existing_check['ids'])} existing records in this batch sample"
223
  )
224
- for idx, id_ in enumerate(existing_check["ids"]):
225
  logger.info(
226
- f" Existing: {id_} - last_modified: {existing_check['metadatas'][idx].get('last_modified')}"
227
  )
228
 
229
  dataset_collection.upsert(
@@ -293,27 +287,39 @@ def setup_database():
293
  model_latest_update = max(model_last_modifieds)
294
  logger.info(f"Most recent model record in DB from: {model_latest_update}")
295
 
296
- # Always process models to handle updates (not just new additions)
297
- should_update_models = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  if model_latest_update:
299
- schema = model_lazy_df.collect_schema()
300
- select_columns = [
301
- "modelId",
302
- "summary",
303
- "likes",
304
- "downloads",
305
- "last_modified",
306
- ]
307
- if "param_count" in schema:
308
- logger.info("Found 'param_count' column in model data schema.")
309
- select_columns.append("param_count")
310
- else:
311
- logger.warning(
312
- "'param_count' column not found in model data schema. Will add it with null values."
313
- )
314
 
315
- # Select specified columns and then collect
316
- model_df = model_lazy_df.select(select_columns).collect()
317
 
318
  # If param_count was not in the original schema, add it now to the collected DataFrame
319
  if "param_count" not in model_df.columns:
@@ -322,6 +328,7 @@ def setup_database():
322
  )
323
 
324
  total_rows = len(model_df)
 
325
 
326
  for i in range(0, total_rows, BATCH_SIZE):
327
  batch_df = model_df.slice(i, min(BATCH_SIZE, total_rows - i))
 
148
  sample_df = (
149
  df.select(["datasetId", "last_modified"])
150
  .sort("last_modified", descending=True)
151
+ .limit(5)
152
  .collect()
153
  )
154
+ logger.info(f"Sample of most recent incoming records: {sample_df.rows()[:3]}")
 
 
155
 
156
  if latest_update:
157
  logger.info(f"Filtering records newer than {latest_update}")
 
181
  df.select(["datasetId", "last_modified"])
182
  .filter(pl.col("last_modified") <= latest_update)
183
  .sort("last_modified", descending=True)
184
+ .limit(3)
185
  .collect()
186
  )
187
  if len(just_before) > 0:
188
+ logger.info(f"Records just before cutoff: {just_before.rows()}")
 
 
189
 
190
  df = df.collect()
191
  total_rows = len(df)
 
206
 
207
  ids_to_upsert = batch_df.select(["datasetId"]).to_series().to_list()
208
 
209
+ # Log progress for every batch
210
+ if i == 0 or (i // BATCH_SIZE + 1) % 5 == 0: # Log every 5th batch
211
+ logger.info(f"Upserting batch {i // BATCH_SIZE + 1} (sample IDs: {ids_to_upsert[:3]})")
212
 
213
+ # Check if any of these already exist (sample only)
214
+ if i == 0: # Only log for first batch to reduce noise
215
+ existing_check = dataset_collection.get(
216
+ ids=ids_to_upsert[:3], include=["metadatas"]
 
 
 
217
  )
218
+ if existing_check["ids"]:
219
  logger.info(
220
+ f"Sample: {len(existing_check['ids'])} existing records being updated"
221
  )
222
 
223
  dataset_collection.upsert(
 
287
  model_latest_update = max(model_last_modifieds)
288
  logger.info(f"Most recent model record in DB from: {model_latest_update}")
289
 
290
+ # Set up model schema columns
291
+ schema = model_lazy_df.collect_schema()
292
+ select_columns = [
293
+ "modelId",
294
+ "summary",
295
+ "likes",
296
+ "downloads",
297
+ "last_modified",
298
+ ]
299
+ if "param_count" in schema:
300
+ logger.info("Found 'param_count' column in model data schema.")
301
+ select_columns.append("param_count")
302
+ else:
303
+ logger.warning(
304
+ "'param_count' column not found in model data schema. Will add it with null values."
305
+ )
306
+
307
+ # Filter and process only newer model records
308
+ model_df = model_lazy_df.select(select_columns)
309
+
310
+ # Apply timestamp filtering like we do for datasets
311
  if model_latest_update:
312
+ logger.info(f"Filtering model records newer than {model_latest_update}")
313
+ model_df = model_df.with_columns(pl.col("last_modified").str.to_datetime())
314
+ model_df = model_df.filter(pl.col("last_modified") > model_latest_update)
315
+ model_filtered_count = model_df.select(pl.len()).collect().item()
316
+ logger.info(f"Found {model_filtered_count} model records to update after filtering")
317
+ else:
318
+ model_filtered_count = model_df.select(pl.len()).collect().item()
319
+ logger.info(f"Initial model load: processing all {model_filtered_count} model records")
 
 
 
 
 
 
 
320
 
321
+ if model_filtered_count > 0:
322
+ model_df = model_df.collect()
323
 
324
  # If param_count was not in the original schema, add it now to the collected DataFrame
325
  if "param_count" not in model_df.columns:
 
328
  )
329
 
330
  total_rows = len(model_df)
331
+ logger.info(f"Updating model collection with {total_rows} new records")
332
 
333
  for i in range(0, total_rows, BATCH_SIZE):
334
  batch_df = model_df.slice(i, min(BATCH_SIZE, total_rows - i))