davanstrien HF Staff committed on
Commit
756e837
·
1 Parent(s): ff54ca8

refactor: optimize database setup logging and limit sample sizes for clarity

Browse files
Files changed (1) hide show
  1. main.py +45 -38
main.py CHANGED
@@ -148,12 +148,10 @@ def setup_database():
148
  sample_df = (
149
  df.select(["datasetId", "last_modified"])
150
  .sort("last_modified", descending=True)
151
- .limit(10)
152
  .collect()
153
  )
154
- logger.info("Sample of most recent incoming records:")
155
- for row in sample_df.iter_rows():
156
- logger.info(f" {row[0]}: {row[1]}")
157
 
158
  if latest_update:
159
  logger.info(f"Filtering records newer than {latest_update}")
@@ -183,13 +181,11 @@ def setup_database():
183
  df.select(["datasetId", "last_modified"])
184
  .filter(pl.col("last_modified") <= latest_update)
185
  .sort("last_modified", descending=True)
186
- .limit(5)
187
  .collect()
188
  )
189
  if len(just_before) > 0:
190
- logger.info("Records just before cutoff:")
191
- for row in just_before.iter_rows():
192
- logger.info(f" {row[0]}: {row[1]}")
193
 
194
  df = df.collect()
195
  total_rows = len(df)
@@ -210,20 +206,18 @@ def setup_database():
210
 
211
  ids_to_upsert = batch_df.select(["datasetId"]).to_series().to_list()
212
 
213
- # Log first few IDs being upserted
214
- logger.info(f"Upserting IDs (first 5): {ids_to_upsert[:5]}")
 
215
 
216
- # Check if any of these already exist
217
- existing_check = dataset_collection.get(
218
- ids=ids_to_upsert[:5], include=["metadatas"]
219
- )
220
- if existing_check["ids"]:
221
- logger.info(
222
- f"Found {len(existing_check['ids'])} existing records in this batch sample"
223
  )
224
- for idx, id_ in enumerate(existing_check["ids"]):
225
  logger.info(
226
- f" Existing: {id_} - last_modified: {existing_check['metadatas'][idx].get('last_modified')}"
227
  )
228
 
229
  dataset_collection.upsert(
@@ -293,27 +287,39 @@ def setup_database():
293
  model_latest_update = max(model_last_modifieds)
294
  logger.info(f"Most recent model record in DB from: {model_latest_update}")
295
 
296
- # Always process models to handle updates (not just new additions)
297
- should_update_models = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  if model_latest_update:
299
- schema = model_lazy_df.collect_schema()
300
- select_columns = [
301
- "modelId",
302
- "summary",
303
- "likes",
304
- "downloads",
305
- "last_modified",
306
- ]
307
- if "param_count" in schema:
308
- logger.info("Found 'param_count' column in model data schema.")
309
- select_columns.append("param_count")
310
- else:
311
- logger.warning(
312
- "'param_count' column not found in model data schema. Will add it with null values."
313
- )
314
 
315
- # Select specified columns and then collect
316
- model_df = model_lazy_df.select(select_columns).collect()
317
 
318
  # If param_count was not in the original schema, add it now to the collected DataFrame
319
  if "param_count" not in model_df.columns:
@@ -322,6 +328,7 @@ def setup_database():
322
  )
323
 
324
  total_rows = len(model_df)
 
325
 
326
  for i in range(0, total_rows, BATCH_SIZE):
327
  batch_df = model_df.slice(i, min(BATCH_SIZE, total_rows - i))
 
148
  sample_df = (
149
  df.select(["datasetId", "last_modified"])
150
  .sort("last_modified", descending=True)
151
+ .limit(5)
152
  .collect()
153
  )
154
+ logger.info(f"Sample of most recent incoming records: {sample_df.rows()[:3]}")
 
 
155
 
156
  if latest_update:
157
  logger.info(f"Filtering records newer than {latest_update}")
 
181
  df.select(["datasetId", "last_modified"])
182
  .filter(pl.col("last_modified") <= latest_update)
183
  .sort("last_modified", descending=True)
184
+ .limit(3)
185
  .collect()
186
  )
187
  if len(just_before) > 0:
188
+ logger.info(f"Records just before cutoff: {just_before.rows()}")
 
 
189
 
190
  df = df.collect()
191
  total_rows = len(df)
 
206
 
207
  ids_to_upsert = batch_df.select(["datasetId"]).to_series().to_list()
208
 
209
+ # Log progress for every batch
210
+ if i == 0 or (i // BATCH_SIZE + 1) % 5 == 0: # Log every 5th batch
211
+ logger.info(f"Upserting batch {i // BATCH_SIZE + 1} (sample IDs: {ids_to_upsert[:3]})")
212
 
213
+ # Check if any of these already exist (sample only)
214
+ if i == 0: # Only log for first batch to reduce noise
215
+ existing_check = dataset_collection.get(
216
+ ids=ids_to_upsert[:3], include=["metadatas"]
 
 
 
217
  )
218
+ if existing_check["ids"]:
219
  logger.info(
220
+ f"Sample: {len(existing_check['ids'])} existing records being updated"
221
  )
222
 
223
  dataset_collection.upsert(
 
287
  model_latest_update = max(model_last_modifieds)
288
  logger.info(f"Most recent model record in DB from: {model_latest_update}")
289
 
290
+ # Set up model schema columns
291
+ schema = model_lazy_df.collect_schema()
292
+ select_columns = [
293
+ "modelId",
294
+ "summary",
295
+ "likes",
296
+ "downloads",
297
+ "last_modified",
298
+ ]
299
+ if "param_count" in schema:
300
+ logger.info("Found 'param_count' column in model data schema.")
301
+ select_columns.append("param_count")
302
+ else:
303
+ logger.warning(
304
+ "'param_count' column not found in model data schema. Will add it with null values."
305
+ )
306
+
307
+ # Filter and process only newer model records
308
+ model_df = model_lazy_df.select(select_columns)
309
+
310
+ # Apply timestamp filtering like we do for datasets
311
  if model_latest_update:
312
+ logger.info(f"Filtering model records newer than {model_latest_update}")
313
+ model_df = model_df.with_columns(pl.col("last_modified").str.to_datetime())
314
+ model_df = model_df.filter(pl.col("last_modified") > model_latest_update)
315
+ model_filtered_count = model_df.select(pl.len()).collect().item()
316
+ logger.info(f"Found {model_filtered_count} model records to update after filtering")
317
+ else:
318
+ model_filtered_count = model_df.select(pl.len()).collect().item()
319
+ logger.info(f"Initial model load: processing all {model_filtered_count} model records")
 
 
 
 
 
 
 
320
 
321
+ if model_filtered_count > 0:
322
+ model_df = model_df.collect()
323
 
324
  # If param_count was not in the original schema, add it now to the collected DataFrame
325
  if "param_count" not in model_df.columns:
 
328
  )
329
 
330
  total_rows = len(model_df)
331
+ logger.info(f"Updating model collection with {total_rows} new records")
332
 
333
  for i in range(0, total_rows, BATCH_SIZE):
334
  batch_df = model_df.slice(i, min(BATCH_SIZE, total_rows - i))