added features - to see totals

This commit adds an "All questions" option to the question dropdown and a new calculate_cumulative_statistics_for_all_questions helper in the data-access layer (plus a test for it), so cumulative totals across all questions can be displayed for a selected source finder, run, and baseline ranker.

Changed files:
- app.py +19 -9
- data_access.py +76 -0
- tests/test_db_layer.py +49 -1
app.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 import logging
 
 from data_access import get_questions, get_source_finders, get_run_ids, get_baseline_rankers, \
-    get_unified_sources, get_source_text
+    get_unified_sources, get_source_text, calculate_cumulative_statistics_for_all_questions
 
 logger = logging.getLogger(__name__)
 
@@ -63,6 +63,23 @@ async def update_sources_list_async(question_option, source_finder_name, run_id:
     if not question_option:
         return gr.skip(), gr.skip(), gr.skip(), "No question selected"
     logger.info("processing update")
+    if type(baseline_ranker_name) == list:
+        baseline_ranker_name = baseline_ranker_name[0]
+
+    baseline_ranker_id_int = 1 if len(baseline_ranker_name) == 0 else baseline_rankers_dict.get(baseline_ranker_name)
+
+    if len(source_finder_name):
+        finder_id_int = source_finders_dict.get(source_finder_name)
+    else:
+        finder_id_int = None
+
+    if question_option == "All questions":
+        if finder_id_int and type(run_id) == str:
+            all_stats = await calculate_cumulative_statistics_for_all_questions(finder_id_int, int(run_id), baseline_ranker_id_int)
+        else:
+            all_stats = None
+        return None, all_stats, gr.skip(), "Select Run Id and source finder to see results"
+
     # Extract question ID from selection
     question_id = questions_dict.get(question_option)
 
@@ -72,15 +89,8 @@ async def update_sources_list_async(question_option, source_finder_name, run_id:
     run_id = run_id_options[0]
 
     run_id_int = int(run_id)
-    if len(source_finder_name):
-        finder_id_int = source_finders_dict.get(source_finder_name)
-    else:
-        finder_id_int = None
 
-    if type(baseline_ranker_name) == list:
-        baseline_ranker_name = baseline_ranker_name[0]
 
-    baseline_ranker_id_int = 1 if len(baseline_ranker_name) == 0 else baseline_rankers_dict.get(baseline_ranker_name)
 
     source_runs = None
     stats = None
@@ -138,7 +148,7 @@ async def main():
         with gr.Column(scale=1):
             # Main content area
             question_dropdown = gr.Dropdown(
-                choices=question_options,
+                choices=["All questions"] + question_options,
                 label="Select Question",
                 value=None,
                 interactive=True,
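Note on the handler changes: the finder/ranker lookups are moved above the new early-return branch, and the type checks (type(baseline_ranker_name) == list, type(run_id) == str) are presumably there because Gradio dropdown values can arrive either as a scalar or as a list depending on how the component is configured. A minimal isinstance-based sketch of the same normalization, purely for illustration and not part of this commit:

def normalize_choice(value, default=None):
    # Gradio dropdown values may arrive as a list; keep only the first entry.
    if isinstance(value, list):
        return value[0] if value else default
    return value

# e.g. baseline_ranker_name = normalize_choice(baseline_ranker_name, default="")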
data_access.py
CHANGED
@@ -110,6 +110,82 @@ async def calculate_baseline_vs_source_stats_for_question(baseline_sources , sou
     return results_df
 
 
+async def calculate_cumulative_statistics_for_all_questions(source_finder_id: int, run_id: int, ranker_id: int):
+    """
+    Calculate cumulative statistics across all questions for a specific source finder, run, and ranker.
+
+    Args:
+        source_finder_id (int): ID of the source finder
+        run_id (int): Run ID to analyze
+        ranker_id (int): ID of the baseline ranker
+
+    Returns:
+        pd.DataFrame: DataFrame containing aggregated statistics
+    """
+    async with get_async_connection() as conn:
+        # Get all questions
+        query = "SELECT id FROM questions ORDER BY id"
+        questions = await conn.fetch(query)
+        question_ids = [q["id"] for q in questions]
+
+        # Initialize aggregates
+        total_baseline_sources = 0
+        total_found_sources = 0
+        total_overlap = 0
+        total_high_ranked_baseline = 0
+        total_high_ranked_found = 0
+        total_high_ranked_overlap = 0
+
+        # Process each question
+        valid_questions = 0
+        for question_id in question_ids:
+            try:
+                # Get unified sources for this question
+                sources, stats = await get_unified_sources(question_id, source_finder_id, run_id, ranker_id)
+
+                if sources and len(sources) > 0:
+                    valid_questions += 1
+                    stats_dict = stats.iloc[0].to_dict()
+
+                    # Add to running totals
+                    total_baseline_sources += stats_dict.get('total_baseline_sources', 0)
+                    total_found_sources += stats_dict.get('total_found_sources', 0)
+                    total_overlap += stats_dict.get('overlap_count', 0)
+                    total_high_ranked_baseline += stats_dict.get('num_high_ranked_baseline_sources', 0)
+                    total_high_ranked_found += stats_dict.get('num_high_ranked_found_sources', 0)
+                    total_high_ranked_overlap += stats_dict.get('high_ranked_overlap_count', 0)
+            except Exception as e:
+                # Skip questions with errors
+                continue
+
+        # Calculate overall percentages
+        overlap_percentage = round(total_overlap * 100 / max(total_baseline_sources, total_found_sources), 2) \
+            if max(total_baseline_sources, total_found_sources) > 0 else 0
+
+        high_ranked_overlap_percentage = round(
+            total_high_ranked_overlap * 100 / max(total_high_ranked_baseline, total_high_ranked_found), 2) \
+            if max(total_high_ranked_baseline, total_high_ranked_found) > 0 else 0
+
+        # Compile results
+        cumulative_stats = {
+            "total_questions_analyzed": valid_questions,
+            "total_baseline_sources": total_baseline_sources,
+            "total_found_sources": total_found_sources,
+            "total_overlap_count": total_overlap,
+            "overall_overlap_percentage": overlap_percentage,
+            "total_high_ranked_baseline_sources": total_high_ranked_baseline,
+            "total_high_ranked_found_sources": total_high_ranked_found,
+            "total_high_ranked_overlap_count": total_high_ranked_overlap,
+            "overall_high_ranked_overlap_percentage": high_ranked_overlap_percentage,
+            "avg_baseline_sources_per_question": round(total_baseline_sources / valid_questions,
+                                                       2) if valid_questions > 0 else 0,
+            "avg_found_sources_per_question": round(total_found_sources / valid_questions,
+                                                    2) if valid_questions > 0 else 0
+        }
+
+        return pd.DataFrame([cumulative_stats])
+
+
 async def get_unified_sources(question_id: int, source_finder_id: int, run_id: int, ranker_id: int):
     """
     Create unified view of sources from both baseline_sources and source_runs
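For a quick sanity check of the new helper outside the Gradio app, it can be called directly from a small driver script. The snippet below is only a sketch: the hard-coded ids are assumptions, and it presumes the database credentials expected by get_async_connection are available in the environment.

import asyncio

from data_access import calculate_cumulative_statistics_for_all_questions


async def preview_totals():
    # Assumed example ids; in the app they come from the dropdown selections.
    stats_df = await calculate_cumulative_statistics_for_all_questions(
        source_finder_id=2, run_id=1, ranker_id=1
    )
    # One-row DataFrame with the cumulative totals shown in the stats panel.
    print(stats_df.to_string(index=False))


if __name__ == "__main__":
    asyncio.run(preview_totals())

Design note: the helper calls get_unified_sources once per question, so runtime grows linearly with the number of questions, and any question that raises is silently skipped and simply does not count toward valid_questions.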
tests/test_db_layer.py
CHANGED
@@ -1,5 +1,7 @@
+import pandas as pd
 import pytest
 
+from data_access import calculate_cumulative_statistics_for_all_questions
 from data_access import get_unified_sources
 
 
@@ -16,4 +18,50 @@ async def test_get_unified_sources():
     assert stats.shape[0] > 0, "Stats DataFrame should contain at least one row"
 
     # You can also check specific stats columns
-    assert "overlap_count" in stats.columns, "Stats should contain overlap_count"
+    assert "overlap_count" in stats.columns, "Stats should contain overlap_count"
+
+
+
+
+@pytest.mark.asyncio
+async def test_calculate_cumulative_statistics_for_all_questions():
+    # Test with known source_finder_id, run_id, and ranker_id
+    source_finder_id = 2
+    run_id = 1
+    ranker_id = 1
+
+    # Call the function to test
+    result = await calculate_cumulative_statistics_for_all_questions(source_finder_id, run_id, ranker_id)
+
+    # Check basic structure of results
+    assert isinstance(result, pd.DataFrame), "Result should be a pandas DataFrame"
+    assert result.shape[0] == 1, "Result should have one row"
+
+    # Check required columns exist
+    expected_columns = [
+        "total_questions_analyzed",
+        "total_baseline_sources",
+        "total_found_sources",
+        "total_overlap_count",
+        "overall_overlap_percentage",
+        "total_high_ranked_baseline_sources",
+        "total_high_ranked_found_sources",
+        "total_high_ranked_overlap_count",
+        "overall_high_ranked_overlap_percentage",
+        "avg_baseline_sources_per_question",
+        "avg_found_sources_per_question"
+    ]
+
+    for column in expected_columns:
+        assert column in result.columns, f"Column {column} should be in result DataFrame"
+
+    # Check some basic value validations
+    assert result["total_questions_analyzed"].iloc[0] >= 0, "Should have zero or more questions analyzed"
+    assert result["total_baseline_sources"].iloc[0] >= 0, "Should have zero or more baseline sources"
+    assert result["total_found_sources"].iloc[0] >= 0, "Should have zero or more found sources"
+
+    # Check that percentages are within valid ranges
+    assert 0 <= result["overall_overlap_percentage"].iloc[0] <= 100, "Overlap percentage should be between 0 and 100"
+    assert 0 <= result["overall_high_ranked_overlap_percentage"].iloc[
+               0] <= 100, "High ranked overlap percentage should be between 0 and 100"
+
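The new test is decorated with @pytest.mark.asyncio, so pytest-asyncio must be installed for it to run (the existing async test_get_unified_sources presumably relies on the same plugin). A small convenience runner using pytest's Python API, shown only as a sketch and not part of the commit; invoking pytest from the command line with the same test node id works just as well:

# Runs only the new cumulative-statistics test via pytest's Python API.
# Requires pytest-asyncio for the @pytest.mark.asyncio marker and a reachable database.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(pytest.main([
        "tests/test_db_layer.py::test_calculate_cumulative_statistics_for_all_questions",
        "-v",
    ]))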