add ability to see text
- app.py +38 -5
- data_access.py +26 -13
app.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 import logging
 
 from data_access import get_questions, get_source_finders, get_run_ids, get_baseline_rankers, \
-    get_unified_sources
+    get_unified_sources, get_source_text
 
 logger = logging.getLogger(__name__)
 
@@ -105,6 +105,24 @@ async def update_sources_list_async(question_option, source_finder_name, run_id:
     return df_display, stats, run_id_options, result_message,
 
 
+# Add a new function to handle row selection
+async def handle_row_selection_async(evt: gr.SelectData):
+    if evt is None or evt.value is None:
+        return "No source selected"
+
+    try:
+        # Get the ID from the selected row
+        tractate_chunk_id = evt.row_value[0]
+        # Get the source text
+        text = await get_source_text(tractate_chunk_id)
+        return text
+    except Exception as e:
+        return f"Error retrieving source text: {str(e)}"
+
+
+def handle_row_selection(evt: gr.SelectData):
+    return asyncio.run(handle_row_selection_async(evt))
+
 # Create Gradio app
 
 # Ensure we clean up when done
@@ -180,10 +198,19 @@ async def main():
         with gr.Row():
             gr.Markdown("# Sources Found")
         with gr.Row():
-
-
-
-
+            with gr.Column(scale=3):
+                results_table = gr.DataFrame(
+                    headers=['id', 'tractate', 'folio', 'in_baseline', 'baseline_rank', 'in_source_run', 'source_run_rank', 'source_reason'],
+                    interactive=False
+                )
+            with gr.Column(scale=1):
+                source_text = gr.TextArea(
+                    value="Text of the source will appear here",
+                    lines=15,
+                    label="Source Text",
+                    interactive=False,
+                    elem_id="source_text"
+                )
 
         # download_button = gr.DownloadButton(
         #     label="Download Results as CSV",
@@ -194,6 +221,12 @@ async def main():
 
 
         # Set up event handlers
+        results_table.select(
+            handle_row_selection,
+            inputs=None,
+            outputs=source_text
+        )
+
         question_dropdown.change(
             update_sources_list,
             inputs=[question_dropdown, source_finder_dropdown, run_id_dropdown, baseline_rankers_dropdown],
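
Note on the new selection wiring in app.py: the results table's select event hands the callback a gr.SelectData object, handle_row_selection_async reads the id from column 0 of evt.row_value, and the synchronous handle_row_selection wrapper runs it with asyncio.run so it can be registered as an ordinary Gradio callback. A minimal, self-contained sketch of that pattern follows (it assumes a Gradio 4.x release where SelectData exposes row_value, which the code above relies on; the component names and sample rows are illustrative, not taken from the Space):

import gradio as gr

def show_selected_id(evt: gr.SelectData):
    # evt.row_value is the full clicked row; column 0 holds the id,
    # mirroring tractate_chunk_id = evt.row_value[0] in handle_row_selection_async
    return f"Selected id: {evt.row_value[0]}"

with gr.Blocks() as demo:
    table = gr.DataFrame(
        value=[[1, "Berakhot", "2a"], [2, "Shabbat", "31a"]],
        headers=["id", "tractate", "folio"],
        interactive=False,
    )
    selection = gr.Textbox(label="Selection")
    table.select(show_selected_id, inputs=None, outputs=selection)

if __name__ == "__main__":
    demo.launch()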
data_access.py
CHANGED
@@ -80,8 +80,8 @@ async def calculate_baseline_vs_source_stats_for_question(baseline_sources , source_runs_sources):
     # for a given question_id and source_finder_id and run_id calculate the baseline vs source stats
     # e.g. overlap, high ranked overlap, etc.
     async with get_async_connection() as conn:
-        actual_sources_set = {s["
-        baseline_sources_set = {s["
+        actual_sources_set = {s["id"] for s in source_runs_sources}
+        baseline_sources_set = {s["id"] for s in baseline_sources}
 
         # Calculate overlap
         overlap = actual_sources_set.intersection(baseline_sources_set)
@@ -89,8 +89,8 @@ async def calculate_baseline_vs_source_stats_for_question(baseline_sources , source_runs_sources):
         # only_in_2 = baseline_sources_set - actual_sources_set
 
         # Calculate high-ranked overlap (rank >= 4)
-        actual_high_ranked = {s["
-        baseline_high_ranked = {s["
+        actual_high_ranked = {s["id"] for s in source_runs_sources if int(s["source_rank"]) >= 4}
+        baseline_high_ranked = {s["id"] for s in baseline_sources if int(s["baseline_rank"]) >= 4}
 
         high_ranked_overlap = actual_high_ranked.intersection(baseline_high_ranked)
 
@@ -118,16 +118,16 @@ async def get_unified_sources(question_id: int, source_finder_id: int, run_id: int, ranker_id: int):
     async with get_async_connection() as conn:
         # Get sources from source_runs
         query_runs = """
-            SELECT 
-            FROM source_runs sr
+            SELECT tb.tractate_chunk_id as id, sr.rank as source_rank, sr.tractate, sr.folio, sr.reason as source_reason
+            FROM source_runs sr join talmud_bavli tb on sr.sugya_id = tb.xml_id
             WHERE sr.question_id = $1 AND sr.source_finder_id = $2 AND sr.run_id = $3
         """
         source_runs = await conn.fetch(query_runs, question_id, source_finder_id, run_id)
 
         # Get sources from baseline_sources
         query_baseline = """
-            SELECT 
-            FROM baseline_sources bs
+            SELECT tb.tractate_chunk_id as id, bs.rank as baseline_rank, bs.tractate, bs.folio
+            FROM baseline_sources bs join talmud_bavli tb on bs.sugya_id = tb.xml_id
             WHERE bs.question_id = $1 AND bs.ranker_id = $2
         """
         baseline_sources = await conn.fetch(query_baseline, question_id, ranker_id)
@@ -135,8 +135,8 @@ async def get_unified_sources(question_id: int, source_finder_id: int, run_id: int, ranker_id: int):
         stats_df = await calculate_baseline_vs_source_stats_for_question(baseline_sources, source_runs)
 
         # Convert to dictionaries for easier lookup
-        source_runs_dict = {s["
-        baseline_dict = {s["
+        source_runs_dict = {s["id"]: dict(s) for s in source_runs}
+        baseline_dict = {s["id"]: dict(s) for s in baseline_sources}
 
         # Get all unique sugya_ids
         all_sugya_ids = set(source_runs_dict.keys()) | set(baseline_dict.keys())
@@ -151,9 +151,9 @@ async def get_unified_sources(question_id: int, source_finder_id: int, run_id: int, ranker_id: int):
             else:
                 info = source_runs_dict[sugya_id]
             result = {
-                "
-                "tractate": info.get("tractate"
-                "folio": info.get("folio"
+                "id": sugya_id,
+                "tractate": info.get("tractate"),
+                "folio": info.get("folio"),
                 "in_baseline": "Yes" if in_baseline else "No",
                 "baseline_rank": baseline_dict.get(sugya_id, {}).get("baseline_rank", "N/A"),
                 "in_source_run": "Yes" if in_source_run else "No",
@@ -166,6 +166,19 @@ async def get_unified_sources(question_id: int, source_finder_id: int, run_id: int, ranker_id: int):
         return unified_results, stats_df
 
 
+async def get_source_text(tractate_chunk_id: int):
+    """
+    Retrieves the text content for a given tractate chunk ID.
+    """
+    async with get_async_connection() as conn:
+        query = """
+            SELECT tb.text_with_nikud as text
+            FROM talmud_bavli tb
+            WHERE tb.tractate_chunk_id = $1
+        """
+        result = await conn.fetchrow(query, tractate_chunk_id)
+        return result["text"] if result else "Source text not found"
+
 def get_pg_sync_connection(schema="talmudexplore"):
     conn = psycopg2.connect(dbname=os.getenv("pg_dbname"),
                             user=os.getenv("pg_user"),
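
A quick way to exercise the new get_source_text helper outside the UI (an ad-hoc check, not part of the commit; it assumes the database environment variables read by get_async_connection are configured, and 42 is a made-up tractate_chunk_id):

import asyncio

from data_access import get_source_text

async def main():
    # fetch and print the text_with_nikud value for one tractate chunk
    text = await get_source_text(42)
    print(text)

if __name__ == "__main__":
    asyncio.run(main())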