Spaces:

davidr70
/

eval_results

Running

App Files Files Community

eval_results / data_access.py

davidr70

changes to use new table and descriptive runs

312213e about 2 months ago

raw

history blame

12.1 kB

	import asyncio
	import os
	from contextlib import asynccontextmanager
	from typing import Optional

	import asyncpg
	import psycopg2
	from dotenv import load_dotenv
	import pandas as pd

	# Load environment variables (e.g. the pg_* database settings) from a .env file
	load_dotenv()


	@asynccontextmanager
	async def get_async_connection(schema="talmudexplore"):
	    """Open a dedicated asyncpg connection and close it when the caller is done.

	    Connection settings are read from the ``pg_dbname``, ``pg_user``,
	    ``pg_password``, ``pg_host`` and ``pg_port`` environment variables.

	    Args:
	        schema: Value placed in the session's ``search_path``.

	    Yields:
	        The open connection, with ``search_path`` already set.
	    """
	    # Connect *outside* the try block: if connecting fails there is nothing to
	    # close, and touching an unbound ``conn`` in ``finally`` would replace the
	    # real connection error with an UnboundLocalError.
	    conn = await asyncpg.connect(
	        database=os.getenv("pg_dbname"),
	        user=os.getenv("pg_user"),
	        password=os.getenv("pg_password"),
	        host=os.getenv("pg_host"),
	        port=os.getenv("pg_port")
	    )
	    try:
	        # NOTE(review): ``schema`` is interpolated into the statement text, so
	        # it must only ever come from trusted code, never from user input.
	        await conn.execute(f'SET search_path TO {schema}')
	        yield conn
	    finally:
	        await conn.close()


	async def get_questions():
	    """Return every question as an ``{"id", "text"}`` dict, ordered by id."""
	    sql = "SELECT id, question_text FROM questions ORDER BY id"
	    async with get_async_connection() as conn:
	        rows = await conn.fetch(sql)
	        listing = []
	        for row in rows:
	            listing.append({"id": row["id"], "text": row["question_text"]})
	        return listing

	async def get_metadata(question_id: int, source_finder_id_run_id: int):
	    """Fetch the metadata stored for one question within one source-finder run.

	    Returns:
	        The ``metadata`` column of the matching row, or an empty string when
	        no row matches.
	    """
	    sql = '''
	SELECT metadata
	FROM source_finder_run_question_metadata sfrqm
	WHERE sfrqm.question_id = $1 and sfrqm.source_finder_run_id = $2;
	'''
	    async with get_async_connection() as conn:
	        row = await conn.fetchrow(sql, question_id, source_finder_id_run_id)
	        if row is not None:
	            return row.get('metadata')
	        return ""


	# Get distinct source finders
	async def get_source_finders():
	    """List all source finders as ``{"id", "name"}`` dicts, ordered by id."""
	    sql = "SELECT id, source_finder_type as name FROM source_finders ORDER BY id"
	    async with get_async_connection() as conn:
	        rows = await conn.fetch(sql)
	        finders = []
	        for row in rows:
	            finders.append({"id": row["id"], "name": row["name"]})
	        return finders


	# Get distinct run IDs for a question
	async def get_run_ids(question_id: int, source_finder_id: int):
	    """Map run descriptions to run ids for one question and one source finder.

	    Returns:
	        A dict of ``description -> run_id`` built from the runs of
	        ``source_finder_id`` that have results for ``question_id``.
	    """
	    async with get_async_connection() as conn:
	        rows = await conn.fetch(
	            """
	select distinct sfr.description, srs.source_finder_run_id as run_id
	from talmudexplore.source_run_results srs
	join talmudexplore.source_finder_runs sfr on srs.source_finder_run_id = sfr.id
	join talmudexplore.source_finders sf on sfr.source_finder_id = sf.id
	where sfr.source_finder_id = $1
	and srs.question_id = $2
	""",
	            source_finder_id,
	            question_id,
	        )
	        runs_by_description = {}
	        for row in rows:
	            # Later rows overwrite earlier ones that share a description.
	            runs_by_description[row["description"]] = row["run_id"]
	        return runs_by_description


	async def get_baseline_rankers():
	    """List all baseline rankers as ``{"id", "name"}`` dicts, ordered by id."""
	    sql = "SELECT id, ranker FROM rankers ORDER BY id"
	    async with get_async_connection() as conn:
	        rows = await conn.fetch(sql)
	        rankers = []
	        for row in rows:
	            rankers.append({"id": row["id"], "name": row["ranker"]})
	        return rankers

	async def calculate_baseline_vs_source_stats_for_question(baseline_sources , source_runs_sources):
	    """Compare the baseline sources with the sources found by a run for one question.

	    This is a pure computation on the two row collections, so no database
	    connection is opened here.

	    Args:
	        baseline_sources: Rows exposing ``"id"`` and ``"baseline_rank"``.
	        source_runs_sources: Rows exposing ``"id"`` and ``"source_rank"``.

	    Returns:
	        pd.DataFrame: A single-row frame with total counts, overlap count and
	        percentage, and the same figures restricted to high-ranked sources
	        (rank >= 4).
	    """

	    def _percentage(part, first, second):
	        # Share of ``part`` relative to the larger of the two compared sets;
	        # 0 when both are empty so we never divide by zero.
	        denominator = max(len(first), len(second))
	        return round(len(part) * 100 / denominator, 2) if denominator > 0 else 0

	    actual_sources_set = {s["id"] for s in source_runs_sources}
	    baseline_sources_set = {s["id"] for s in baseline_sources}
	    overlap = actual_sources_set & baseline_sources_set

	    # High-ranked means rank >= 4; ranks are coerced with int() before comparing.
	    actual_high_ranked = {s["id"] for s in source_runs_sources if int(s["source_rank"]) >= 4}
	    baseline_high_ranked = {s["id"] for s in baseline_sources if int(s["baseline_rank"]) >= 4}
	    high_ranked_overlap = actual_high_ranked & baseline_high_ranked

	    results = {
	        # Totals count rows (duplicates included); overlaps count distinct ids.
	        "total_baseline_sources": len(baseline_sources),
	        "total_found_sources": len(source_runs_sources),
	        "overlap_count": len(overlap),
	        "overlap_percentage": _percentage(overlap, actual_sources_set, baseline_sources_set),
	        "num_high_ranked_baseline_sources": len(baseline_high_ranked),
	        "num_high_ranked_found_sources": len(actual_high_ranked),
	        "high_ranked_overlap_count": len(high_ranked_overlap),
	        "high_ranked_overlap_percentage": _percentage(
	            high_ranked_overlap, actual_high_ranked, baseline_high_ranked
	        ),
	    }
	    # Callers expect a one-row DataFrame rather than a plain dict.
	    return pd.DataFrame([results])


	async def calculate_cumulative_statistics_for_all_questions(source_finder_run_id: int, ranker_id: int):
	    """
	    Calculate cumulative statistics across all questions for a specific source finder run and ranker.

	    Questions whose statistics cannot be computed are skipped (and logged with
	    their traceback) so that one bad question does not abort the whole report.
	    Questions with no sources at all are skipped silently.

	    Args:
	        source_finder_run_id (int): ID of the source finder and run as appears in source runs
	        ranker_id (int): ID of the baseline ranker

	    Returns:
	        pd.DataFrame: Single-row DataFrame containing aggregated statistics
	    """
	    import logging  # local import keeps this block self-contained

	    logger = logging.getLogger(__name__)

	    def _percentage(part, first, second):
	        # Share of ``part`` relative to the larger total; 0 when both are zero.
	        denominator = max(first, second)
	        return round(part * 100 / denominator, 2) if denominator > 0 else 0

	    # Per-question stat columns that are summed into the running totals.
	    summed_keys = (
	        "total_baseline_sources",
	        "total_found_sources",
	        "overlap_count",
	        "num_high_ranked_baseline_sources",
	        "num_high_ranked_found_sources",
	        "high_ranked_overlap_count",
	    )
	    totals = dict.fromkeys(summed_keys, 0)
	    valid_questions = 0

	    async with get_async_connection() as conn:
	        questions = await conn.fetch("SELECT id FROM questions ORDER BY id")

	        for question in questions:
	            question_id = question["id"]
	            try:
	                stats, sources = await get_stats(conn, question_id, ranker_id, source_finder_run_id)
	                if not sources:
	                    continue
	                stats_dict = stats.iloc[0].to_dict()
	                # Read every value first so a failure cannot leave the
	                # totals half-updated for this question.
	                increments = {key: stats_dict.get(key, 0) for key in summed_keys}
	            except Exception:
	                # Best effort: keep going, but leave a trace of what was skipped.
	                logger.warning(
	                    "Skipping question %s while aggregating statistics",
	                    question_id,
	                    exc_info=True,
	                )
	                continue

	            valid_questions += 1
	            for key, value in increments.items():
	                totals[key] += value

	    total_baseline_sources = totals["total_baseline_sources"]
	    total_found_sources = totals["total_found_sources"]
	    total_overlap = totals["overlap_count"]
	    total_high_ranked_baseline = totals["num_high_ranked_baseline_sources"]
	    total_high_ranked_found = totals["num_high_ranked_found_sources"]
	    total_high_ranked_overlap = totals["high_ranked_overlap_count"]

	    cumulative_stats = {
	        "total_questions_analyzed": valid_questions,
	        "total_baseline_sources": total_baseline_sources,
	        "total_found_sources": total_found_sources,
	        "total_overlap_count": total_overlap,
	        "overall_overlap_percentage": _percentage(
	            total_overlap, total_baseline_sources, total_found_sources
	        ),
	        "total_high_ranked_baseline_sources": total_high_ranked_baseline,
	        "total_high_ranked_found_sources": total_high_ranked_found,
	        "total_high_ranked_overlap_count": total_high_ranked_overlap,
	        "overall_high_ranked_overlap_percentage": _percentage(
	            total_high_ranked_overlap, total_high_ranked_baseline, total_high_ranked_found
	        ),
	        "avg_baseline_sources_per_question": round(total_baseline_sources / valid_questions,
	                                                   2) if valid_questions > 0 else 0,
	        "avg_found_sources_per_question": round(total_found_sources / valid_questions,
	                                                2) if valid_questions > 0 else 0,
	    }

	    return pd.DataFrame([cumulative_stats])


	async def get_unified_sources(question_id: int, source_finder_run_id: int, ranker_id: int):
	    """
	    Build the combined baseline / source-run view for one question.

	    Opens a connection, delegates to ``get_stats`` and returns its two results
	    in swapped order: ``(unified_results, stats_df)``.
	    """
	    async with get_async_connection() as conn:
	        stats_and_rows = await get_stats(conn, question_id, ranker_id, source_finder_run_id)
	        stats_df, unified_results = stats_and_rows
	        return unified_results, stats_df


	async def get_stats(conn, question_id, ranker_id, source_finder_run_id):
	    """Load run and baseline sources for a question and merge them into one view.

	    Args:
	        conn: Open connection exposing an awaitable ``fetch(query, *args)``.
	        question_id: Question whose sources are compared.
	        ranker_id: Baseline ranker whose sources are loaded.
	        source_finder_run_id: Source finder run whose results are loaded.

	    Returns:
	        tuple: ``(stats_df, unified_results)`` where ``stats_df`` comes from
	        ``calculate_baseline_vs_source_stats_for_question`` and
	        ``unified_results`` is a list with one dict per distinct source id
	        found in either collection.
	    """
	    # Get sources from source_runs
	    query_runs = """
	SELECT tb.tractate_chunk_id as id,
	sr.rank as source_rank,
	sr.tractate,
	sr.folio,
	sr.reason as source_reason
	FROM source_run_results sr
	join talmud_bavli tb on sr.sugya_id = tb.xml_id
	WHERE sr.question_id = $1
	AND sr.source_finder_run_id = $2
	"""
	    source_runs = await conn.fetch(query_runs, question_id, source_finder_run_id)
	    # Get sources from baseline_sources
	    query_baseline = """
	SELECT tb.tractate_chunk_id as id, bs.rank as baseline_rank, bs.tractate, bs.folio
	FROM baseline_sources bs
	join talmud_bavli tb on bs.sugya_id = tb.xml_id
	WHERE bs.question_id = $1
	AND bs.ranker_id = $2
	"""
	    baseline_sources = await conn.fetch(query_baseline, question_id, ranker_id)
	    stats_df = await calculate_baseline_vs_source_stats_for_question(baseline_sources, source_runs)

	    # Index both result sets by source id for constant-time lookup.
	    source_runs_dict = {s["id"]: dict(s) for s in source_runs}
	    baseline_dict = {s["id"]: dict(s) for s in baseline_sources}

	    unified_results = []
	    for sugya_id in set(source_runs_dict) | set(baseline_dict):
	        run_row = source_runs_dict.get(sugya_id)
	        baseline_row = baseline_dict.get(sugya_id)
	        in_source_run = run_row is not None
	        in_baseline = baseline_row is not None
	        # Tractate/folio are taken from the baseline row when there is one.
	        info = baseline_row if in_baseline else run_row
	        run_info = run_row if in_source_run else {}
	        baseline_info = baseline_row if in_baseline else {}
	        unified_results.append({
	            "id": sugya_id,
	            "tractate": info.get("tractate"),
	            "folio": info.get("folio"),
	            "in_baseline": "Yes" if in_baseline else "No",
	            "baseline_rank": baseline_info.get("baseline_rank", "N/A"),
	            "in_source_run": "Yes" if in_source_run else "No",
	            "source_run_rank": run_info.get("source_rank", "N/A"),
	            # The query aliases sr.reason as "source_reason"; looking up
	            # "reason" could never match and always yielded "N/A".
	            "source_reason": run_info.get("source_reason", "N/A"),
	            # NOTE(review): query_runs selects no "metadata" column, so this
	            # is always "" here — confirm whether it should be loaded.
	            "metadata": run_info.get("metadata", ""),
	        })
	    return stats_df, unified_results


	async def get_source_text(tractate_chunk_id: int):
	    """
	    Look up the ``text_with_nikud`` text of one tractate chunk.

	    Returns the text, or the string "Source text not found" when no row
	    matches ``tractate_chunk_id``.
	    """
	    async with get_async_connection() as conn:
	        row = await conn.fetchrow(
	            """
	SELECT tb.text_with_nikud as text
	FROM talmud_bavli tb
	WHERE tb.tractate_chunk_id = $1
	""",
	            tractate_chunk_id,
	        )
	        if not row:
	            return "Source text not found"
	        return row["text"]

	def get_pg_sync_connection(schema="talmudexplore"):
	    """Open a synchronous psycopg2 connection whose search_path is ``schema``.

	    Settings come from the ``pg_dbname``, ``pg_user``, ``pg_password``,
	    ``pg_host`` and ``pg_port`` environment variables. The caller is
	    responsible for closing the returned connection.
	    """
	    connect_kwargs = {
	        "dbname": os.getenv("pg_dbname"),
	        "user": os.getenv("pg_user"),
	        "password": os.getenv("pg_password"),
	        "host": os.getenv("pg_host"),
	        "port": os.getenv("pg_port"),
	        # search_path is passed as a server option at connect time.
	        "options": f"-c search_path={schema}",
	    }
	    return psycopg2.connect(**connect_kwargs)