arjunanand13 committed
Commit e1175ed
1 Parent(s): ababf21

Update app.py

Files changed (1):
  app.py (+81 -40)
app.py CHANGED
@@ -47,38 +47,60 @@ class RAGEvaluator:
         self.current_dataset = None
         self.test_samples = []
 
-    def load_dataset(self, dataset_name: str, num_samples: int = 5):
-        """Load a smaller subset of questions"""
-        if dataset_name == "squad":
-            dataset = load_dataset("squad_v2", split="validation")
-            # Select diverse questions based on length and type
-            samples = dataset.select(range(0, 1000, 100))[:num_samples]  # Take 10 spaced-out samples
-            self.test_samples = [
-                {
-                    "question": sample["question"],
-                    "ground_truth": sample["answers"]["text"][0] if sample["answers"]["text"] else "",
-                    "context": sample["context"]
-                }
-                for sample in samples
-                if sample["answers"]["text"]
-            ]
-        elif dataset_name == "msmarco":
-            dataset = load_dataset("ms_marco", "v2.1", split="train")
-            samples = dataset.select(range(0, 1000, 100))[:num_samples]
-            self.test_samples = [
-                {
-                    "question": sample["query"],
-                    "ground_truth": sample["answers"][0] if sample["answers"] else "",
-                    "context": sample["passages"]["passage_text"][0]
-                }
-                for sample in samples
-                if sample["answers"]
-            ]
-        self.current_dataset = dataset_name
-        return self.test_samples
+    def load_dataset(self, dataset_name: str, num_samples: int = 10):
+        """Load a smaller subset of questions with proper error handling"""
+        try:
+            if dataset_name == "squad":
+                dataset = load_dataset("squad_v2", split="validation")
+                # Select diverse questions
+                samples = dataset.select(range(0, 1000, 100))[:num_samples]
+
+                self.test_samples = []
+                for sample in samples:
+                    # Check if answers exist and are not empty
+                    if sample.get("answers") and isinstance(sample["answers"], dict) and sample["answers"].get("text"):
+                        self.test_samples.append({
+                            "question": sample["question"],
+                            "ground_truth": sample["answers"]["text"][0],
+                            "context": sample["context"]
+                        })
+
+            elif dataset_name == "msmarco":
+                dataset = load_dataset("ms_marco", "v2.1", split="dev")
+                samples = dataset.select(range(0, 1000, 100))[:num_samples]
+
+                self.test_samples = []
+                for sample in samples:
+                    # Check for valid answers
+                    if sample.get("answers") and sample["answers"]:
+                        self.test_samples.append({
+                            "question": sample["query"],
+                            "ground_truth": sample["answers"][0],
+                            "context": sample["passages"][0]["passage_text"]
+                                       if isinstance(sample["passages"], list)
+                                       else sample["passages"]["passage_text"][0]
+                        })
+
+            self.current_dataset = dataset_name
+
+            # Return dataset info
+            return {
+                "dataset": dataset_name,
+                "num_samples": len(self.test_samples),
+                "sample_questions": [s["question"] for s in self.test_samples[:3]],
+                "status": "success"
+            }
+
+        except Exception as e:
+            print(f"Error loading dataset: {str(e)}")
+            return {
+                "dataset": dataset_name,
+                "error": str(e),
+                "status": "failed"
+            }
 
     def evaluate_configuration(self, vector_db, qa_chain, splitting_strategy: str, chunk_size: str) -> Dict:
-        """Evaluate with progress tracking"""
+        """Evaluate with progress tracking and error handling"""
         if not self.test_samples:
            return {"error": "No dataset loaded"}
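Aside: the new answers guard in the hunk above matters because SQuAD v2 deliberately includes unanswerable questions, whose answers field carries empty lists; indexing ["text"][0] on such a record raises an IndexError. A minimal sketch of the filtering idea, using two made-up records:

    # Made-up records: SQuAD v2 marks unanswerable questions with empty
    # "text"/"answer_start" lists; the commit's guard drops them.
    answerable = {"answers": {"text": ["Denver"], "answer_start": [12]}}
    unanswerable = {"answers": {"text": [], "answer_start": []}}

    kept = [s for s in (answerable, unanswerable)
            if s.get("answers", {}).get("text")]
    assert kept == [answerable]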
@@ -105,11 +127,17 @@ class RAGEvaluator:
                 print(f"Error processing question {i+1}: {str(e)}")
                 continue
 
-        # Calculate RAGAS metrics
-        eval_dataset = Dataset.from_list(results)
-        metrics = [ContextRecall(), AnswerRelevancy(), Faithfulness(), ContextPrecision()]
-
+        if not results:
+            return {
+                "configuration": f"{splitting_strategy}_{chunk_size}",
+                "error": "No successful evaluations",
+                "questions_evaluated": 0
+            }
+
         try:
+            # Calculate RAGAS metrics
+            eval_dataset = Dataset.from_list(results)
+            metrics = [ContextRecall(), AnswerRelevancy(), Faithfulness(), ContextPrecision()]
             scores = evaluate(eval_dataset, metrics=metrics)
 
             return {
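For reference, a standalone sketch of the RAGAS call pattern this hunk moves inside the try block. The single hand-written record and its column names ("question", "answer", "contexts", "ground_truth") are assumptions, since the expected columns vary across ragas versions; app.py builds `results` from real retrieval runs instead:

    # Sketch only: one fabricated record in the shape evaluate() commonly
    # expects; not the app's actual data.
    from datasets import Dataset
    from ragas import evaluate
    from ragas.metrics import (AnswerRelevancy, ContextPrecision,
                               ContextRecall, Faithfulness)

    results = [{
        "question": "What does RAG stand for?",
        "answer": "Retrieval-augmented generation.",
        "contexts": ["RAG couples a retriever with a generator."],
        "ground_truth": "Retrieval-augmented generation.",
    }]

    eval_dataset = Dataset.from_list(results)
    metrics = [ContextRecall(), AnswerRelevancy(), Faithfulness(), ContextPrecision()]
    scores = evaluate(eval_dataset, metrics=metrics)  # one aggregate score per metric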
@@ -417,12 +445,25 @@ def demo():
         )
 
         def load_dataset_handler(dataset_name):
-            samples = evaluator.load_dataset(dataset_name)
-            return {
-                "dataset": dataset_name,
-                "num_samples": len(samples),
-                "sample_questions": [s["question"] for s in samples[:3]]
-            }
+            try:
+                result = evaluator.load_dataset(dataset_name)
+                if result.get("status") == "success":
+                    return {
+                        "dataset": result["dataset"],
+                        "samples_loaded": result["num_samples"],
+                        "example_questions": result["sample_questions"],
+                        "status": "ready for evaluation"
+                    }
+                else:
+                    return {
+                        "error": result.get("error", "Unknown error occurred"),
+                        "status": "failed to load dataset"
+                    }
+            except Exception as e:
+                return {
+                    "error": str(e),
+                    "status": "failed to load dataset"
+                }
 
         def run_evaluation(dataset_choice, splitting_strategy, chunk_size, vector_db, qa_chain):
             if not evaluator.current_dataset:
 
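Taken together, the commit changes load_dataset() from returning a bare sample list to returning a status dict, which load_dataset_handler() now unpacks. A hypothetical driver showing the new contract (the import path and the instantiation are assumptions, not part of the commit):

    # Hypothetical smoke test for the new status-dict contract.
    from app import RAGEvaluator  # assumes app.py is importable as a module

    evaluator = RAGEvaluator()
    info = evaluator.load_dataset("squad", num_samples=10)

    if info.get("status") == "success":
        print(f"Loaded {info['num_samples']} samples from {info['dataset']}")
        for q in info["sample_questions"]:
            print(" -", q)
    else:
        print("Load failed:", info.get("error"))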