aquibmoin committed on
Commit
bdd0699
·
verified ·
1 Parent(s): 3bf3829

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -5
app.py CHANGED
@@ -81,12 +81,21 @@ def interpret_ragas_results_with_gpt(formatted_scores: list, llm) -> str:
81
  score_text = "\n".join([f"{k}: {v}" for k, v in formatted_scores[0].items()])
82
 
83
  prompt = f"""
84
- You are an expert in RAGAS evaluation metrics to evaluate AI-generated content. Based on the following RAGAS evaluation scores, provide a concise interpretation of each of the metric for the evaluation of AI-generated text. Write in a professional, clear, and objective tone.
 
 
 
 
 
 
 
 
 
85
 
86
  RAGAS Scores:
87
  {score_text}
88
 
89
- Provide a paragraph-style interpretation.
90
  """
91
 
92
  response = llm.invoke(prompt)
@@ -107,7 +116,7 @@ def generate_word_report(science_goal, ragas_results, radar_chart_path, interpre
107
  doc.add_heading("RAGAS Metrics Chart", level=1)
108
  doc.add_picture(radar_chart_path, width=Inches(5))
109
 
110
- doc.add_heading("GPT Interpretation", level=1)
111
  doc.add_paragraph(interpretation)
112
 
113
  output_path = "SCDD_Evaluation_Report.docx"
@@ -169,9 +178,9 @@ interface = gr.Interface(
169
  gr.Textbox(label="Science Goal", placeholder="Enter science goal here..."),
170
  ],
171
  outputs=[
172
- gr.JSON(label="RAGAS Scores"),
173
  gr.Image(label="RAGAS Metrics Radar Chart"),
174
- gr.Textbox(label="GPT Interpretation of RAGAS Results"),
175
  gr.File(label="Download Word Report")
176
  ],
177
  title="RAGAS Evaluation: AI vs Human SCDD",
 
81
  score_text = "\n".join([f"{k}: {v}" for k, v in formatted_scores[0].items()])
82
 
83
  prompt = f"""
84
+ You are an expert in RAGAS evaluation metrics to evaluate AI-generated content.
85
+
86
+ The following RAGAS evaluation scores are from a comparison between an AI-generated scientific case development document (SCDD) and a human-written version. This evaluation is conducted in the context of exploratory and novel scientific use cases — not strict academic summaries. The AI-generated document may include new ideas, restructured concepts, or facts not explicitly mentioned in the human reference.
87
+
88
+ When interpreting the metrics, adopt a constructive and exploratory perspective. In particular:
89
+
90
+ - **Lower factual correctness or accuracy scores or response groundedness scores** do not necessarily indicate factual errors. They may reflect the presence of new, valid information introduced by the AI that isn’t present in the human document.
91
+ - **Semantic similarity** and **faithfulness** may vary due to phrasing, abstraction, or granularity, and should be considered within the context of novelty and creativity.
92
+ - AI-generated document may be identifying gaps or elements missing from the human reference.
93
+ - Interpret each score clearly, explaining both strengths and areas where alignment may differ, without penalizing innovation or deeper insight.
94
 
95
  RAGAS Scores:
96
  {score_text}
97
 
98
+ Provide a short paragraph interpretation for each metric.
99
  """
100
 
101
  response = llm.invoke(prompt)
 
116
  doc.add_heading("RAGAS Metrics Chart", level=1)
117
  doc.add_picture(radar_chart_path, width=Inches(5))
118
 
119
+ doc.add_heading("GPT-4.1 Interpretation of RAGAS AI-SCDD Evaluation", level=1)
120
  doc.add_paragraph(interpretation)
121
 
122
  output_path = "SCDD_Evaluation_Report.docx"
 
178
  gr.Textbox(label="Science Goal", placeholder="Enter science goal here..."),
179
  ],
180
  outputs=[
181
+ gr.JSON(label="RAGAS Evaluation Scores"),
182
  gr.Image(label="RAGAS Metrics Radar Chart"),
183
+ gr.Textbox(label="GPT-4.1 Interpretation of RAGAS Results"),
184
  gr.File(label="Download Word Report")
185
  ],
186
  title="RAGAS Evaluation: AI vs Human SCDD",