Commit c1ff2ef, committed by Serhan Yılmaz
1 Parent(s): ccde14a
Commit message: init
Files changed:
- app.py (+158, -0)
- requirements.txt (+7, -0)
app.py
ADDED
@@ -0,0 +1,158 @@
import os
import logging
import json
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from openai import OpenAI
from typing import Any, Dict, List, Tuple
from dotenv import load_dotenv
from transformers import pipeline
import asyncio

# Import the required functions from the pipeline file
from pipeline_gradio_experimental import generate_basic_question, rank_questions_with_details

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load the SQuAD dataset
dataset = load_dataset("squad")

# Initialize the question answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def get_random_entry():
    random_index = random.randint(0, len(dataset['train']) - 1)
    entry = dataset['train'][random_index]
    return entry['context'], entry['answers']['text'][0], entry['question']

def generate_answer(context: str, question: str) -> str:
    try:
        result = qa_pipeline(question=question, context=context)
        return result['answer']
    except Exception as e:
        logger.error(f"Error in generate_answer: {e}")
        return "Failed to generate answer"

def compare_questions(context: str, original_answer: str, question1: str, answer1: str, question2: str, answer2: str) -> Dict[str, Any]:
    try:
        response = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": "You are an expert in evaluating question-answer pairs based on a given context."},
                {"role": "user", "content": f"""Compare the following two question-answer pairs based on the given context and original answer. Evaluate their quality and relevance.

Context: {context}
Original Answer: {original_answer}

Question 1: {question1}
Answer 1: {answer1}

Question 2: {question2}
Answer 2: {answer2}

Score each question-answer pair on a scale of 0 to 10 based on the quality and relevance of the question and answer. Provide an explanation for your evaluation. Focus on how well the new answer matches the old answer considering the context. Make sure to grade one higher than the other."""}
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "question_comparison_evaluator",
                    "strict": True,
                    "schema": {
                        "type": "object",
                        "properties": {
                            "question1_score": {"type": "number"},
                            "question2_score": {"type": "number"},
                            "explanation": {"type": "string"}
                        },
                        "required": ["question1_score", "question2_score", "explanation"],
                        "additionalProperties": False
                    }
                }
            }
        )
        return json.loads(response.choices[0].message.content)
    except Exception as e:
        logger.error(f"Error in comparing questions: {e}")
        return {"question1_score": 0, "question2_score": 0, "explanation": "Failed to compare questions"}

async def process_random_entry(progress=gr.Progress()):
    context, original_answer, original_question = get_random_entry()

    # Yield the original context, question, and answer immediately
    yield context, original_question, original_answer, gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

    # Simulate some processing time
    await asyncio.sleep(1)
    progress(0.3, desc="Generating questions...")

    basic_question = generate_basic_question(context, original_answer)
    _, _, enhanced_question = rank_questions_with_details(context, original_answer)

    await asyncio.sleep(1)
    progress(0.6, desc="Generating answers...")

    basic_answer = generate_answer(context, basic_question)
    enhanced_answer = generate_answer(context, enhanced_question)

    await asyncio.sleep(1)
    progress(0.9, desc="Comparing questions...")

    comparison_result = compare_questions(context, original_answer, basic_question, basic_answer, enhanced_question, enhanced_answer)

    winner = "Basic" if comparison_result["question1_score"] > comparison_result["question2_score"] else "Enhanced"

    # Yield the final results
    yield (
        context,
        original_question,
        original_answer,
        gr.update(visible=True),
        gr.update(visible=True, value=f"Question: {basic_question}\nAnswer: {basic_answer}"),
        gr.update(visible=True, value=f"Question: {enhanced_question}\nAnswer: {enhanced_answer}"),
        gr.update(visible=True, value=f"Question 1 Score: {comparison_result['question1_score']}\n"
                                      f"Question 2 Score: {comparison_result['question2_score']}\n"
                                      f"Explanation: {comparison_result['explanation']}\n"
                                      f"Winner: {winner} Generation")
    )

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Default()) as iface:
    gr.Markdown("# Question Generation and Comparison")
    gr.Markdown("Click the button to get a random entry from the SQuAD dataset and compare basic and enhanced question generation.")

    random_button = gr.Button("Get Random Question")

    with gr.Column(visible=False) as output_column:
        context_output = gr.Textbox(label="Original Context")
        original_question_output = gr.Textbox(label="Original Question")
        original_answer_output = gr.Textbox(label="Original Answer")
        basic_generation_output = gr.Textbox(label="Basic Generation", visible=False)
        enhanced_generation_output = gr.Textbox(label="Enhanced Generation", visible=False)
        comparison_result_output = gr.Textbox(label="Comparison Result", visible=False)

    random_button.click(
        fn=process_random_entry,
        outputs=[
            context_output,
            original_question_output,
            original_answer_output,
            output_column,
            basic_generation_output,
            enhanced_generation_output,
            comparison_result_output
        ]
    )

# Launch the app
if __name__ == "__main__":
    iface.launch()
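Note: app.py imports generate_basic_question and rank_questions_with_details from pipeline_gradio_experimental, a module that is not part of this commit. The sketch below is a hypothetical stub of the expected interface, inferred only from the call sites in app.py (each function takes the context and the original answer; rank_questions_with_details returns three values, of which only the third, the enhanced question, is used). The bodies are placeholders, not the actual implementation.

# pipeline_gradio_experimental.py (hypothetical stub, not part of this commit)
# Signatures inferred from the call sites in app.py; bodies are placeholders.
from typing import Any, Tuple

def generate_basic_question(context: str, answer: str) -> str:
    # app.py expects a single question string generated from the context/answer pair.
    raise NotImplementedError("provided by the real pipeline_gradio_experimental module")

def rank_questions_with_details(context: str, answer: str) -> Tuple[Any, Any, str]:
    # app.py unpacks three values and uses only the third (the enhanced question).
    raise NotImplementedError("provided by the real pipeline_gradio_experimental module")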
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
openai
numpy
sentence-transformers
transformers
python-dotenv
pandas
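Dependency note: app.py also imports load_dataset from the datasets library, which is not pinned above (torch, which the transformers question-answering pipeline needs as a backend, is normally pulled in transitively by sentence-transformers). Assuming the Space installs only what is listed here, one additional line would likely be needed:

datasets

To run the app locally, set OPENAI_API_KEY in the environment or in a .env file (load_dotenv reads it at startup) and launch the interface with:

python app.py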