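"""Quiz Generator.

Gradio app that splits a document into token-bounded parts, asks Gemini
(via langchain_google_genai) to segment each part into topics, and produces
key concepts, summaries, and multiple-choice quiz questions per segment.
"""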
import re
import json
import os
import time

import gradio as gr
from transformers import AutoTokenizer
from langchain_google_genai import ChatGoogleGenerativeAI

# The tokenizer is only used to count tokens and choose split points.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
def clean_text(text):
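    """Remove [speaker_N] tags and collapse runs of whitespace."""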
    text = re.sub(r'\[speaker_\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def split_text_by_tokens(text, max_tokens=8000):
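    """Return the cleaned text as a single chunk, or split it into two halves
    of roughly equal token count at sentence boundaries."""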
    text = clean_text(text)
    tokens = tokenizer.encode(text)
    if len(tokens) <= max_tokens:
        return [text]
    split_point = len(tokens) // 2
    sentences = re.split(r'(?<=[.!?])\s+', text)
    first_half = []
    second_half = []
    current_tokens = 0
    in_second_half = False
    for sentence in sentences:
        sentence_tokens = len(tokenizer.encode(sentence))
        # Once the split point is reached, keep every remaining sentence in the
        # second half so the original sentence order is preserved.
        if not in_second_half and current_tokens + sentence_tokens <= split_point:
            first_half.append(sentence)
            current_tokens += sentence_tokens
        else:
            in_second_half = True
            second_half.append(sentence)
    return [" ".join(first_half), " ".join(second_half)]
def analyze_segment_with_gemini(segment_text):
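    """Ask Gemini to segment the text into topics and build quiz questions.

    Returns the parsed JSON response, or a fallback structure if the
    response cannot be parsed as JSON.
    """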
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.7,
        max_tokens=None,
        timeout=None,
        max_retries=3
    )
    prompt = f"""
Analyze the following text and perform text segmentation, identifying the distinct segments within it:
1. Identify STRICTLY no more than 15 segments.
2. For each segment/topic you identify:
- Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
- List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
- Write a brief summary of that segment (3-5 sentences)
- Create 5 high-quality, meaningful quiz questions based DIRECTLY on the content in that segment only
- Questions and answers should be only from the content of the segment
For each quiz question:
- Create one correct answer that comes DIRECTLY from the text
- Create two plausible but incorrect answers
- IMPORTANT: Ensure all answer options have similar length (± 3 words)
- Ensure the correct answer is clearly indicated with a ✓ symbol
- Questions should **require actual understanding**, NOT just basic fact recall.
- Questions must be **non-trivial**, encourage deeper thinking, and **avoid surface-level facts**.
- Questions must be **directly based on the segment's content** (not inferred from the summary).
- Do **not include questions about document structure** (e.g., title, number of paragraphs).
- Do **not generate overly generic or obvious questions** (e.g., "What is mentioned in the text?").
- Focus on **core ideas, logical reasoning, and conceptual understanding**.
ADDITIONAL REQUIREMENT:
- **First, detect the language of the original text.**
- **Generate ALL output (topic names, key concepts, summaries, and quizzes) in the same language as the original text.**
- If the text is in Russian, generate all responses in Russian.
- If the text is in another language, generate responses in that original language.
Text:
{segment_text}
Format your response as JSON with the following structure:
{{
"segments": [
{{
"topic_name": "Unique and Specific Topic Name",
"key_concepts": ["concept1", "concept2", "concept3"],
"summary": "Brief summary of this segment.",
"quiz_questions": [
{{
"question": "Question text?",
"options": [
{{
"text": "Option A",
"correct": false
}},
{{
"text": "Option B",
"correct": true
}},
{{
"text": "Option C",
"correct": false
}}
]
}}
]
}}
]
}}
IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
- **Do NOT repeat** key concepts across multiple segments unless absolutely necessary.
- **Ensure the quiz questions challenge the reader** and **are not easily guessable**.
"""
    response = llm.invoke(prompt)
    response_text = response.content
    try:
        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if json_match:
            return json.loads(json_match.group(0))
        else:
            return json.loads(response_text)
    except json.JSONDecodeError:
        return {
            "segments": [
                {
                    "topic_name": "JSON Parsing Error",
                    "key_concepts": ["Error in response format"],
                    "summary": "Could not parse the API response.",
                    "quiz_questions": []
                }
            ]
        }
def process_document_with_quiz(text):
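    """Split the document if needed, analyze each part with Gemini, and
    return a flat list of numbered segments."""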
    start_time = time.time()
    token_count = len(tokenizer.encode(text))
    print(f"[LOG] Total document tokens: {token_count}")
    if token_count > 8000:
        print("[LOG] Document exceeds 8000 tokens. Splitting into parts.")
        parts = split_text_by_tokens(text)
        print(f"[LOG] Document split into {len(parts)} parts")
        for i, part in enumerate(parts):
            part_tokens = len(tokenizer.encode(part))
            print(f"[LOG] Part {i+1} contains {part_tokens} tokens")
    else:
        print("[LOG] Document under 8000 tokens. Processing as a single part.")
        parts = [text]
    all_segments = []
    segment_counter = 1
    for i, part in enumerate(parts):
        part_start_time = time.time()
        print(f"[LOG] Processing part {i+1}...")
        analysis = analyze_segment_with_gemini(part)
        if "segments" in analysis:
            print(f"[LOG] Found {len(analysis['segments'])} segments in part {i+1}")
            for segment in analysis["segments"]:
                segment["segment_number"] = segment_counter
                all_segments.append(segment)
                print(f"[LOG] Segment {segment_counter}: {segment['topic_name']}")
                segment_counter += 1
        else:
            # Fallback if the response format is unexpected
            print(f"[LOG] Error: Unexpected format in part {i+1} analysis")
            fallback_segment = {
                "topic_name": f"Segment {segment_counter} Analysis",
                "key_concepts": ["Format error in analysis"],
                "summary": "Could not properly segment this part of the text.",
                "quiz_questions": [],
                "segment_number": segment_counter
            }
            all_segments.append(fallback_segment)
            print(f"[LOG] Added fallback segment {segment_counter}")
            segment_counter += 1
        part_time = time.time() - part_start_time
        print(f"[LOG] Part {i+1} processed in {part_time:.2f} seconds")
    total_time = time.time() - start_time
    print(f"[LOG] Total processing time: {total_time:.2f} seconds")
    print(f"[LOG] Generated {len(all_segments)} segments total")
    return all_segments
def format_quiz_for_display(results):
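    """Render the segment list as human-readable text with key concepts,
    summaries, and quiz questions (correct answers marked with ✓)."""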
    output = []
    for segment in results:
        topic = segment["topic_name"]
        segment_num = segment["segment_number"]
        output.append(f"\n\n{'='*40}")
        output.append(f"SEGMENT {segment_num}: {topic}")
        output.append(f"{'='*40}\n")
        output.append("KEY CONCEPTS:")
        for concept in segment["key_concepts"]:
            output.append(f"• {concept}")
        output.append("\nSUMMARY:")
        output.append(segment["summary"])
        output.append("\nQUIZ QUESTIONS:")
        for i, q in enumerate(segment["quiz_questions"]):
            output.append(f"\n{i+1}. {q['question']}")
            for j, option in enumerate(q['options']):
                letter = chr(65 + j)  # A, B, C, ...
                correct_marker = " ✓" if option["correct"] else ""
                output.append(f"  {letter}. {option['text']}{correct_marker}")
    return "\n".join(output)
def save_results_as_json(results, filename="analysis_results.json"):
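    """Write the results list to a JSON file and return the filename."""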
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    return filename
def save_results_as_txt(formatted_text, filename="analysis_results.txt"):
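    """Write the formatted text to a plain-text file and return the filename."""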
    with open(filename, "w", encoding="utf-8") as f:
        f.write(formatted_text)
    return filename
def analyze_document(document_text, api_key):
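    """Gradio handler: analyze the document and return the formatted report
    plus paths to the JSON and TXT result files."""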
    print("[LOG] Starting document analysis...")
    overall_start_time = time.time()
    os.environ["GOOGLE_API_KEY"] = api_key
    try:
        results = process_document_with_quiz(document_text)
        formatted_output = format_quiz_for_display(results)
        json_path = "analysis_results.json"
        txt_path = "analysis_results.txt"
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(formatted_output)
        overall_time = time.time() - overall_start_time
        print(f"[LOG] Document analysis completed in {overall_time:.2f} seconds")
        topics_summary = "DOCUMENT ANALYSIS SUMMARY:\n"
        topics_summary += f"Total segments: {len(results)}\n"
        topics_summary += f"Processing time: {overall_time:.2f} seconds\n\n"
        topics_summary += "SEGMENTS:\n"
        for segment in results:
            topics_summary += f"- Segment {segment['segment_number']}: {segment['topic_name']}\n"
        formatted_output = topics_summary + "\n" + formatted_output
        return formatted_output, json_path, txt_path
    except Exception as e:
        error_msg = f"Error processing document: {str(e)}"
        print(f"[LOG] ERROR: {error_msg}")
        return error_msg, None, None
with gr.Blocks(title="Quiz Generator") as app:
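    # Two-column layout: document text and API key on the left,
    # analysis results and downloadable files on the right.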
    gr.Markdown("# Quiz Generator")
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Document Text",
                placeholder="Paste your document text here...",
                lines=10
            )
            api_key = gr.Textbox(
                label="Gemini API Key",
                placeholder="Enter your Gemini API key",
                type="password"
            )
            analyze_btn = gr.Button("Analyze Document")
        with gr.Column():
            output_results = gr.Textbox(
                label="Analysis Results",
                lines=20
            )
            json_file_output = gr.File(label="Download JSON")
            txt_file_output = gr.File(label="Download TXT")
    analyze_btn.click(
        fn=analyze_document,
        inputs=[input_text, api_key],
        outputs=[output_results, json_file_output, txt_file_output]
    )
if __name__ == "__main__":
    app.launch()