Spaces:

MrSimple01
/

QuizGenerator

Running

App Files Community

MrSimple01 committed on Mar 19

Commit

fadb17a

verified ·

1 Parent(s): e541939

Update app.py

Browse files

Files changed (1) hide show

app.py +171 -329

app.py CHANGED Viewed

@@ -3,155 +3,47 @@ import numpy as np
 import json
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer
-from sklearn.cluster import AgglomerativeClustering
-from sklearn.metrics.pairwise import cosine_distances
 from langchain_google_genai import ChatGoogleGenerativeAI
 import os
 import gradio as gr
 tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
 sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
-max_tokens = 3000
 def clean_text(text):
     text = re.sub(r'\[speaker_\d+\]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
-def split_text_with_modernbert_tokenizer(text):
-    text = clean_text(text)
-    rough_splits = re.split(r'(?<=[.!?])\s+', text)
-    segments = []
-    current_segment = ""
-    current_token_count = 0
-    for sentence in rough_splits:
-        if not sentence.strip():
-            continue
-        sentence_tokens = len(tokenizer.encode(sentence, add_special_tokens=False))
-        if (current_token_count + sentence_tokens > 100 or
-            re.search(r'[.!?]$', current_segment.strip())):
-            if current_segment:
-                segments.append(current_segment.strip())
-            current_segment = sentence
-            current_token_count = sentence_tokens
-        else:
-            current_segment += " " + sentence if current_segment else sentence
-            current_token_count += sentence_tokens
-    if current_segment:
-        segments.append(current_segment.strip())
-    refined_segments = []
-    for segment in segments:
-        if len(segment.split()) < 3:
-            if refined_segments:
-                refined_segments[-1] += ' ' + segment
-            else:
-                refined_segments.append(segment)
-            continue
-        tokens = tokenizer.tokenize(segment)
-        if len(tokens) < 50:
-            refined_segments.append(segment)
-            continue
-        break_indices = [i for i, token in enumerate(tokens)
-                        if ('.' in token or ',' in token or '?' in token or '!' in token)
-                        and i < len(tokens) - 1]
-        if not break_indices or break_indices[-1] < len(tokens) * 0.7:
-            refined_segments.append(segment)
-            continue
-        mid_idx = break_indices[len(break_indices) // 2]
-        first_half = tokenizer.convert_tokens_to_string(tokens[:mid_idx+1])
-        second_half = tokenizer.convert_tokens_to_string(tokens[mid_idx+1:])
-        refined_segments.append(first_half.strip())
-        refined_segments.append(second_half.strip())
-    return refined_segments
-def semantic_chunking(text):
-    segments = split_text_with_modernbert_tokenizer(text)
-    segment_embeddings = sentence_model.encode(segments)
-    distances = cosine_distances(segment_embeddings)
-    agg_clustering = AgglomerativeClustering(
-        n_clusters=None,
-        distance_threshold=1,
-        metric='precomputed',
-        linkage='average'
-    )
-    clusters = agg_clustering.fit_predict(distances)
-    # Group segments by cluster
-    cluster_groups = {}
-    for i, cluster_id in enumerate(clusters):
-        if cluster_id not in cluster_groups:
-            cluster_groups[cluster_id] = []
-        cluster_groups[cluster_id].append(segments[i])
-    chunks = []
-    for cluster_id in sorted(cluster_groups.keys()):
-        cluster_segments = cluster_groups[cluster_id]
-        current_chunk = []
-        current_token_count = 0
-        for segment in cluster_segments:
-            segment_tokens = len(tokenizer.encode(segment, truncation=True, add_special_tokens=True))
-            if segment_tokens > max_tokens:
-                if current_chunk:
-                    chunks.append(" ".join(current_chunk))
-                    current_chunk = []
-                    current_token_count = 0
-                chunks.append(segment)
-                continue
-            if current_token_count + segment_tokens > max_tokens and current_chunk:
-                chunks.append(" ".join(current_chunk))
-                current_chunk = [segment]
-                current_token_count = segment_tokens
-            else:
-                current_chunk.append(segment)
-                current_token_count += segment_tokens
-        if current_chunk:
-            chunks.append(" ".join(current_chunk))
-    if len(chunks) > 1:
-        chunk_embeddings = sentence_model.encode(chunks)
-        chunk_similarities = 1 - cosine_distances(chunk_embeddings)
-        i = 0
-        while i < len(chunks) - 1:
-            j = i + 1
-            if chunk_similarities[i, j] > 0.75:
-                combined = chunks[i] + " " + chunks[j]
-                combined_tokens = len(tokenizer.encode(combined, truncation=True, add_special_tokens=True))
-                if combined_tokens <= max_tokens:
-                    # Merge chunks
-                    chunks[i] = combined
-                    chunks.pop(j)
-                    chunk_embeddings = sentence_model.encode(chunks)
-                    chunk_similarities = 1 - cosine_distances(chunk_embeddings)
-                else:
-                    i += 1
-            else:
-                i += 1
-    return chunks
-def analyze_segment_with_gemini(cluster_text, is_full_text=False):
     llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
         temperature=0.7,
@@ -159,240 +51,158 @@ def analyze_segment_with_gemini(cluster_text, is_full_text=False):
         timeout=None,
         max_retries=3
     )
-    if len(cluster_text.split()) < 50:
-        return {
-            "status": "insufficient",
-            "reason": f"Text is too short ({len(cluster_text.split())} words). Minimum 50 words required for analysis."
-        }
-    if is_full_text:
-        prompt = f"""
-            FIRST ASSESS THE TEXT:
-            - Check if it's primarily self-introduction, biographical information, or conclusion
-            - Check if it's too short or lacks meaningful content (less than 100 words of substance)
-            - If either case is true, respond with a simple JSON: {{"status": "insufficient", "reason": "Brief explanation"}}
-            Analyze the following text:
-            FIRST ASSESS THE TEXT:
-            - Is it primarily self-introduction, biographical information, or conclusion?
-            - Does it lack meaningful content for analysis?
-            IF THE TEXT IS INSUFFICIENT (introductory, concluding, or lacking substance):
-            Return ONLY this JSON structure:
-            {{
-                "status": "insufficient",
-                "reason": "Brief explanation (e.g., 'Text is primarily self-introduction', 'Text lacks substantive content')"
-            }}
-            IF THE TEXT HAS SUFFICIENT MEANINGFUL CONTENT:
-            1. First, do text segmentation and identify DISTINCT key topics within the text
-            2. For each segment/topic you identify:
-               - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
-               - List 3-5 key concepts discussed in that segment
-               - Write a brief summary of that segment (3-5 sentences)
-               - Create 5 quiz questions based DIRECTLY on the content in that segment
-            For each quiz question:
-            - Create one correct answer that comes DIRECTLY from the text
-            - Create two plausible but incorrect answers
-            - IMPORTANT: Ensure all answer options have similar length (± 3 words)
-            - Ensure the correct answer is clearly indicated
-            Text:
-            {cluster_text}
-            Format your response as JSON with the following structure:
-            {{
-                "segments": [
-                    {{
-                        "topic_name": "Name of segment 1",
-                        "key_concepts": ["concept1", "concept2", "concept3"],
-                        "summary": "Brief summary of this segment.",
-                        "quiz_questions": [
-                            {{
-                                "question": "Question text?",
-                                "options": [
-                                    {{
-                                        "text": "Option A",
-                                        "correct": false
-                                    }},
-                                    {{
-                                        "text": "Option B",
-                                        "correct": true
-                                    }},
-                                    {{
-                                        "text": "Option C",
-                                        "correct": false
-                                    }}
-                                ]
-                            }},
-                            // More questions...
-                        ]
-                    }},
-                    // More segments...
-                ]
-            }}
-        """
-    else:
-        prompt = f"""
-            Analyze the following text segment and provide:
-            FIRST ASSESS THE TEXT:
-            - Is it primarily self-introduction, biographical information, or conclusion?
-            - Does it lack meaningful content for analysis?
-            IF THE TEXT IS INSUFFICIENT (introductory, concluding, or lacking substance):
-            Return ONLY this JSON structure:
-            {{
-                "status": "insufficient",
-                "reason": "Brief explanation (e.g., 'Text is primarily self-introduction', 'Text lacks substantive content')"
-            }}
-            IF THE TEXT HAS SUFFICIENT MEANINGFUL CONTENT:
-            1. A SPECIFIC and DESCRIPTIVE topic name (3-5 words) that precisely captures the main focus
-            2. 3-5 key concepts discussed
-            3. A brief summary (6-7 sentences)
-            4. Create 5 quiz questions based DIRECTLY on the text content (not from your summary)
-            For each quiz question:
-            - Create one correct answer that comes DIRECTLY from the text
-            - Create two plausible but incorrect answers
-            - IMPORTANT and STRICTLY: Ensure all answer options have similar length (± 3 words)
-            - Ensure the correct answer is clearly indicated
-            Text segment:
-            {cluster_text}
-            Format your response as JSON with the following structure:
-            {{
-                "topic_name": "Name of the topic",
-                "key_concepts": ["concept1", "concept2", "concept3"],
-                "summary": "Brief summary of the text segment.",
-                "quiz_questions": [
-                    {{
-                        "question": "Question text?",
-                        "options": [
-                            {{
-                                "text": "Option A",
-                                "correct": false
-                            }},
-                            {{
-                                "text": "Option B",
-                                "correct": true
-                            }},
-                            {{
-                                "text": "Option C",
-                                "correct": false
-                            }}
-                        ]
-                    }},
-                    // More questions...
-                ]
-            }}
-        """
-    response = llm.invoke(prompt)
     response_text = response.content
     try:
         json_match = re.search(r'\{[\s\S]*\}', response_text)
         if json_match:
-            response_json = json.loads(json_match.group(0))
-        else:
-            response_json = json.loads(response_text)
-        return response_json
-    except json.JSONDecodeError as e:
-        print(f"Error parsing JSON response: {e}")
-        print(f"Raw response: {response_text}")
-        if is_full_text:
-            return {
-                "segments": [
-                    {
-                        "topic_name": "JSON Parsing Error",
-                        "key_concepts": ["Error in response format"],
-                        "summary": f"Could not parse the API response. Raw text: {response_text[:200]}...",
-                        "quiz_questions": []
-                    }
-                ]
-            }
         else:
-            return {
-                "topic_name": "JSON Parsing Error",
-                "key_concepts": ["Error in response format"],
-                "summary": f"Could not parse the API response. Raw text: {response_text[:200]}...",
-                "quiz_questions": []
-            }
 def process_document_with_quiz(text):
     token_count = len(tokenizer.encode(text))
-    print(f"Text contains {token_count} tokens")
-    if token_count < 8000:
-        print("Text is short enough to analyze directly without text segmentation")
-        full_analysis = analyze_segment_with_gemini(text, is_full_text=True)
-        results = []
-        if "segments" in full_analysis:
-            for i, segment in enumerate(full_analysis["segments"]):
-                segment["segment_number"] = i + 1
-                segment["segment_text"] = "Segment identified by Gemini"
-                results.append(segment)
-            print(f"Gemini identified {len(results)} segments in the text")
-        else:
-            print("Unexpected response format from Gemini")
-            results = [full_analysis]
-        return results
-    chunks = semantic_chunking(text)
-    print(f"{len(chunks)} semantic chunks were found\n")
-    results = []
-    for i, chunk in enumerate(chunks):
-        print(f"Analyzing segment {i+1}/{len(chunks)}...")
-        analysis = analyze_segment_with_gemini(chunk, is_full_text=False)
-        analysis["segment_number"] = i + 1
-        analysis["segment_text"] = chunk
-        results.append(analysis)
-        print(f"Completed analysis of segment {i+1}: {analysis['topic_name']}")
-    return results
-def save_results_to_file(results, output_file="analysis_results.json"):
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(results, f, indent=2, ensure_ascii=False)
-    print(f"Results saved to {output_file}")
 def format_quiz_for_display(results):
     output = []
-    for segment_result in results:
-        segment_num = segment_result["segment_number"]
-        topic = segment_result["topic_name"]
         output.append(f"\n\n{'='*40}")
         output.append(f"SEGMENT {segment_num}: {topic}")
         output.append(f"{'='*40}\n")
         output.append("KEY CONCEPTS:")
-        for concept in segment_result["key_concepts"]:
             output.append(f"• {concept}")
         output.append("\nSUMMARY:")
-        output.append(segment_result["summary"])
         output.append("\nQUIZ QUESTIONS:")
-        for i, q in enumerate(segment_result["quiz_questions"]):
             output.append(f"\n{i+1}. {q['question']}")
             for j, option in enumerate(q['options']):
@@ -402,22 +212,52 @@ def format_quiz_for_display(results):
     return "\n".join(output)
-def analyze_document(document_text: str, api_key: str) -> tuple:
     os.environ["GOOGLE_API_KEY"] = api_key
     try:
         results = process_document_with_quiz(document_text)
         formatted_output = format_quiz_for_display(results)
         json_path = "analysis_results.json"
         txt_path = "analysis_results.txt"
         with open(json_path, "w", encoding="utf-8") as f:
             json.dump(results, f, indent=2, ensure_ascii=False)
         with open(txt_path, "w", encoding="utf-8") as f:
             f.write(formatted_output)
         return formatted_output, json_path, txt_path
     except Exception as e:
         error_msg = f"Error processing document: {str(e)}"
         return error_msg, None, None
 with gr.Blocks(title="Quiz Generator") as app:
@@ -426,17 +266,19 @@ with gr.Blocks(title="Quiz Generator") as app:
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
-                label="Input Text",
                 placeholder="Paste your document text here...",
                 lines=10
             )
             api_key = gr.Textbox(
                 label="Gemini API Key",
                 placeholder="Enter your Gemini API key",
                 type="password"
             )
             analyze_btn = gr.Button("Analyze Document")
         with gr.Column():
             output_results = gr.Textbox(
                 label="Analysis Results",

 import json
 from sentence_transformers import SentenceTransformer
 from transformers import AutoTokenizer
 from langchain_google_genai import ChatGoogleGenerativeAI
 import os
 import gradio as gr
+import time
 tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
 sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
 def clean_text(text):
     text = re.sub(r'\[speaker_\d+\]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
+def split_text_by_tokens(text, max_tokens=8000):
+    text = clean_text(text)
+    tokens = tokenizer.encode(text)
+    if len(tokens) <= max_tokens:
+        return [text]
+    split_point = len(tokens) // 2
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    first_half = []
+    second_half = []
+    current_tokens = 0
+    for sentence in sentences:
+        sentence_tokens = len(tokenizer.encode(sentence))
+        if current_tokens + sentence_tokens <= split_point:
+            first_half.append(sentence)
+            current_tokens += sentence_tokens
+        else:
+            second_half.append(sentence)
+    return [" ".join(first_half), " ".join(second_half)]
+def analyze_segment_with_gemini(segment_text):
     llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
         temperature=0.7,
         timeout=None,
         max_retries=3
     )
+    prompt = f"""
+        Analyze the following text and identify distinct segments within it and do text segmentation:
+        1. Segments should be STRICTLY max=10
+        2. For each segment/topic you identify:
+           - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
+           - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
+           - Write a brief summary of that segment (3-5 sentences)
+           - Create 5 quiz questions based DIRECTLY on the content in that segment only
+        For each quiz question:
+        - Create one correct answer that comes DIRECTLY from the text
+        - Create two plausible but incorrect answers
+        - IMPORTANT: Ensure all answer options have similar length (± 3 words)
+        - Ensure the correct answer is clearly indicated with a ✓ symbol
+        Text:
+        {segment_text}
+        Format your response as JSON with the following structure:
+        {{
+            "segments": [
+                {{
+                    "topic_name": "Unique and Specific Topic Name",
+                    "key_concepts": ["concept1", "concept2", "concept3"],
+                    "summary": "Brief summary of this segment.",
+                    "quiz_questions": [
+                        {{
+                            "question": "Question text?",
+                            "options": [
+                                {{
+                                    "text": "Option A",
+                                    "correct": false
+                                }},
+                                {{
+                                    "text": "Option B",
+                                    "correct": true
+                                }},
+                                {{
+                                    "text": "Option C",
+                                    "correct": false
+                                }}
+                            ]
+                        }}
+                    ]
+                }}
+            ]
+        }}
+        IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
+    """
+    response = llm.invoke(prompt)
     response_text = response.content
     try:
         json_match = re.search(r'\{[\s\S]*\}', response_text)
         if json_match:
+            return json.loads(json_match.group(0))
         else:
+            return json.loads(response_text)
+    except json.JSONDecodeError:
+        return {
+            "segments": [
+                {
+                    "topic_name": "JSON Parsing Error",
+                    "key_concepts": ["Error in response format"],
+                    "summary": "Could not parse the API response.",
+                    "quiz_questions": []
+                }
+            ]
+        }
 def process_document_with_quiz(text):
+    start_time = time.time()
     token_count = len(tokenizer.encode(text))
+    print(f"[LOG] Total document tokens: {token_count}")
+    if token_count > 8000:
+        print(f"[LOG] Document exceeds 8000 tokens. Splitting into parts.")
+        parts = split_text_by_tokens(text)
+        print(f"[LOG] Document split into {len(parts)} parts")
+        for i, part in enumerate(parts):
+            part_tokens = len(tokenizer.encode(part))
+            print(f"[LOG] Part {i+1} contains {part_tokens} tokens")
+    else:
+        print(f"[LOG] Document under 8000 tokens. Processing as a single part.")
+        parts = [text]
+    all_segments = []
+    segment_counter = 1
+    for i, part in enumerate(parts):
+        part_start_time = time.time()
+        print(f"[LOG] Processing part {i+1}...")
+        analysis = analyze_segment_with_gemini(part)
+        if "segments" in analysis:
+            print(f"[LOG] Found {len(analysis['segments'])} segments in part {i+1}")
+            for segment in analysis["segments"]:
+                segment["segment_number"] = segment_counter
+                all_segments.append(segment)
+                print(f"[LOG] Segment {segment_counter}: {segment['topic_name']}")
+                segment_counter += 1
+        else:
+            # Fallback if response format is unexpected
+            print(f"[LOG] Error: Unexpected format in part {i+1} analysis")
+            fallback_segment = {
+                "topic_name": f"Segment {segment_counter} Analysis",
+                "key_concepts": ["Format error in analysis"],
+                "summary": "Could not properly segment this part of the text.",
+                "quiz_questions": [],
+                "segment_number": segment_counter
+            }
+            all_segments.append(fallback_segment)
+            print(f"[LOG] Added fallback segment {segment_counter}")
+            segment_counter += 1
+        part_time = time.time() - part_start_time
+        print(f"[LOG] Part {i+1} processed in {part_time:.2f} seconds")
+    total_time = time.time() - start_time
+    print(f"[LOG] Total processing time: {total_time:.2f} seconds")
+    print(f"[LOG] Generated {len(all_segments)} segments total")
+    return all_segments
 def format_quiz_for_display(results):
     output = []
+    for segment in results:
+        topic = segment["topic_name"]
+        segment_num = segment["segment_number"]
         output.append(f"\n\n{'='*40}")
         output.append(f"SEGMENT {segment_num}: {topic}")
         output.append(f"{'='*40}\n")
         output.append("KEY CONCEPTS:")
+        for concept in segment["key_concepts"]:
             output.append(f"• {concept}")
         output.append("\nSUMMARY:")
+        output.append(segment["summary"])
         output.append("\nQUIZ QUESTIONS:")
+        for i, q in enumerate(segment["quiz_questions"]):
             output.append(f"\n{i+1}. {q['question']}")
             for j, option in enumerate(q['options']):
     return "\n".join(output)
+def save_results_as_json(results, filename="analysis_results.json"):
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(results, f, indent=2, ensure_ascii=False)
+    return filename
+def save_results_as_txt(formatted_text, filename="analysis_results.txt"):
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(formatted_text)
+    return filename
+def analyze_document(document_text, api_key):
+    print(f"[LOG] Starting document analysis...")
+    overall_start_time = time.time()
     os.environ["GOOGLE_API_KEY"] = api_key
     try:
         results = process_document_with_quiz(document_text)
         formatted_output = format_quiz_for_display(results)
         json_path = "analysis_results.json"
         txt_path = "analysis_results.txt"
         with open(json_path, "w", encoding="utf-8") as f:
             json.dump(results, f, indent=2, ensure_ascii=False)
         with open(txt_path, "w", encoding="utf-8") as f:
             f.write(formatted_output)
+        overall_time = time.time() - overall_start_time
+        print(f"[LOG] Document analysis completed in {overall_time:.2f} seconds")
+        topics_summary = "DOCUMENT ANALYSIS SUMMARY:\n"
+        topics_summary += f"Total segments: {len(results)}\n"
+        topics_summary += f"Processing time: {overall_time:.2f} seconds\n\n"
+        topics_summary += "SEGMENTS:\n"
+        for segment in results:
+            topics_summary += f"- Segment {segment['segment_number']}: {segment['topic_name']}\n"
+        formatted_output = topics_summary + "\n" + formatted_output
         return formatted_output, json_path, txt_path
     except Exception as e:
         error_msg = f"Error processing document: {str(e)}"
+        print(f"[LOG] ERROR: {error_msg}")
         return error_msg, None, None
 with gr.Blocks(title="Quiz Generator") as app:
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(
+                label="Input Document Text",
                 placeholder="Paste your document text here...",
                 lines=10
             )
             api_key = gr.Textbox(
                 label="Gemini API Key",
                 placeholder="Enter your Gemini API key",
                 type="password"
             )
             analyze_btn = gr.Button("Analyze Document")
         with gr.Column():
             output_results = gr.Textbox(
                 label="Analysis Results",