MrSimple01 committed
Commit fadb17a · verified · 1 Parent(s): e541939

Update app.py

Files changed (1)
  1. app.py +171 -329
app.py CHANGED
@@ -3,155 +3,47 @@ import numpy as np
  import json
  from sentence_transformers import SentenceTransformer
  from transformers import AutoTokenizer
- from sklearn.cluster import AgglomerativeClustering
- from sklearn.metrics.pairwise import cosine_distances
  from langchain_google_genai import ChatGoogleGenerativeAI
  import os
  import gradio as gr


  tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
  sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
- max_tokens = 3000

  def clean_text(text):
      text = re.sub(r'\[speaker_\d+\]', '', text)
      text = re.sub(r'\s+', ' ', text).strip()
      return text

- def split_text_with_modernbert_tokenizer(text):
-     text = clean_text(text)
-     rough_splits = re.split(r'(?<=[.!?])\s+', text)
-
-     segments = []
-     current_segment = ""
-     current_token_count = 0
-
-     for sentence in rough_splits:
-         if not sentence.strip():
-             continue
-
-         sentence_tokens = len(tokenizer.encode(sentence, add_special_tokens=False))
-         if (current_token_count + sentence_tokens > 100 or
-                 re.search(r'[.!?]$', current_segment.strip())):
-             if current_segment:
-                 segments.append(current_segment.strip())
-             current_segment = sentence
-             current_token_count = sentence_tokens
-         else:
-             current_segment += " " + sentence if current_segment else sentence
-             current_token_count += sentence_tokens
-
-     if current_segment:
-         segments.append(current_segment.strip())
-
-     refined_segments = []
-
-     for segment in segments:
-         if len(segment.split()) < 3:
-             if refined_segments:
-                 refined_segments[-1] += ' ' + segment
-             else:
-                 refined_segments.append(segment)
-             continue
-
-         tokens = tokenizer.tokenize(segment)
-
-         if len(tokens) < 50:
-             refined_segments.append(segment)
-             continue
-
-         break_indices = [i for i, token in enumerate(tokens)
-                          if ('.' in token or ',' in token or '?' in token or '!' in token)
-                          and i < len(tokens) - 1]
-
-         if not break_indices or break_indices[-1] < len(tokens) * 0.7:
-             refined_segments.append(segment)
-             continue
-
-         mid_idx = break_indices[len(break_indices) // 2]
-         first_half = tokenizer.convert_tokens_to_string(tokens[:mid_idx+1])
-         second_half = tokenizer.convert_tokens_to_string(tokens[mid_idx+1:])
-
-         refined_segments.append(first_half.strip())
-         refined_segments.append(second_half.strip())
-
-     return refined_segments
-
- def semantic_chunking(text):
-     segments = split_text_with_modernbert_tokenizer(text)
-     segment_embeddings = sentence_model.encode(segments)
-
-     distances = cosine_distances(segment_embeddings)
-
-     agg_clustering = AgglomerativeClustering(
-         n_clusters=None,
-         distance_threshold=1,
-         metric='precomputed',
-         linkage='average'
-     )
-     clusters = agg_clustering.fit_predict(distances)
-
-     # Group segments by cluster
-     cluster_groups = {}
-     for i, cluster_id in enumerate(clusters):
-         if cluster_id not in cluster_groups:
-             cluster_groups[cluster_id] = []
-         cluster_groups[cluster_id].append(segments[i])
-
-     chunks = []
-     for cluster_id in sorted(cluster_groups.keys()):
-         cluster_segments = cluster_groups[cluster_id]
-
-         current_chunk = []
-         current_token_count = 0
-
-         for segment in cluster_segments:
-             segment_tokens = len(tokenizer.encode(segment, truncation=True, add_special_tokens=True))
-             if segment_tokens > max_tokens:
-                 if current_chunk:
-                     chunks.append(" ".join(current_chunk))
-                     current_chunk = []
-                     current_token_count = 0
-                 chunks.append(segment)
-                 continue
-
-             if current_token_count + segment_tokens > max_tokens and current_chunk:
-                 chunks.append(" ".join(current_chunk))
-                 current_chunk = [segment]
-                 current_token_count = segment_tokens
-             else:
-                 current_chunk.append(segment)
-                 current_token_count += segment_tokens
-
-         if current_chunk:
-             chunks.append(" ".join(current_chunk))
-
-     if len(chunks) > 1:
-         chunk_embeddings = sentence_model.encode(chunks)
-         chunk_similarities = 1 - cosine_distances(chunk_embeddings)
-
-         i = 0
-         while i < len(chunks) - 1:
-             j = i + 1
-             if chunk_similarities[i, j] > 0.75:
-                 combined = chunks[i] + " " + chunks[j]
-                 combined_tokens = len(tokenizer.encode(combined, truncation=True, add_special_tokens=True))
-
-                 if combined_tokens <= max_tokens:
-                     # Merge chunks
-                     chunks[i] = combined
-                     chunks.pop(j)
-                     chunk_embeddings = sentence_model.encode(chunks)
-                     chunk_similarities = 1 - cosine_distances(chunk_embeddings)
-                 else:
-                     i += 1
-             else:
-                 i += 1
-
-     return chunks
-
- def analyze_segment_with_gemini(cluster_text, is_full_text=False):
      llm = ChatGoogleGenerativeAI(
          model="gemini-1.5-flash",
          temperature=0.7,
@@ -159,240 +51,158 @@ def analyze_segment_with_gemini(cluster_text, is_full_text=False):
          timeout=None,
          max_retries=3
      )
-
-     if len(cluster_text.split()) < 50:
-         return {
-             "status": "insufficient",
-             "reason": f"Text is too short ({len(cluster_text.split())} words). Minimum 50 words required for analysis."
-         }

-     if is_full_text:
-         prompt = f"""
-         FIRST ASSESS THE TEXT:
-         - Check if it's primarily self-introduction, biographical information, or conclusion
-         - Check if it's too short or lacks meaningful content (less than 100 words of substance)
-         - If either case is true, respond with a simple JSON: {{"status": "insufficient", "reason": "Brief explanation"}}
-
-         Analyze the following text:
-         FIRST ASSESS THE TEXT:
-         - Is it primarily self-introduction, biographical information, or conclusion?
-         - Does it lack meaningful content for analysis?
-
-         IF THE TEXT IS INSUFFICIENT (introductory, concluding, or lacking substance):
-         Return ONLY this JSON structure:
-         {{
-             "status": "insufficient",
-             "reason": "Brief explanation (e.g., 'Text is primarily self-introduction', 'Text lacks substantive content')"
-         }}
-
-         IF THE TEXT HAS SUFFICIENT MEANINGFUL CONTENT:
-         1. First, do text segmentation and identify DISTINCT key topics within the text
-         2. For each segment/topic you identify:
-            - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
-            - List 3-5 key concepts discussed in that segment
-            - Write a brief summary of that segment (3-5 sentences)
-            - Create 5 quiz questions based DIRECTLY on the content in that segment
-
-         For each quiz question:
-         - Create one correct answer that comes DIRECTLY from the text
-         - Create two plausible but incorrect answers
-         - IMPORTANT: Ensure all answer options have similar length (± 3 words)
-         - Ensure the correct answer is clearly indicated
-
-         Text:
-         {cluster_text}
-
-         Format your response as JSON with the following structure:
-         {{
-             "segments": [
-                 {{
-                     "topic_name": "Name of segment 1",
-                     "key_concepts": ["concept1", "concept2", "concept3"],
-                     "summary": "Brief summary of this segment.",
-                     "quiz_questions": [
-                         {{
-                             "question": "Question text?",
-                             "options": [
-                                 {{
-                                     "text": "Option A",
-                                     "correct": false
-                                 }},
-                                 {{
-                                     "text": "Option B",
-                                     "correct": true
-                                 }},
-                                 {{
-                                     "text": "Option C",
-                                     "correct": false
-                                 }}
-                             ]
-                         }},
-                         // More questions...
-                     ]
-                 }},
-                 // More segments...
-             ]
-         }}
-         """
-     else:
-         prompt = f"""
-         Analyze the following text segment and provide:
-         FIRST ASSESS THE TEXT:
-         - Is it primarily self-introduction, biographical information, or conclusion?
-         - Does it lack meaningful content for analysis?
-
-         IF THE TEXT IS INSUFFICIENT (introductory, concluding, or lacking substance):
-         Return ONLY this JSON structure:
-         {{
-             "status": "insufficient",
-             "reason": "Brief explanation (e.g., 'Text is primarily self-introduction', 'Text lacks substantive content')"
-         }}
-
-         IF THE TEXT HAS SUFFICIENT MEANINGFUL CONTENT:
-         1. A SPECIFIC and DESCRIPTIVE topic name (3-5 words) that precisely captures the main focus
-         2. 3-5 key concepts discussed
-         3. A brief summary (6-7 sentences)
-         4. Create 5 quiz questions based DIRECTLY on the text content (not from your summary)
-
-         For each quiz question:
-         - Create one correct answer that comes DIRECTLY from the text
-         - Create two plausible but incorrect answers
-         - IMPORTANT and STRICTLY: Ensure all answer options have similar length (± 3 words)
-         - Ensure the correct answer is clearly indicated
-
-         Text segment:
-         {cluster_text}
-
-         Format your response as JSON with the following structure:
-         {{
-             "topic_name": "Name of the topic",
-             "key_concepts": ["concept1", "concept2", "concept3"],
-             "summary": "Brief summary of the text segment.",
-             "quiz_questions": [
-                 {{
-                     "question": "Question text?",
-                     "options": [
-                         {{
-                             "text": "Option A",
-                             "correct": false
-                         }},
-                         {{
-                             "text": "Option B",
-                             "correct": true
-                         }},
-                         {{
-                             "text": "Option C",
-                             "correct": false
-                         }}
-                     ]
-                 }},
-                 // More questions...
-             ]
-         }}
-         """
-
-     response = llm.invoke(prompt)

      response_text = response.content

      try:
          json_match = re.search(r'\{[\s\S]*\}', response_text)
          if json_match:
-             response_json = json.loads(json_match.group(0))
-         else:
-             response_json = json.loads(response_text)
-
-         return response_json
-     except json.JSONDecodeError as e:
-         print(f"Error parsing JSON response: {e}")
-         print(f"Raw response: {response_text}")
-
-         if is_full_text:
-             return {
-                 "segments": [
-                     {
-                         "topic_name": "JSON Parsing Error",
-                         "key_concepts": ["Error in response format"],
-                         "summary": f"Could not parse the API response. Raw text: {response_text[:200]}...",
-                         "quiz_questions": []
-                     }
-                 ]
-             }
          else:
-             return {
-                 "topic_name": "JSON Parsing Error",
-                 "key_concepts": ["Error in response format"],
-                 "summary": f"Could not parse the API response. Raw text: {response_text[:200]}...",
-                 "quiz_questions": []
-             }


  def process_document_with_quiz(text):
      token_count = len(tokenizer.encode(text))
-     print(f"Text contains {token_count} tokens")

-     if token_count < 8000:
-         print("Text is short enough to analyze directly without text segmentation")
-         full_analysis = analyze_segment_with_gemini(text, is_full_text=True)
-
-         results = []
-
-         if "segments" in full_analysis:
-             for i, segment in enumerate(full_analysis["segments"]):
-                 segment["segment_number"] = i + 1
-                 segment["segment_text"] = "Segment identified by Gemini"
-                 results.append(segment)
-
-             print(f"Gemini identified {len(results)} segments in the text")
-         else:
-             print("Unexpected response format from Gemini")
-             results = [full_analysis]
-
-         return results
-
-     chunks = semantic_chunking(text)
-     print(f"{len(chunks)} semantic chunks were found\n")
-
-     results = []
-
-     for i, chunk in enumerate(chunks):
-         print(f"Analyzing segment {i+1}/{len(chunks)}...")
-         analysis = analyze_segment_with_gemini(chunk, is_full_text=False)
-         analysis["segment_number"] = i + 1
-         analysis["segment_text"] = chunk
-
-         results.append(analysis)
-
-         print(f"Completed analysis of segment {i+1}: {analysis['topic_name']}")
-
-     return results
-
- def save_results_to_file(results, output_file="analysis_results.json"):
-     with open(output_file, "w", encoding="utf-8") as f:
-         json.dump(results, f, indent=2, ensure_ascii=False)
-
-     print(f"Results saved to {output_file}")


  def format_quiz_for_display(results):
      output = []

-     for segment_result in results:
-         segment_num = segment_result["segment_number"]
-         topic = segment_result["topic_name"]

          output.append(f"\n\n{'='*40}")
          output.append(f"SEGMENT {segment_num}: {topic}")
          output.append(f"{'='*40}\n")

          output.append("KEY CONCEPTS:")
-         for concept in segment_result["key_concepts"]:
              output.append(f"• {concept}")

          output.append("\nSUMMARY:")
-         output.append(segment_result["summary"])

          output.append("\nQUIZ QUESTIONS:")
-         for i, q in enumerate(segment_result["quiz_questions"]):
              output.append(f"\n{i+1}. {q['question']}")

              for j, option in enumerate(q['options']):
@@ -402,22 +212,52 @@ def format_quiz_for_display(results):

      return "\n".join(output)


- def analyze_document(document_text: str, api_key: str) -> tuple:
      os.environ["GOOGLE_API_KEY"] = api_key
      try:
          results = process_document_with_quiz(document_text)
          formatted_output = format_quiz_for_display(results)
          json_path = "analysis_results.json"
          txt_path = "analysis_results.txt"
          with open(json_path, "w", encoding="utf-8") as f:
              json.dump(results, f, indent=2, ensure_ascii=False)
          with open(txt_path, "w", encoding="utf-8") as f:
              f.write(formatted_output)

          return formatted_output, json_path, txt_path
      except Exception as e:
          error_msg = f"Error processing document: {str(e)}"
          return error_msg, None, None

  with gr.Blocks(title="Quiz Generator") as app:
@@ -426,17 +266,19 @@ with gr.Blocks(title="Quiz Generator") as app:
      with gr.Row():
          with gr.Column():
              input_text = gr.Textbox(
-                 label="Input Text",
                  placeholder="Paste your document text here...",
                  lines=10
              )
              api_key = gr.Textbox(
                  label="Gemini API Key",
                  placeholder="Enter your Gemini API key",
                  type="password"
              )
              analyze_btn = gr.Button("Analyze Document")
-
          with gr.Column():
              output_results = gr.Textbox(
                  label="Analysis Results",

  import json
  from sentence_transformers import SentenceTransformer
  from transformers import AutoTokenizer
  from langchain_google_genai import ChatGoogleGenerativeAI
  import os
  import gradio as gr
+ import time


  tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
  sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

  def clean_text(text):
      text = re.sub(r'\[speaker_\d+\]', '', text)
      text = re.sub(r'\s+', ' ', text).strip()
      return text

+ def split_text_by_tokens(text, max_tokens=8000):
+     text = clean_text(text)
+     tokens = tokenizer.encode(text)
+
+     if len(tokens) <= max_tokens:
+         return [text]
+
+     split_point = len(tokens) // 2
+
+     sentences = re.split(r'(?<=[.!?])\s+', text)
+
+     first_half = []
+     second_half = []
+
+     current_tokens = 0
+     for sentence in sentences:
+         sentence_tokens = len(tokenizer.encode(sentence))
+
+         if current_tokens + sentence_tokens <= split_point:
+             first_half.append(sentence)
+             current_tokens += sentence_tokens
+         else:
+             second_half.append(sentence)
+
+     return [" ".join(first_half), " ".join(second_half)]

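For orientation, a quick sanity check of the new split_text_by_tokens behaviour. This is a minimal sketch, not part of the commit: it assumes the function and the tokenizer defined above are already in scope (for example, in a REPL where app.py's definitions have been executed), and long_text is a purely illustrative placeholder.

# Illustrative check: input over max_tokens should come back as two
# sentence-aligned parts of roughly equal token count; shorter input stays whole.
long_text = "Transcripts often repeat themselves. " * 2000   # placeholder input

parts = split_text_by_tokens(long_text, max_tokens=8000)
print(len(parts))                          # 2 if the input exceeded 8000 tokens, else 1
for i, part in enumerate(parts, start=1):
    print(i, len(tokenizer.encode(part)))  # each part should sit near the token midpoint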
+ def analyze_segment_with_gemini(segment_text):
      llm = ChatGoogleGenerativeAI(
          model="gemini-1.5-flash",
          temperature=0.7,
          timeout=None,
          max_retries=3
      )

+     prompt = f"""
+     Analyze the following text and identify distinct segments within it and do text segmentation:
+     1. Segments should be STRICTLY max=10
+     2. For each segment/topic you identify:
+        - Provide a SPECIFIC and UNIQUE topic name (3-5 words) that clearly differentiates it from other segments
+        - List 3-5 key concepts discussed in that segment (be precise and avoid repetition between segments)
+        - Write a brief summary of that segment (3-5 sentences)
+        - Create 5 quiz questions based DIRECTLY on the content in that segment only
+
+     For each quiz question:
+     - Create one correct answer that comes DIRECTLY from the text
+     - Create two plausible but incorrect answers
+     - IMPORTANT: Ensure all answer options have similar length (± 3 words)
+     - Ensure the correct answer is clearly indicated with a ✓ symbol
+
+     Text:
+     {segment_text}
+
+     Format your response as JSON with the following structure:
+     {{
+         "segments": [
+             {{
+                 "topic_name": "Unique and Specific Topic Name",
+                 "key_concepts": ["concept1", "concept2", "concept3"],
+                 "summary": "Brief summary of this segment.",
+                 "quiz_questions": [
+                     {{
+                         "question": "Question text?",
+                         "options": [
+                             {{
+                                 "text": "Option A",
+                                 "correct": false
+                             }},
+                             {{
+                                 "text": "Option B",
+                                 "correct": true
+                             }},
+                             {{
+                                 "text": "Option C",
+                                 "correct": false
+                             }}
+                         ]
+                     }}
+                 ]
+             }}
+         ]
+     }}
+
+     IMPORTANT: Each segment must have a DISTINCT topic name that clearly differentiates it from others.
+     """

+     response = llm.invoke(prompt)
      response_text = response.content

      try:
          json_match = re.search(r'\{[\s\S]*\}', response_text)
          if json_match:
+             return json.loads(json_match.group(0))
          else:
+             return json.loads(response_text)
+     except json.JSONDecodeError:
+         return {
+             "segments": [
+                 {
+                     "topic_name": "JSON Parsing Error",
+                     "key_concepts": ["Error in response format"],
+                     "summary": "Could not parse the API response.",
+                     "quiz_questions": []
+                 }
+             ]
+         }
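The new parser keeps only the first {...} block it finds and falls back to a stub "JSON Parsing Error" segment on a JSONDecodeError. Model replies are sometimes wrapped in Markdown code fences; the regex above tolerates that, but a bare json.loads of the full reply would not. A slightly more defensive extraction helper is sketched below purely for illustration; it is not part of this commit, and the name extract_json is hypothetical.

import json
import re

def extract_json(response_text):
    """Best-effort extraction of a JSON object from an LLM reply (illustrative helper)."""
    # Strip optional ```json ... ``` fences before searching for the object.
    cleaned = re.sub(r"```(?:json)?", "", response_text)
    match = re.search(r"\{[\s\S]*\}", cleaned)
    try:
        return json.loads(match.group(0) if match else cleaned)
    except json.JSONDecodeError:
        return {"segments": []}  # caller can treat an empty list as "nothing parsed"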

  def process_document_with_quiz(text):
+     start_time = time.time()
+
      token_count = len(tokenizer.encode(text))
+     print(f"[LOG] Total document tokens: {token_count}")

+     if token_count > 8000:
+         print(f"[LOG] Document exceeds 8000 tokens. Splitting into parts.")
+         parts = split_text_by_tokens(text)
+         print(f"[LOG] Document split into {len(parts)} parts")

+         for i, part in enumerate(parts):
+             part_tokens = len(tokenizer.encode(part))
+             print(f"[LOG] Part {i+1} contains {part_tokens} tokens")
+     else:
+         print(f"[LOG] Document under 8000 tokens. Processing as a single part.")
+         parts = [text]

+     all_segments = []
+     segment_counter = 1

+     for i, part in enumerate(parts):
+         part_start_time = time.time()
+         print(f"[LOG] Processing part {i+1}...")

+         analysis = analyze_segment_with_gemini(part)

+         if "segments" in analysis:
+             print(f"[LOG] Found {len(analysis['segments'])} segments in part {i+1}")
+
+             for segment in analysis["segments"]:
+                 segment["segment_number"] = segment_counter
+                 all_segments.append(segment)
+                 print(f"[LOG] Segment {segment_counter}: {segment['topic_name']}")
+                 segment_counter += 1
+         else:
+             # Fallback if response format is unexpected
+             print(f"[LOG] Error: Unexpected format in part {i+1} analysis")
+             fallback_segment = {
+                 "topic_name": f"Segment {segment_counter} Analysis",
+                 "key_concepts": ["Format error in analysis"],
+                 "summary": "Could not properly segment this part of the text.",
+                 "quiz_questions": [],
+                 "segment_number": segment_counter
+             }
+             all_segments.append(fallback_segment)
+             print(f"[LOG] Added fallback segment {segment_counter}")
+             segment_counter += 1
+
+         part_time = time.time() - part_start_time
+         print(f"[LOG] Part {i+1} processed in {part_time:.2f} seconds")

+     total_time = time.time() - start_time
+     print(f"[LOG] Total processing time: {total_time:.2f} seconds")
+     print(f"[LOG] Generated {len(all_segments)} segments total")

+     return all_segments
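process_document_with_quiz now returns a flat list of segment dictionaries whose shape mirrors the JSON schema in the prompt, plus the segment_number it adds itself. For orientation, a hand-built example of that shape fed straight into format_quiz_for_display (defined next). This is an illustrative sketch only; the sample values are made up, and it assumes the functions from this file are in scope.

# Illustrative only: one segment dict in the shape the Gemini prompt requests.
sample_results = [{
    "segment_number": 1,
    "topic_name": "Tokenizer Based Splitting",
    "key_concepts": ["token counting", "sentence boundaries", "8000 token limit"],
    "summary": "Long documents are split near the token midpoint at sentence boundaries.",
    "quiz_questions": [{
        "question": "Where is an over-long document split?",
        "options": [
            {"text": "Near the token midpoint", "correct": True},
            {"text": "At every paragraph break", "correct": False},
            {"text": "After the first heading", "correct": False},
        ],
    }],
}]

print(format_quiz_for_display(sample_results))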

  def format_quiz_for_display(results):
      output = []

+     for segment in results:
+         topic = segment["topic_name"]
+         segment_num = segment["segment_number"]

          output.append(f"\n\n{'='*40}")
          output.append(f"SEGMENT {segment_num}: {topic}")
          output.append(f"{'='*40}\n")

          output.append("KEY CONCEPTS:")
+         for concept in segment["key_concepts"]:
              output.append(f"• {concept}")

          output.append("\nSUMMARY:")
+         output.append(segment["summary"])

          output.append("\nQUIZ QUESTIONS:")
+         for i, q in enumerate(segment["quiz_questions"]):
              output.append(f"\n{i+1}. {q['question']}")

              for j, option in enumerate(q['options']):

      return "\n".join(output)

+ def save_results_as_json(results, filename="analysis_results.json"):
+     with open(filename, "w", encoding="utf-8") as f:
+         json.dump(results, f, indent=2, ensure_ascii=False)
+     return filename
+
+ def save_results_as_txt(formatted_text, filename="analysis_results.txt"):
+     with open(filename, "w", encoding="utf-8") as f:
+         f.write(formatted_text)
+     return filename

+ def analyze_document(document_text, api_key):
+     print(f"[LOG] Starting document analysis...")
+     overall_start_time = time.time()
+
      os.environ["GOOGLE_API_KEY"] = api_key
      try:
          results = process_document_with_quiz(document_text)
          formatted_output = format_quiz_for_display(results)
+
          json_path = "analysis_results.json"
          txt_path = "analysis_results.txt"
+
          with open(json_path, "w", encoding="utf-8") as f:
              json.dump(results, f, indent=2, ensure_ascii=False)
+
          with open(txt_path, "w", encoding="utf-8") as f:
              f.write(formatted_output)

+         overall_time = time.time() - overall_start_time
+         print(f"[LOG] Document analysis completed in {overall_time:.2f} seconds")
+
+         topics_summary = "DOCUMENT ANALYSIS SUMMARY:\n"
+         topics_summary += f"Total segments: {len(results)}\n"
+         topics_summary += f"Processing time: {overall_time:.2f} seconds\n\n"
+         topics_summary += "SEGMENTS:\n"
+
+         for segment in results:
+             topics_summary += f"- Segment {segment['segment_number']}: {segment['topic_name']}\n"
+
+         formatted_output = topics_summary + "\n" + formatted_output
+
          return formatted_output, json_path, txt_path
      except Exception as e:
          error_msg = f"Error processing document: {str(e)}"
+         print(f"[LOG] ERROR: {error_msg}")
          return error_msg, None, None

  with gr.Blocks(title="Quiz Generator") as app:
      with gr.Row():
          with gr.Column():
              input_text = gr.Textbox(
+                 label="Input Document Text",
                  placeholder="Paste your document text here...",
                  lines=10
              )
+
              api_key = gr.Textbox(
                  label="Gemini API Key",
                  placeholder="Enter your Gemini API key",
                  type="password"
              )
+
              analyze_btn = gr.Button("Analyze Document")
+
          with gr.Column():
              output_results = gr.Textbox(
                  label="Analysis Results",