SkyNait committed on
Commit
77844f2
·
verified ·
1 Parent(s): 2a20a48

Update topic_extraction.py

Browse files
Files changed (1) hide show
  1. topic_extraction.py +89 -31
topic_extraction.py CHANGED
@@ -4,28 +4,21 @@ import re
4
  import gc
5
  import json
6
  import logging
7
- import fitz # PyMuPDF (pip install pymupdf)
8
  import base64
9
  import concurrent.futures
10
  from io import BytesIO
11
  from typing import List, Dict, Any
12
 
13
- # Attempt to import google.genai
14
- try:
15
- from google import genai
16
- from google.genai import types
17
- except ImportError:
18
- genai = None
19
- types = None
20
 
21
  import torch
22
  import cv2
23
 
24
- # Magic PDF pipeline
25
  from magic_pdf.data.dataset import PymuDocDataset
26
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
27
 
28
- # Your TableExtractor from topic_extraction_upgrade (or similar)
29
  from table_row_extraction import TableExtractor
30
 
31
  logging.basicConfig(level=logging.INFO)
@@ -87,19 +80,92 @@ class GeminiTopicExtractor:
87
 
88
  prompt = f"""
89
  You are given the text of a specification PDF.
90
- Identify the '2 Subject content and assessment information' topic.
91
- Under that topic, identify subtopics (like 'Paper 1 and Paper 2: Pure Mathematics', etc.)
92
- and their page ranges (1-based) from the text.
93
- Return JSON only, with structure:
94
- {{
95
- "2 Subject content and assessment information": {{
96
- "Paper 1 and Paper 2: Pure Mathematics": [start_page, end_page],
97
- "Paper 3: Statistics and Mechanics": [start_page, end_page]
98
- }}
99
- }}
100
- No extra explanation, just JSON.
101
- TEXT:
102
- {text_content}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  """
104
 
105
  try:
@@ -133,10 +199,6 @@ TEXT:
133
  logger.error(f"Could not open/read PDF: {e}")
134
  return "\n".join(text_parts)
135
 
136
-
137
- # -------------------------------------------------------------------
138
- # Gemini-based table classification (Mineru style)
139
- # -------------------------------------------------------------------
140
  def call_gemini_for_table_classification(image_data: bytes) -> str:
141
  if genai is None or types is None:
142
  logger.warning("Gemini not available. Returning NO_TABLE.")
@@ -482,10 +544,6 @@ class MineruNoTextProcessor:
482
  pages.append(p)
483
  return pages
484
 
485
-
486
- # -------------------------------------------------------------------
487
- # Example usage
488
- # -------------------------------------------------------------------
489
  if __name__ == "__main__":
490
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
491
  output_dir = "/home/user/app/input_output/output"
 
4
  import gc
5
  import json
6
  import logging
7
+ import fitz
8
  import base64
9
  import concurrent.futures
10
  from io import BytesIO
11
  from typing import List, Dict, Any
12
 
13
+ from google import genai
14
+ from google.genai import types
 
 
 
 
 
15
 
16
  import torch
17
  import cv2
18
 
 
19
  from magic_pdf.data.dataset import PymuDocDataset
20
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
21
 
 
22
  from table_row_extraction import TableExtractor
23
 
24
  logging.basicConfig(level=logging.INFO)
 
80
 
81
  prompt = f"""
82
  You are given the text of a specification PDF.
83
+ Instructions:
84
+ 1. Identify the "Contents" section, which lists all topics, subtopics, and their corresponding pages.
85
+ 2. Extract only the **highest-level, subject-related subtopics** (ignore organizational or administrative sections).
86
+ 3. For subtopics, include the full range of pages from the first to the last subtopic.
87
+ 4. Return the output in the following JSON format:
88
+
89
+ {{
90
+ "topic_name": [start_page, end_page]
91
+ }}
92
+
93
+ Important Notes:
94
+ - Ignore non-subject-related sections (e.g., "Introduction", "Exam Guidelines", "Appendices", "Assessment, Qualification at a glance").
95
+ - The extracted subtopics should represent major academic areas, not organizational or structural elements.
96
+ - Make sure that all of the pages for a subtopic are included, end page should be the start page of the topic
97
+ that comes next after the extracted one in contents section.
98
+
99
+ Examples:
100
+ 1. Given this table of contents:
101
+ 1 Introduction – 2
102
+ Why choose Edexcel A Level Mathematics? - 2
103
+ Supporting you in planning and implementing this qualification - 3
104
+ Qualification at a glance - 5
105
+ 2 Subject content and assessment information – 7
106
+ Paper 1 and Paper 2: Pure Mathematics - 11
107
+ Paper 3: Statistics and Mechanics - 30
108
+ Assessment Objectives - 40
109
+ 3 Administration and general information – 42
110
+ Entries - 42
111
+ Access arrangements, reasonable adjustments, special consideration and malpractice - 42
112
+ Student recruitment and progression - 45
113
+ Appendix 1: Formulae – 49
114
+ Appendix 2: Notation – 53
115
+ Appendix 3: Use of calculators – 59
116
+ Appendix 4: Assessment Objectives – 60
117
+ Appendix 5: The context for the development of this qualification – 62
118
+ Appendix 6: Transferable skills – 64
119
+ Appendix 7: Level 3 Extended Project qualification – 65
120
+ Appendix 8: Codes – 67
121
+
122
+ The correct output should be:
123
+
124
+ {{
125
+ "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
126
+ "Paper 3: Statistics and Mechanics": [30, 42]
127
+ }}
128
+
129
+ 2. Given this table of contents:
130
+ Qualification at a glance – 1
131
+ Assessment Objectives and weightings - 4
132
+ Knowledge, skills and understanding – 5
133
+ Theme 1: Introduction to markets and market failure - 5
134
+ Theme 2: The UK economy – performance and policies - 11
135
+ Theme 3: Business behaviour and the labour market - 21
136
+ Theme 4: A global perspective - 29
137
+ Assessment – 39
138
+ Assessment summary - 39
139
+ Assessment objectives - 41
140
+ Assessment overview - 42
141
+ Breakdown of assessment objectives - 42
142
+ Synoptic assessment - 43
143
+ Discount code and performance tables - 43
144
+ Access arrangements, reasonable adjustments and special consideration - 44
145
+ Malpractice - 45
146
+ Equality Act 2010 and Pearson equality policy - 45
147
+ Synoptic assessment - 46
148
+ Awarding and reporting - 47
149
+ Other information – 49
150
+ Student recruitment -49
151
+ Prior learning and other requirements -49
152
+ Progression - 49
153
+ Appendix 1: Transferable skills – 53
154
+ Appendix 2: Level 3 Extended Project qualification – 55
155
+ Appendix 3: Quantitative skills – 59
156
+ Appendix 4: Codes – 61
157
+ Appendix 5: Index – 63
158
+
159
+ The correct output should be:
160
+
161
+ {{
162
+ "Theme 1: Introduction to markets and market failure": [5, 10]
163
+ "Theme 2: The UK economy – performance and policies": - [11, 20]
164
+ "Theme 3: Business behaviour and the labour market": [21, 28]
165
+ "Theme 4: A global perspective": [29, 38]
166
+ }}
167
+
168
+ Now, extract topics from this text:{text_content}
169
  """
170
 
171
  try:
 
199
  logger.error(f"Could not open/read PDF: {e}")
200
  return "\n".join(text_parts)
201
 
 
 
 
 
202
  def call_gemini_for_table_classification(image_data: bytes) -> str:
203
  if genai is None or types is None:
204
  logger.warning("Gemini not available. Returning NO_TABLE.")
 
544
  pages.append(p)
545
  return pages
546
 
 
 
 
 
547
  if __name__ == "__main__":
548
  input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
549
  output_dir = "/home/user/app/input_output/output"