zolicsaki committed on
Commit
cf18115
·
verified ·
1 Parent(s): 3ce4a16

Delete pdf_helper.py

Browse files
Files changed (1) hide show
  1. pdf_helper.py +0 -181
pdf_helper.py DELETED
@@ -1,181 +0,0 @@
1
- import pdf4llm
2
- import re
3
-
4
def py4llm_pdf_reader(pdf_path: str):
    """Return whatever ``pdf4llm.to_markdown`` produces for ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdf4llm``.

    Returns:
        The value returned by ``pdf4llm.to_markdown(pdf_path)``; with the
        default options this is presumably a single markdown string.
    """
    return pdf4llm.to_markdown(pdf_path)
7
-
8
def split_markdown_sections(text):
    """Split markdown text into one record per ATX heading.

    A heading is a line that starts with one to six ``#`` characters
    followed by a non-empty title on the *same* line. Text that precedes
    the first heading is not returned.

    Args:
        text: Markdown source as a single string.

    Returns:
        A list of dicts in document order, each with the keys
        ``'level'`` (number of ``#``), ``'title'`` (heading text, stripped)
        and ``'content'`` (stripped text between this heading and the next
        heading, or the end of ``text``).
    """
    # Pattern notes:
    #   (?!#)   - a run of more than six '#' is not a heading, so it must not
    #             be read as a level-6 heading whose title starts with '#'.
    #   [ \t]*  - only horizontal whitespace; ``\s*`` would also consume the
    #             newline and turn the *next* line into the title of a bare
    #             "##" line.
    #   (\S.*)  - the title must contain at least one visible character.
    header_pattern = r'^(#{1,6})(?!#)[ \t]*(\S.*)$'
    matches = list(re.finditer(header_pattern, text, re.MULTILINE))

    sections = []
    for i, match in enumerate(matches):
        # A section runs from the end of its heading line up to the start
        # of the next heading (or the end of the document for the last one).
        if i + 1 < len(matches):
            end_pos = matches[i + 1].start()
        else:
            end_pos = len(text)

        sections.append({
            'level': len(match.group(1)),
            # strip() drops trailing blanks and a stray '\r' from CRLF input.
            'title': match.group(2).strip(),
            'content': text[match.end():end_pos].strip(),
        })

    return sections
39
-
40
-
41
class PDFPaper4LLMParser(object):
    """Turn a paper PDF into markdown and group it into title/author/abstract/body."""

    def __init__(self, write_images=False, page_chunks=False) -> None:
        # Both flags are forwarded unchanged to pdf4llm.to_markdown().
        self.page_chunks = page_chunks
        self.write_images = write_images

    def pdf2text(self, pdf_path: str):
        """Convert the PDF at ``pdf_path`` to markdown text.

        When ``self.page_chunks`` is truthy the converter result is iterated
        and the ``'text'`` entry of every item is joined with newlines;
        otherwise the converter result is returned as is.
        """
        converted = pdf4llm.to_markdown(
            pdf_path,
            write_images=self.write_images,
            page_chunks=self.page_chunks,
        )
        if not self.page_chunks:
            return converted
        return '\n'.join([chunk['text'] for chunk in converted])

    def structured_paper_content(self, markdown_sections: list):
        """Arrange heading sections into a paper-shaped dict.

        Args:
            markdown_sections: Non-empty list of dicts with the keys
                ``'level'``, ``'title'`` and ``'content'``.

        Returns:
            A dict with, in insertion order:
              * ``'title'``     - title of the first section;
              * ``'abstract'``  - content of the first later section whose
                title or content contains "abstract" (case-insensitive);
                the key is absent when no such section exists;
              * ``'author'``    - list of strings collected before the
                abstract: the first section's content (if non-empty) and
                ``title + content`` of every section in between;
              * ``'main_text'`` - the sections after the abstract, or all
                sections when no abstract was found.
        """
        assert len(markdown_sections) > 0

        paper = {}
        front_matter = []
        # Falls back to the whole document when no abstract section shows up.
        body_start = 0

        opening = markdown_sections[0]
        paper['title'] = opening['title']
        if opening['content']:
            front_matter.append(opening['content'])

        for position, section in enumerate(markdown_sections[1:], start=1):
            heading = section['title']
            body = section['content']
            if 'abstract' in heading.lower() or 'abstract' in body.lower():
                paper['abstract'] = body
                body_start = position + 1
                break
            front_matter.append(heading + body)

        paper['author'] = front_matter
        paper['main_text'] = markdown_sections[body_start:]
        return paper

    def run(self, pdf_path: str, verbose=True):
        """Parse ``pdf_path`` end to end and return the structured sections.

        With ``verbose`` truthy, a plain-text rendering of the result is
        printed to stdout before returning.
        """
        markdown_text = self.pdf2text(pdf_path=pdf_path)
        sections = split_markdown_sections(text=markdown_text)
        struct_sections = self.structured_paper_content(markdown_sections=sections)

        if verbose:
            pieces = []
            for key, value in struct_sections.items():
                if key == 'title':
                    pieces.append('\nTitle: ' + value + '\n\n')
                elif key == 'abstract':
                    pieces.append('\nAbstract: \n' + value + '\n\n')
                elif key == 'author':
                    pieces.append('\nAuthor: \n' + '\n'.join(value) + '\n\n')
                elif key == 'main_text':
                    pieces.extend(
                        '\n' + section['title'] + '\n\n' + section['content'] + '\n\n'
                        for section in value
                    )
            print(''.join(pieces))

        return struct_sections
114
-
115
-
116
def dict_to_markdown_list(d: dict, indent=0):
    """Render ``d`` as a markdown bullet list with bold keys.

    Each entry becomes ``- **key**: value``. A dict value is rendered
    recursively on the following lines, one indent level deeper.

    Args:
        d: Mapping to render; non-dict values are converted with ``str()``.
        indent: Nesting depth; each level prepends one space to the bullet.

    Returns:
        The rendered lines joined with newlines (empty string for ``{}``).
    """
    pad = ' ' * indent
    rendered = []
    for key, value in d.items():
        bullet = pad + f"- **{key}**: "
        if not isinstance(value, dict):
            rendered.append(bullet + str(value))
            continue
        # Nested mapping: the key gets its own line, children follow.
        rendered.extend((bullet, dict_to_markdown_list(value, indent + 1)))
    return "\n".join(rendered)
126
-
127
-
128
def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
    """Split ``markdown`` on ``sep`` and return the non-empty, stripped slides.

    Args:
        markdown: Full markdown text containing zero or more separators.
        sep: Literal separator string between slides.

    Returns:
        List of slide strings in order; pieces that are empty after
        stripping whitespace are omitted.
    """
    trimmed = (piece.strip() for piece in markdown.strip().split(sep))
    return [piece for piece in trimmed if piece]
130
-
131
-
132
def parse_slide_to_dict(slide: str):
    """Parse one markdown slide into ``{heading: [items]}``.

    Lines are stripped and then classified:
      * ``##`` / ``###`` headings start a new key;
      * numbered (``1. x``) and bulleted (``* x`` / ``- x``) lines add the
        item text without its marker - nested bullets are covered too,
        because indentation is removed before matching;
      * any other non-blank line is added verbatim.

    Blank lines are ignored. Lines that appear before the first heading are
    dropped, and a heading that collects no items does not appear in the
    result. If the same heading text occurs twice, the later one wins.

    Args:
        slide: Markdown text of a single slide.

    Returns:
        Dict mapping heading text to the list of item strings under it.
    """
    heading_re = re.compile(r"^#{2,3}\s+(.*)")
    # One pattern for both list styles: "12. text", "* text", "- text".
    list_item_re = re.compile(r"^(?:\d+\.|[\*\-])\s+(.*)")

    result = {}
    current_key = None
    sub_items = []

    for raw_line in slide.splitlines():
        line = raw_line.strip()
        if not line:
            # Blank separators carry no content; storing them would
            # produce '' entries between real items.
            continue

        heading_match = heading_re.match(line)
        if heading_match:
            if current_key and sub_items:
                result[current_key] = sub_items
            # Always start the new heading with a fresh list so nothing
            # collected earlier can leak into it.
            sub_items = []
            current_key = heading_match.group(1).strip()
            continue

        if current_key is None:
            # No heading yet: there is no key to attach this line to.
            continue

        item_match = list_item_re.match(line)
        if item_match:
            sub_items.append(item_match.group(1).strip())
        elif current_key:
            # Freeform text; guarded like the save steps so an empty
            # heading ("##  " with no text) never collects items.
            sub_items.append(line)

    # Save the last block.
    if current_key and sub_items:
        result[current_key] = sub_items

    return result
177
-
178
-
179
def markdown_to_slide_dicts(full_markdown: str):
    """Split ``full_markdown`` into slides and parse each one into a dict.

    Slides are separated with ``split_markdown_slides`` (default separator)
    and each slide is converted with ``parse_slide_to_dict``.

    Returns:
        List with one parsed dict per slide, in document order.
    """
    return list(map(parse_slide_to_dict, split_markdown_slides(full_markdown)))