Spaces:

sambanovasystems
/

auto-web-search

Running

App Files Files Community

zolicsaki committed on Mar 27

Commit

cf18115

verified ·

1 Parent(s): 3ce4a16

Delete pdf_helper.py

Browse files

Files changed (1) hide show

pdf_helper.py +0 -181

pdf_helper.py DELETED Viewed

@@ -1,181 +0,0 @@
-import pdf4llm
-import re
-def py4llm_pdf_reader(pdf_path: str):
-    """Convert the PDF at ``pdf_path`` with ``pdf4llm.to_markdown`` and return the result."""
-    return pdf4llm.to_markdown(pdf_path)
-def split_markdown_sections(text):
-    """Split Markdown text into sections, one per header line.
-
-    Args:
-        text: Markdown source as a single string.
-
-    Returns:
-        A list of dicts in document order, each with the keys ``'level'``
-        (number of leading ``#`` characters, 1-6), ``'title'`` (the rest of
-        the header line) and ``'content'`` (the stripped text between this
-        header and the next header, or the end of the text for the last
-        header). Text that precedes the first header is not part of any
-        section. An input without headers yields an empty list.
-    """
-    # Only non-newline whitespace may sit between the '#' run and the title.
-    # A plain `\s*` also matches '\n', which let a line holding just "#"
-    # swallow the following line as its title.
-    header_pattern = r'^(#{1,6})[^\S\n]*(.+)$'
-    matches = list(re.finditer(header_pattern, text, re.MULTILINE))
-    sections = []
-    for i, match in enumerate(matches):
-        # A section ends where the next header starts, or at the end of the
-        # text when this is the last header.
-        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
-        sections.append({
-            'level': len(match.group(1)),
-            'title': match.group(2),
-            'content': text[match.end():end_pos].strip(),
-        })
-    return sections
-class PDFPaper4LLMParser(object):
-    """Convert a PDF to Markdown and arrange its sections into title, author, abstract and main text."""
-    def __init__(self, write_images=False, page_chunks=False) -> None:
-        """Store the two options that are forwarded to ``pdf4llm.to_markdown``."""
-        self.write_images = write_images
-        self.page_chunks = page_chunks
-    def pdf2text(self, pdf_path: str):
-        """Return the Markdown produced by ``pdf4llm.to_markdown`` for ``pdf_path``.
-
-        When ``page_chunks`` is set, the converter result is iterated and the
-        ``'text'`` entry of every item is joined with newlines; otherwise the
-        converter result is returned unchanged.
-        """
-        converted = pdf4llm.to_markdown(
-            pdf_path,
-            write_images=self.write_images,
-            page_chunks=self.page_chunks,
-        )
-        if not self.page_chunks:
-            return converted
-        return '\n'.join([chunk['text'] for chunk in converted])
-    def structured_paper_content(self, markdown_sections: list):
-        """Arrange header-delimited sections into a paper-shaped dict.
-
-        Args:
-            markdown_sections: non-empty list of dicts, each with the keys
-                ``'level'``, ``'title'`` and ``'content'``.
-
-        Returns:
-            A dict with ``'title'`` (title of the first section),
-            ``'abstract'`` (content of the first later section whose title or
-            content contains "abstract", case-insensitively; the key is
-            absent when no such section exists), ``'author'`` (list of the
-            texts collected before the abstract) and ``'main_text'`` (the
-            sections after the abstract, or every section when no abstract
-            was found).
-        """
-        assert len(markdown_sections) > 0
-        first_level = markdown_sections[0]['level']
-        paper = {}
-        front_matter = []
-        body_start = -1
-        for position, entry in enumerate(markdown_sections):
-            level = entry['level']
-            heading = entry['title']
-            body = entry['content']
-            if position == 0 and level == first_level:
-                # The opening section supplies the paper title; any text
-                # under it is kept with the author metadata.
-                paper['title'] = heading
-                if len(body) > 0:
-                    front_matter.append(body)
-                continue
-            if 'abstract' in heading.lower() or 'abstract' in body.lower():
-                paper['abstract'] = body
-                body_start = position + 1
-                break
-            front_matter.append(heading + body)
-        paper['author'] = front_matter
-        if body_start == -1:
-            # No abstract was found: the main text falls back to all sections.
-            body_start = 0
-        paper['main_text'] = markdown_sections[body_start:]
-        return paper
-    def run(self, pdf_path: str, verbose=True):
-        """Convert ``pdf_path``, split it into sections, structure them and return the dict.
-
-        When ``verbose`` is true, a plain-text rendering of the structured
-        paper is printed before returning.
-        """
-        markdown_text = self.pdf2text(pdf_path=pdf_path)
-        sections = split_markdown_sections(text=markdown_text)
-        struct_sections = self.structured_paper_content(markdown_sections=sections)
-        if verbose:
-            pieces = []
-            for key, value in struct_sections.items():
-                if key == 'title':
-                    pieces.append('\nTitle: ' + value + '\n\n')
-                elif key == 'abstract':
-                    pieces.append('\nAbstract: \n' + value + '\n\n')
-                elif key == 'author':
-                    pieces.append('\nAuthor: \n' + '\n'.join(value) + '\n\n')
-                elif key == 'main_text':
-                    pieces.extend(
-                        '\n' + part['title'] + '\n\n' + part['content'] + '\n\n'
-                        for part in value
-                    )
-            print(''.join(pieces))
-        return struct_sections
-def dict_to_markdown_list(d: dict, indent=0):
-    """Render ``d`` as a nested Markdown bullet list and return it as one string.
-
-    Each key becomes a ``- **key**: `` bullet indented by two spaces per
-    ``indent`` level. A dict value is rendered recursively on the following
-    lines, one level deeper; any other value is appended to the bullet via
-    ``str``.
-    """
-    pad = '  ' * indent
-    rendered = []
-    for name, item in d.items():
-        bullet = pad + f"- **{name}**: "
-        if not isinstance(item, dict):
-            rendered.append(bullet + str(item))
-            continue
-        rendered.extend((bullet, dict_to_markdown_list(item, indent + 1)))
-    return "\n".join(rendered)
-def split_markdown_slides(markdown: str, sep: str = "<slide_sep>"):
-    """Split ``markdown`` on ``sep`` and return the stripped, non-empty pieces in order."""
-    slides = []
-    for piece in markdown.strip().split(sep):
-        trimmed = piece.strip()
-        if trimmed:
-            slides.append(trimmed)
-    return slides
-def parse_slide_to_dict(slide: str):
-    """Parse one Markdown slide into a dict mapping headings to their items.
-
-    Args:
-        slide: Markdown text of a single slide.
-
-    Returns:
-        A dict whose keys are the texts of ``##`` / ``###`` headings and
-        whose values are lists of the lines found under each heading.
-        Numbered (``1. ``) and bulleted (``* `` / ``- ``) lines are stored
-        without their marker; any other non-blank line is stored as-is.
-        Lines are stripped first, so indented bullets are handled like
-        top-level ones. Blank lines are ignored, lines that appear before
-        the first heading are dropped, and a heading with no lines under it
-        produces no entry.
-    """
-    heading_re = re.compile(r"^#{2,3}\s+(.*)")
-    item_re = re.compile(r"^(?:\d+\.|[\*\-])\s+(.*)")
-    result = {}
-    current_key = None
-    sub_items = []
-    for raw_line in slide.splitlines():
-        line = raw_line.strip()
-        if not line:
-            # Blank separator lines carry no content.
-            continue
-        heading_match = heading_re.match(line)
-        if heading_match:
-            if current_key and sub_items:
-                result[current_key] = sub_items
-            # Always start the new heading with a fresh list so nothing
-            # collected earlier can leak into it.
-            sub_items = []
-            current_key = heading_match.group(1).strip()
-            continue
-        if current_key is None:
-            # There is no heading yet to file this line under.
-            continue
-        item_match = item_re.match(line)
-        sub_items.append(item_match.group(1).strip() if item_match else line)
-    # Save the last block
-    if current_key and sub_items:
-        result[current_key] = sub_items
-    return result
-def markdown_to_slide_dicts(full_markdown: str):
-    """Split ``full_markdown`` into slides and return one parsed dict per slide, in order."""
-    return list(map(parse_slide_to_dict, split_markdown_slides(full_markdown)))