"""PDF-to-markdown conversion (via the ``mineru`` CLI) and section extraction."""

import glob
import json
import os
import re
import shutil
import subprocess
import tempfile
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

from langchain.schema import Document
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from .path_utils import get_path

# Conversion methods that are passed through to ``mineru -m`` unchanged.
_MINERU_METHODS = ('auto', 'txt', 'ocr')

# Maps every character that is stripped from output file names to a space.
_FILENAME_SANITIZER = str.maketrans(dict.fromkeys('<>:"/\\|?*_', ' '))


class DocumentLoading:
    """Convert PDFs to markdown with ``mineru`` and extract paper sections."""

    # ------------------------------------------------------------------
    # PDF -> markdown conversion
    # ------------------------------------------------------------------
    def _ensure_markdown(self, pdf_file, output_dir, method):
        """Make sure the markdown for *pdf_file* exists, converting if needed.

        The markdown is expected at
        ``<output_dir>/<base_name>/<method>/<base_name>.md``. When it is
        already there the conversion is skipped. Otherwise ``mineru`` is run;
        if it fails, or finishes without producing the markdown file, the
        per-document output folder is removed.

        Only ``subprocess.CalledProcessError`` is handled here; any other
        exception raised by ``subprocess.run`` propagates to the caller.

        Returns:
            The markdown file path on success, ``None`` on failure.
        """
        base_name = os.path.splitext(os.path.basename(pdf_file))[0]
        target_dir = os.path.join(output_dir, base_name)
        md_file_path = os.path.join(target_dir, method, f"{base_name}.md")
        print("The md file path is: ", md_file_path)

        if os.path.exists(md_file_path):
            print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
            return md_file_path

        command = ["mineru", "-p", pdf_file, "-o", output_dir, "-m", method]
        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"An error occurred during conversion: {e}")
            # Remove the folder if the failed run left one behind.
            if os.path.exists(target_dir):
                print(f"Cleaning up incomplete folder: {target_dir}")
                shutil.rmtree(target_dir)
            return None

        # mineru exited successfully; verify the markdown was really written.
        if not os.path.exists(md_file_path):
            print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
            # The folder may not exist at all, so guard the removal instead of
            # letting rmtree raise FileNotFoundError.
            if os.path.isdir(target_dir):
                shutil.rmtree(target_dir)
            return None

        print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
        return md_file_path

    def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
        """Convert a single PDF to markdown with ``mineru``.

        Args:
            pdf_file: Path of the PDF to convert.
            output_dir: Directory handed to ``mineru -o``.
            method: Conversion method handed to ``mineru -m``.

        Returns:
            Always ``None``; progress and failures are reported via ``print``.
        """
        self._ensure_markdown(pdf_file, output_dir, method)

    def convert_pdf_to_md_new(self, pdf_dir, output_dir="output", method="auto"):
        """Convert every ``*.pdf`` directly inside *pdf_dir* to markdown.

        A PDF is skipped when ``<output_dir>/<base_name>`` already exists.
        A failing conversion is reported and the loop continues with the
        next file.
        """
        for pdf_file in glob.glob(os.path.join(pdf_dir, "*.pdf")):
            base_name = os.path.splitext(os.path.basename(pdf_file))[0]
            target_dir = os.path.join(output_dir, base_name)

            if os.path.exists(target_dir):
                print(f"Folder for {pdf_file} already exists in {output_dir}. Skipping conversion.")
                continue

            command = ["mineru", "-p", pdf_file, "-o", output_dir, "-m", method]
            try:
                subprocess.run(command, check=True)
            except subprocess.CalledProcessError as e:
                print(f"An error occurred: {e}")
            else:
                print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")

    @classmethod
    def batch_convert_pdfs(cls, pdf_files, output_dir="output", method="auto", max_workers=None):
        """Convert several PDFs in parallel using a process pool.

        This is a classmethod so that it works both as
        ``DocumentLoading.batch_convert_pdfs(files)`` and on an instance.

        Args:
            pdf_files: Iterable of PDF paths.
            output_dir: Directory handed to ``mineru -o``.
            method: Conversion method handed to ``mineru -m``.
            max_workers: Pool size; ``None`` uses the executor's default.
        """
        # A bound method is submitted so the worker processes can resolve it.
        converter = cls()
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(converter.convert_pdf_to_md, pdf, output_dir, method)
                for pdf in pdf_files
            ]
            for future in futures:
                try:
                    # result() re-raises any exception from the worker.
                    future.result()
                except Exception as exc:
                    print(f"An error occurred during processing: {exc}")

    # ------------------------------------------------------------------
    # Markdown -> structured sections
    # ------------------------------------------------------------------
    def extract_information_from_md(self, md_text):
        """Extract title, authors, abstract, introduction and main content.

        The text is treated as blank-line separated paragraphs. Sections that
        cannot be located are reported as ``"N/A"``.

        Returns:
            A dict with the keys ``title``, ``authors``, ``abstract``,
            ``introduction`` and ``main_content``.
        """
        # Title: everything up to the first blank line (or the end of text).
        title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
        title = title_match.group(1).strip() if title_match else "N/A"

        # Authors: the text between a blank line and the "Abstract" heading.
        authors_match = re.search(
            r'\n\n(.*?)(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)',
            md_text,
            re.DOTALL,
        )
        authors = authors_match.group(1).strip() if authors_match else "N/A"

        # Abstract: the heading line plus the paragraph that follows it.
        abstract_match = re.search(
            r'(\n\n[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT][^\n]*\n\n)(.*?)(\n\n|\Z)',
            md_text,
            re.DOTALL,
        )
        abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
        # Drop the leading "Abstract" word, then any leading non-letters.
        abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
        abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)

        # Introduction: the text after an "Introduction" heading, up to the
        # lookahead that matches the following ("2" / "II") heading.
        introduction_match = re.search(
            r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
            r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
            md_text,
            re.DOTALL,
        )
        introduction = introduction_match.group(2).strip() if introduction_match else "N/A"

        # Main content: everything before the first "References" heading, or
        # the whole text when there is none.
        main_content_match = re.search(
            r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
            md_text,
            re.DOTALL,
        )
        if main_content_match:
            main_content = main_content_match.group(1).strip()
        else:
            main_content = "N/A"

        return {
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "introduction": introduction,
            "main_content": main_content,
        }

    def _extract_and_save(self, md_file_path, survey_id):
        """Load a markdown file, extract its sections and save them as JSON.

        The JSON is written to ``get_path('txt', survey_id, '<title>.json')``
        where ``<title>`` is the markdown file's base name with characters
        that are unsafe in file names (and underscores) replaced by spaces.

        Returns:
            The dict produced by :meth:`extract_information_from_md`, with
            ``abstract`` replaced by the title when it is shorter than
            10 characters.
        """
        data = UnstructuredMarkdownLoader(md_file_path).load()
        assert len(data) == 1, "Expected exactly one document in the markdown file."
        assert isinstance(data[0], Document), "The loaded data is not of type Document."

        extracted_data = self.extract_information_from_md(data[0].page_content)
        # A very short abstract is treated as a failed extraction.
        if len(extracted_data["abstract"]) < 10:
            extracted_data["abstract"] = extracted_data['title']

        title = os.path.splitext(os.path.basename(md_file_path))[0]
        title_new = title.strip().translate(_FILENAME_SANITIZER)

        os.makedirs(get_path('txt', survey_id), exist_ok=True)
        with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=4)
        return extracted_data

    def process_md_file(self, md_file_path, survey_id):
        """Extract and save the sections of a markdown file.

        Returns:
            The extracted introduction (``"N/A"`` when none was found).
        """
        return self._extract_and_save(md_file_path, survey_id)['introduction']

    def process_md_file_full(self, md_file_path, survey_id):
        """Extract and save the sections of a markdown file.

        Returns:
            The abstract, introduction and main content concatenated without
            separators.
        """
        extracted_data = self._extract_and_save(md_file_path, survey_id)
        # NOTE(review): main_content is matched from the start of the
        # document, so it already includes the abstract and introduction
        # text; the concatenation therefore repeats them. Kept as-is because
        # callers may rely on the current output.
        return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']

    # ------------------------------------------------------------------
    # PDF -> extracted text
    # ------------------------------------------------------------------
    def load_pdf(self, pdf_file, survey_id, mode):
        """Convert a PDF to markdown (if needed) and return extracted text.

        Parameters
        ----------
        pdf_file : str
            Absolute path of the PDF file.
        survey_id : str
            ID of the current survey; used to organise the output folders.
        mode : str
            Mode passed in by the front end. May be one of
            intro / full / auto / txt / ocr.

            * mineru only supports auto / txt / ocr. A valid mineru method is
              passed through unchanged; anything else falls back to 'auto'.
              This keeps the mineru method decoupled from the front end's
              intro/full notion.
            * The read type decides what is returned:
              - mode == 'intro' -> only the introduction
              - anything else   -> full text (abstract + intro + main content)

        Returns
        -------
        str or None
            The extracted text, or ``None`` when the conversion failed.
        """
        if mode in _MINERU_METHODS:
            mineru_method = mode
            read_type = 'full'
        else:
            mineru_method = 'auto'  # default mineru parsing method
            read_type = 'intro' if mode == 'intro' else 'full'

        md_file_path = self._ensure_markdown(pdf_file, get_path('md', survey_id), mineru_method)
        if md_file_path is None:
            return None

        if read_type == 'intro':
            return self.process_md_file(md_file_path, survey_id)
        return self.process_md_file_full(md_file_path, survey_id)

    def load_pdf_new(self, pdf_dir, survey_id):
        """Convert every ``*.pdf`` in *pdf_dir* into the survey's md folder.

        Uses the ``auto`` mineru method and skips PDFs whose output folder
        already exists.
        """
        self.convert_pdf_to_md_new(pdf_dir, output_dir=get_path('md', survey_id), method="auto")

    def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4, mode="auto"):
        """Run :meth:`load_pdf` for several PDFs in a process pool.

        Args:
            pdf_files: Iterable of PDF paths.
            survey_id: ID of the current survey.
            max_workers: Size of the process pool.
            mode: Mode forwarded to :meth:`load_pdf` for every file.

        Exceptions raised by individual tasks are printed, not re-raised.
        """
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self.load_pdf, pdf, survey_id, mode) for pdf in pdf_files]
            for future in futures:
                try:
                    # result() re-raises any exception from the worker.
                    future.result()
                except Exception as exc:
                    print(f"An error occurred during processing: {exc}")

    def ensure_non_empty_introduction(self, introduction, full_text):
        """Return *introduction*, or the first 1000 characters of *full_text*
        when the introduction is shorter than 50 characters."""
        if len(introduction) < 50:
            return full_text[:1000]
        return introduction

    def extract_information_from_md_new(self, md_text):
        """Alias of :meth:`extract_information_from_md`.

        Kept for backward compatibility; both methods used identical
        extraction rules, so this one now delegates.
        """
        return self.extract_information_from_md(md_text)