Spaces:

technicolor
/

InteractiveSurvey

Sleeping

App Files Files Community

InteractiveSurvey / src /demo /asg_loader.py

technicolor

update

92d8c87 6 days ago

raw

history blame

13 kB

	import os
	import re
	import json
	import subprocess
	import glob
	from pathlib import Path
	from concurrent.futures import ProcessPoolExecutor
	from langchain_community.document_loaders import UnstructuredMarkdownLoader
	from langchain.schema import Document
	import shutil
	import tempfile
	from .path_utils import get_path

	class DocumentLoading:
	def convert_pdf_to_md(self, pdf_file, output_dir="output", method="auto"):
	    """Convert one PDF to Markdown by invoking the ``mineru`` command-line tool.

	    The Markdown file is expected at
	    ``<output_dir>/<pdf base name>/<method>/<pdf base name>.md``. If that file
	    already exists the conversion is skipped. If the command fails, or
	    finishes without producing the expected file, the per-PDF output folder
	    is removed.

	    Args:
	        pdf_file: Path of the PDF to convert.
	        output_dir: Directory passed to ``mineru -o``.
	        method: Value passed to ``mineru -m``; also the name of the
	            sub-folder in which the Markdown file is looked up.

	    Returns:
	        None. Progress and failures are reported with ``print``.
	    """
	    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
	    target_dir = os.path.join(output_dir, base_name)
	    md_file_path = os.path.join(target_dir, method, f"{base_name}.md")
	    print("The md file path is: ", md_file_path)

	    if os.path.exists(md_file_path):
	        print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
	        return

	    command = ["mineru", "-p", pdf_file, "-o", output_dir, "-m", method]
	    try:
	        subprocess.run(command, check=True)
	    except subprocess.CalledProcessError as e:
	        print(f"An error occurred during conversion: {e}")
	        # If the failed run already created the folder, remove it.
	        if os.path.exists(target_dir):
	            print(f"Cleaning up incomplete folder: {target_dir}")
	            shutil.rmtree(target_dir)
	        return

	    # Check whether the Markdown file was actually generated.
	    if os.path.exists(md_file_path):
	        print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
	        return

	    print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
	    # The command can exit successfully without creating the folder at all;
	    # removing a missing folder would raise FileNotFoundError.
	    if os.path.exists(target_dir):
	        shutil.rmtree(target_dir)
	# new
	def convert_pdf_to_md_new(self, pdf_dir, output_dir="output", method="auto"):
	    """Run ``mineru`` on every ``*.pdf`` file located directly in ``pdf_dir``.

	    A PDF is skipped when ``<output_dir>/<pdf base name>`` already exists.
	    A failing command is reported with ``print`` and the loop continues
	    with the next file.

	    Args:
	        pdf_dir: Directory that is searched (non-recursively) for PDFs.
	        output_dir: Directory passed to ``mineru -o``.
	        method: Value passed to ``mineru -m``.

	    Returns:
	        None.
	    """
	    pattern = os.path.join(pdf_dir, "*.pdf")
	    for pdf_file in glob.glob(pattern):
	        stem, _ext = os.path.splitext(os.path.basename(pdf_file))
	        target_dir = os.path.join(output_dir, stem)

	        # An existing result folder counts as "already converted".
	        if os.path.exists(target_dir):
	            print(f"Folder for {pdf_file} already exists in {output_dir}. Skipping conversion.")
	            continue

	        try:
	            subprocess.run(["mineru", "-p", pdf_file, "-o", output_dir, "-m", method], check=True)
	        except subprocess.CalledProcessError as e:
	            print(f"An error occurred: {e}")
	        else:
	            print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")

	def batch_convert_pdfs(pdf_files, output_dir="output", method="auto", max_workers=None):
	# Create a process pool to run the conversion in parallel
	with ProcessPoolExecutor(max_workers=max_workers) as executor:
	# Submit each PDF file to the process pool for conversion
	futures = [executor.submit(convert_pdf_to_md, pdf, output_dir, method) for pdf in pdf_files]

	# Optionally, you can monitor the status of each future as they complete
	for future in futures:
	try:
	future.result() # This will raise any exceptions that occurred during the processing
	except Exception as exc:
	print(f"An error occurred during processing: {exc}")

	def extract_information_from_md(self, md_text):
	    """Split the text of a paper into its main parts using regex heuristics.

	    Paragraphs are expected to be separated by blank lines. The title is the
	    first paragraph, the authors are the text between the title and the
	    "Abstract" heading, the abstract is the paragraph after that heading,
	    the introduction is the text between an "Introduction" heading and a
	    following paragraph that starts with "2"/"II", and the main content is
	    everything before a "References" heading (or the whole text when there
	    is none).

	    Args:
	        md_text: Full text of the document.

	    Returns:
	        dict with the keys ``"title"``, ``"authors"``, ``"abstract"``,
	        ``"introduction"`` and ``"main_content"``. ``"authors"``,
	        ``"abstract"`` and ``"introduction"`` are ``"N/A"`` when their
	        pattern does not match.
	    """
	    # Heading letters may be separated by whitespace ("A B S T R A C T").
	    abstract_heading = r'[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT]'

	    # Title: everything up to the first blank line (or the end of the text).
	    title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
	    title = title_match.group(1).strip() if title_match else "N/A"

	    # Authors: text between the first blank line and the abstract heading.
	    authors_match = re.search(
	        r'\n\n(.*?)(\n\n' + abstract_heading + r'[^\n]*\n\n)',
	        md_text,
	        re.DOTALL
	    )
	    authors = authors_match.group(1).strip() if authors_match else "N/A"

	    # Abstract: heading line plus the paragraph that follows it. The whole
	    # match is kept so that text on the heading line itself is not lost;
	    # the heading word and leading non-letters are stripped afterwards.
	    abstract_match = re.search(
	        r'(\n\n' + abstract_heading + r'[^\n]*\n\n)(.*?)(\n\n|\Z)',
	        md_text,
	        re.DOTALL
	    )
	    abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
	    abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
	    abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)

	    # Introduction: optional "1"/"I" numbering, the heading, then the body up
	    # to (but excluding) a paragraph that starts with "2", "2I" or "II".
	    introduction_match = re.search(
	        r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
	        r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
	        md_text,
	        re.DOTALL
	    )
	    introduction = introduction_match.group(2).strip() if introduction_match else "N/A"

	    # Main content: everything before the "References" heading, or the whole
	    # text when no such heading exists.
	    main_content_match = re.search(
	        r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
	        md_text,
	        re.DOTALL
	    )
	    main_content = main_content_match.group(1).strip() if main_content_match else "N/A"

	    return {
	        "title": title,
	        "authors": authors,
	        "abstract": abstract,
	        "introduction": introduction,
	        "main_content": main_content
	    }

	def process_md_file(self, md_file_path, survey_id):
	    """Parse a Markdown paper, save the extracted fields as JSON, return the introduction.

	    The file is loaded with ``UnstructuredMarkdownLoader``, split with
	    ``extract_information_from_md`` and written to
	    ``get_path('txt', survey_id, '<sanitised file name>.json')``.

	    Args:
	        md_file_path: Path of the Markdown file.
	        survey_id: Identifier forwarded to ``get_path``.

	    Returns:
	        The extracted ``"introduction"`` string.

	    Raises:
	        AssertionError: If the loader does not return exactly one
	            ``Document``.
	    """
	    loader = UnstructuredMarkdownLoader(md_file_path)
	    data = loader.load()
	    assert len(data) == 1, "Expected exactly one document in the markdown file."
	    assert isinstance(data[0], Document), "The loaded data is not of type Document."
	    extracted_text = data[0].page_content

	    extracted_data = self.extract_information_from_md(extracted_text)
	    # An abstract shorter than 10 characters is replaced by the title.
	    if len(extracted_data["abstract"]) < 10:
	        extracted_data["abstract"] = extracted_data['title']

	    title = os.path.splitext(os.path.basename(md_file_path))[0]
	    title_new = title.strip()
	    # The pipe must be listed as a bare '|': a backslash-prefixed entry can
	    # never match because backslashes are replaced earlier in this loop.
	    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
	    for char in invalid_chars:
	        title_new = title_new.replace(char, ' ')

	    os.makedirs(get_path('txt', survey_id), exist_ok=True)
	    with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
	        json.dump(extracted_data, f, ensure_ascii=False, indent=4)
	    return extracted_data['introduction']

	def process_md_file_full(self, md_file_path, survey_id):
	    """Parse a Markdown paper, save the extracted fields as JSON, return its body text.

	    The file is loaded with ``UnstructuredMarkdownLoader``, split with
	    ``extract_information_from_md`` and written to
	    ``get_path('txt', survey_id, '<sanitised file name>.json')``.

	    Args:
	        md_file_path: Path of the Markdown file.
	        survey_id: Identifier forwarded to ``get_path``.

	    Returns:
	        The extracted abstract, introduction and main content concatenated
	        in that order, with no separator.

	    Raises:
	        AssertionError: If the loader does not return exactly one
	            ``Document``.
	    """
	    loader = UnstructuredMarkdownLoader(md_file_path)
	    data = loader.load()
	    assert len(data) == 1, "Expected exactly one document in the markdown file."
	    assert isinstance(data[0], Document), "The loaded data is not of type Document."
	    extracted_text = data[0].page_content

	    extracted_data = self.extract_information_from_md(extracted_text)
	    # An abstract shorter than 10 characters is replaced by the title.
	    if len(extracted_data["abstract"]) < 10:
	        extracted_data["abstract"] = extracted_data['title']

	    title = os.path.splitext(os.path.basename(md_file_path))[0]
	    title_new = title.strip()
	    # The pipe must be listed as a bare '|': a backslash-prefixed entry can
	    # never match because backslashes are replaced earlier in this loop.
	    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '_']
	    for char in invalid_chars:
	        title_new = title_new.replace(char, ' ')

	    os.makedirs(get_path('txt', survey_id), exist_ok=True)
	    with open(get_path('txt', survey_id, f'{title_new}.json'), 'w', encoding='utf-8') as f:
	        json.dump(extracted_data, f, ensure_ascii=False, indent=4)
	    return extracted_data['abstract'] + extracted_data['introduction'] + extracted_data['main_content']

	def load_pdf(self, pdf_file, survey_id, mode):
	    """Convert a PDF with ``mineru`` (unless already converted) and process the result.

	    The Markdown file is expected at
	    ``<get_path('md', survey_id)>/<pdf base name>/<mode>/<pdf base name>.md``.
	    When it exists, before or after running the command, it is handed to
	    ``process_md_file``. When the command fails or produces no such file,
	    the per-PDF output folder is removed.

	    Args:
	        pdf_file: Path of the PDF to load.
	        survey_id: Identifier forwarded to ``get_path`` and
	            ``process_md_file``.
	        mode: Value passed to ``mineru -m``; also the name of the
	            sub-folder in which the Markdown file is looked up.

	    Returns:
	        The value returned by ``process_md_file``, or ``None`` when no
	        Markdown file could be produced.
	    """
	    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
	    target_dir = os.path.join(get_path('md', survey_id), base_name)
	    md_file_path = os.path.join(target_dir, mode, f"{base_name}.md")
	    print("The md file path is: ", md_file_path)

	    if os.path.exists(md_file_path):
	        print(f"Markdown file for {pdf_file} already exists at {md_file_path}. Skipping conversion.", flush=True)
	        return self.process_md_file(md_file_path, survey_id)

	    command = ["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", mode]
	    try:
	        subprocess.run(command, check=True)
	    except subprocess.CalledProcessError as e:
	        print(f"An error occurred during conversion: {e}")
	        # If the failed run already created the folder, remove it.
	        if os.path.exists(target_dir):
	            print(f"Cleaning up incomplete folder: {target_dir}")
	            shutil.rmtree(target_dir)
	        return None

	    # Check whether the Markdown file was actually generated.
	    if not os.path.exists(md_file_path):
	        print(f"Conversion failed: Markdown file not found at {md_file_path}. Cleaning up folder...")
	        # The command can exit successfully without creating the folder at
	        # all; removing a missing folder would raise FileNotFoundError.
	        if os.path.exists(target_dir):
	            shutil.rmtree(target_dir)
	        return None

	    print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")
	    return self.process_md_file(md_file_path, survey_id)

	def load_pdf_new(self, pdf_dir, survey_id):
	    """Run ``mineru`` (method ``auto``) on every ``*.pdf`` file directly in ``pdf_dir``.

	    Output goes to ``get_path('md', survey_id)``. A PDF is skipped when a
	    folder named after it already exists there. A failing command is
	    reported with ``print`` and the loop continues with the next file.

	    Args:
	        pdf_dir: Directory that is searched (non-recursively) for PDFs.
	        survey_id: Identifier forwarded to ``get_path``.

	    Returns:
	        None.
	    """
	    pattern = os.path.join(pdf_dir, "*.pdf")
	    for pdf_file in glob.glob(pattern):
	        stem, _ext = os.path.splitext(os.path.basename(pdf_file))
	        target_dir = os.path.join(get_path('md', survey_id), stem)

	        # An existing result folder counts as "already converted".
	        if os.path.exists(target_dir):
	            print(f"Folder for {pdf_file} already exists in {get_path('md', survey_id)}. Skipping conversion.")
	            continue

	        try:
	            subprocess.run(["mineru", "-p", pdf_file, "-o", get_path('md', survey_id), "-m", "auto"], check=True)
	        except subprocess.CalledProcessError as e:
	            print(f"An error occurred: {e}")
	        else:
	            print(f"Successfully converted {pdf_file} to markdown format in {target_dir}.")

	def parallel_load_pdfs(self, pdf_files, survey_id, max_workers=4):
	    """Run ``load_pdf`` (mode ``"auto"``) for each PDF in a process pool.

	    An exception raised by a task is printed and does not stop the
	    remaining tasks. The values returned by ``load_pdf`` are discarded.

	    Args:
	        pdf_files: Iterable of PDF paths.
	        survey_id: Identifier forwarded to ``load_pdf``.
	        max_workers: Forwarded to ``ProcessPoolExecutor``.

	    Returns:
	        None.
	    """
	    with ProcessPoolExecutor(max_workers=max_workers) as pool:
	        # Queue one conversion task per PDF.
	        pending = []
	        for pdf in pdf_files:
	            pending.append(pool.submit(self.load_pdf, pdf, survey_id, "auto"))

	        # Collect the tasks in submission order; result() re-raises any
	        # exception that occurred in the worker.
	        for task in pending:
	            try:
	                task.result()
	            except Exception as exc:
	                print(f"An error occurred during processing: {exc}")

	def ensure_non_empty_introduction(self, introduction, full_text):
	    """Return ``introduction``, or the start of ``full_text`` when it is too short.

	    Args:
	        introduction: Extracted introduction text.
	        full_text: Text to fall back on.

	    Returns:
	        ``introduction`` when it has at least 50 characters, otherwise the
	        first 1000 characters of ``full_text``.
	    """
	    long_enough = len(introduction) >= 50
	    return introduction if long_enough else full_text[:1000]

	def extract_information_from_md_new(self, md_text):
	    """Split the text of a paper into its main parts using regex heuristics.

	    Paragraphs are expected to be separated by blank lines. The title is the
	    first paragraph, the authors are the text between the title and the
	    "Abstract" heading, the abstract is the paragraph after that heading,
	    the introduction is the text between an "Introduction" heading and a
	    following paragraph that starts with "2"/"II", and the main content is
	    everything before a "References" heading (or the whole text when there
	    is none).

	    Args:
	        md_text: Full text of the document.

	    Returns:
	        dict with the keys ``"title"``, ``"authors"``, ``"abstract"``,
	        ``"introduction"`` and ``"main_content"``. ``"authors"``,
	        ``"abstract"`` and ``"introduction"`` are ``"N/A"`` when their
	        pattern does not match.
	    """
	    # Heading letters may be separated by whitespace ("A B S T R A C T").
	    abstract_heading = r'[aA][\s]*[bB][\s]*[sS][\s]*[tT][\s]*[rR][\s]*[aA][\s]*[cC][\s]*[tT]'

	    # Title extraction: everything up to the first blank line (or the end).
	    title_match = re.search(r'^(.*?)(\n\n|\Z)', md_text, re.DOTALL)
	    title = title_match.group(1).strip() if title_match else "N/A"

	    # Authors extraction: text between the first blank line and the
	    # abstract heading.
	    authors_match = re.search(
	        r'\n\n(.*?)(\n\n' + abstract_heading + r'[^\n]*\n\n)',
	        md_text,
	        re.DOTALL
	    )
	    authors = authors_match.group(1).strip() if authors_match else "N/A"

	    # Abstract extraction: heading line plus the paragraph that follows it.
	    # The whole match is kept so that text on the heading line itself is not
	    # lost; the heading word and leading non-letters are stripped afterwards.
	    abstract_match = re.search(
	        r'(\n\n' + abstract_heading + r'[^\n]*\n\n)(.*?)(\n\n|\Z)',
	        md_text,
	        re.DOTALL
	    )
	    abstract = abstract_match.group(0).strip() if abstract_match else "N/A"
	    abstract = re.sub(r'^[aA]\s*[bB]\s*[sS]\s*[tT]\s*[rR]\s*[aA]\s*[cC]\s*[tT][^\w]*', '', abstract)
	    abstract = re.sub(r'^[^a-zA-Z]*', '', abstract)

	    # Introduction extraction: optional "1"/"I" numbering, the heading, then
	    # the body up to (but excluding) a paragraph that starts with "2", "2I"
	    # or "II".
	    introduction_match = re.search(
	        r'\n\n([1I][\.\- ]?\s*)?[Ii]\s*[nN]\s*[tT]\s*[rR]\s*[oO]\s*[dD]\s*[uU]\s*[cC]\s*[tT]\s*[iI]\s*[oO]\s*[nN][\.\- ]?\s*\n\n(.*?)'
	        r'(?=\n\n(?:([2I][I]|\s*2)[^\n]*?\n\n|\n\n(?:[2I][I][^\n]*?\n\n)))',
	        md_text,
	        re.DOTALL
	    )
	    introduction = introduction_match.group(2).strip() if introduction_match else "N/A"

	    # Main content extraction: everything before the "References" heading,
	    # or the whole text when no such heading exists.
	    main_content_match = re.search(
	        r'(.*?)(\n\n([3I][\.\- ]?\s*)?[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss][^\n]*\n\n|\Z)',
	        md_text,
	        re.DOTALL
	    )
	    main_content = main_content_match.group(1).strip() if main_content_match else "N/A"

	    return {
	        "title": title,
	        "authors": authors,
	        "abstract": abstract,
	        "introduction": introduction,
	        "main_content": main_content
	    }