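"""process_pdf.py -- MOSPI_analysis_tool

Download a PDF from its S3 object URL, render each page to an image, extract
tabular data from each page with an LLM, upload a CSV per page to S3, and
write the extracted tables back to the matching MongoDB document.
"""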
import base64
from pdf2image import convert_from_path
from extract_table_from_image import process_image_using_llm
from pymongo import MongoClient
from datetime import datetime
import uuid
import os
import re
import csv
import requests
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
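
# S3 and MongoDB settings come from the environment; load_dotenv() lets a
# local .env file supply them during development.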
load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
s3 = boto3.client(
's3',
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
def cleanup_directory(directory_path):
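    """Remove every regular file in directory_path, leaving the directory itself in place."""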
try:
for filename in os.listdir(directory_path):
file_path = os.path.join(directory_path, filename)
if os.path.isfile(file_path):
os.remove(file_path)
print(f"Cleaned up files in {directory_path}")
except Exception as e:
print(f"Error cleaning up directory {directory_path}: {e}")
def download_and_split_pdf_to_image(url):
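    """Download the PDF at `url` and render each page to temp/page_images/page_<n>.png.

    Returns True on success and False if the download or conversion fails.
    """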
try:
        response = requests.get(url, timeout=120)
        response.raise_for_status()  # Fail on HTTP error responses instead of saving an error page as a PDF
with open(pdf_path, 'wb') as pdf_file:
pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading PDF from object URL: {e}")
        return False
try:
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
image.save(image_path, 'PNG')
print(f'Saved image: {image_path}')
return True
    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return False
def upload_csv_file(file, csv_filename, content_type):
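    """Upload a file-like CSV object to S3 under a UUID-prefixed key.

    Returns a metadata dict (name, content type, S3 URL, public object URL,
    upload date and time) on success, or None if the upload fails.
    """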
try:
# Generate a unique key for the file using UUID
uuid_str = str(uuid.uuid4())
s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
# Upload the CSV to S3
s3.upload_fileobj(
file,
AWS_BUCKET_NAME,
s3_key,
ExtraArgs={'ContentType': content_type} # Set the MIME type of the uploaded file
)
upload_time = datetime.now()
# Metadata for MongoDB
metadata = {
'name': csv_filename,
'type': content_type,
's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
's3_key': s3_key,
'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
'date_uploaded': upload_time.strftime('%Y-%m-%d'),
'time_uploaded': upload_time.strftime('%H:%M:%S')
}
return metadata
except Exception as e:
print(f"An error occurred during upload: {e}")
return None
def process_pdf(url, filename):
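    """Run the full pipeline for one PDF identified by its S3 object URL.

    Renders the pages, extracts tables with the LLM, uploads a CSV per page,
    and stores the collected results on the MongoDB document whose object_url
    matches `url`. Returns True if any table data was stored, False otherwise.
    """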
    split = download_and_split_pdf_to_image(url)
if split:
image_files = sorted(
os.listdir(image_temp_dir),
key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
)
        table_datas = []
for count, image_name in enumerate(image_files, start=1):
print(f"Processing page {count} of the PDF")
image_path = os.path.join(image_temp_dir, image_name)
with open(image_path, "rb") as image_file:
image_data = base64.b64encode(image_file.read()).decode("utf-8")
            result = process_image_using_llm(image_data, count, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")
                if not table_data:
                    print(f"has_table_data was set but table_data is empty on page {count}; skipping")
                    continue
                # Serialize the extracted rows to CSV in memory
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
                csv_filename = f"{filename}_pageNumber_{page_number}.csv"
s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
                if s3_metadata:
                    object_url = s3_metadata.get("object_url")
                    s3_url = s3_metadata.get("s3_url")
data = {
"table_data": table_data,
"description": description,
"column_summary": column_summary,
"page_number": page_number,
"csv_object_url":object_url,
"csv_s3_url":s3_url,
"best_col1": best_col1,
"best_col2": best_col2
}
table_datas.append(data)
else:
print(f"no table data found at page {count}")
if table_datas:
            collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
cleanup_directory(pdf_temp_dir)
cleanup_directory(image_temp_dir)
return True
        else:
            print("Found no table data in the whole PDF")
            cleanup_directory(pdf_temp_dir)
            cleanup_directory(image_temp_dir)
            return False
    else:
        # Download or page rendering failed; clean up any partial files and signal failure
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return False
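
# Minimal usage sketch: the URL and filename below are hypothetical examples;
# in practice `url` should be the object_url of a document already recorded in
# the MongoDB collection, since process_pdf updates that document in place.
if __name__ == "__main__":
    example_url = "https://example-bucket.s3.amazonaws.com/reports/sample.pdf"  # hypothetical
    ok = process_pdf(example_url, "sample_report")
    print("Table extraction succeeded" if ok else "No tables extracted")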