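"""process_pdf.py -- MOSPI_analysis_tool

Download a PDF from its S3 object URL, render each page to an image, extract
tabular data from each page with an LLM, upload a CSV per page to S3, and
write the extracted tables back to the matching MongoDB document.
"""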
import base64
from pdf2image import convert_from_path
from extract_table_from_image import process_image_using_llm
from pymongo import MongoClient
from datetime import datetime
import uuid
import os
import re
import csv
import requests
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
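
# S3 and MongoDB settings come from the environment; load_dotenv() lets a
# local .env file supply them during development.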
load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
s3 = boto3.client(
's3',
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
def cleanup_directory(directory_path):
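    """Remove every regular file in directory_path, leaving the directory itself in place."""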
try:
for filename in os.listdir(directory_path):
file_path = os.path.join(directory_path, filename)
if os.path.isfile(file_path):
os.remove(file_path)
print(f"Cleaned up files in {directory_path}")
except Exception as e:
print(f"Error cleaning up directory {directory_path}: {e}")
def download_and_split_pdf_to_image(url):
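    """Download the PDF at `url` and render each page to temp/page_images/page_<n>.png.

    Returns True on success and False if the download or conversion fails.
    """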
try:
        response = requests.get(url, timeout=120)
        response.raise_for_status()  # Fail on HTTP error responses instead of saving an error page as a PDF
with open(pdf_path, 'wb') as pdf_file:
pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading PDF from object URL: {e}")
        return False
try:
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
image.save(image_path, 'PNG')
print(f'Saved image: {image_path}')
return True
    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return False
def upload_csv_file(file, csv_filename, content_type):
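    """Upload a file-like CSV object to S3 under a UUID-prefixed key.

    Returns a metadata dict (name, content type, S3 URL, public object URL,
    upload date and time) on success, or None if the upload fails.
    """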
try:
# Generate a unique key for the file using UUID
uuid_str = str(uuid.uuid4())
s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
# Upload the CSV to S3
s3.upload_fileobj(
file,
AWS_BUCKET_NAME,
s3_key,
ExtraArgs={'ContentType': content_type} # Set the MIME type of the uploaded file
)
upload_time = datetime.now()
# Metadata for MongoDB
metadata = {
'name': csv_filename,
'type': content_type,
's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
's3_key': s3_key,
'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
'date_uploaded': upload_time.strftime('%Y-%m-%d'),
'time_uploaded': upload_time.strftime('%H:%M:%S')
}
return metadata
except Exception as e:
print(f"An error occurred during upload: {e}")
return None
def process_pdf(url, filename):
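    """Run the full pipeline for one PDF identified by its S3 object URL.

    Renders the pages, extracts tables with the LLM, uploads a CSV per page,
    and stores the collected results on the MongoDB document whose object_url
    matches `url`. Returns True if any table data was stored, False otherwise.
    """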
    split = download_and_split_pdf_to_image(url)
if split:
image_files = sorted(
os.listdir(image_temp_dir),
key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
)
        table_datas = []
for count, image_name in enumerate(image_files, start=1):
print(f"Processing page {count} of the PDF")
image_path = os.path.join(image_temp_dir, image_name)
with open(image_path, "rb") as image_file:
image_data = base64.b64encode(image_file.read()).decode("utf-8")
            result = process_image_using_llm(image_data, count, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")
                if not table_data:
                    print(f"has_table_data was set but table_data is empty on page {count}; skipping")
                    continue
                # Serialize the extracted rows to CSV in memory
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
                csv_filename = f"{filename}_pageNumber_{page_number}.csv"
s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
                if s3_metadata:
                    object_url = s3_metadata.get("object_url")
                    s3_url = s3_metadata.get("s3_url")
data = {
"table_data": table_data,
"description": description,
"column_summary": column_summary,
"page_number": page_number,
"csv_object_url":object_url,
"csv_s3_url":s3_url,
"best_col1": best_col1,
"best_col2": best_col2
}
table_datas.append(data)
else:
print(f"no table data found at page {count}")
if table_datas:
            collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
cleanup_directory(pdf_temp_dir)
cleanup_directory(image_temp_dir)
return True
        else:
            print("Found no table data in the whole PDF")
            cleanup_directory(pdf_temp_dir)
            cleanup_directory(image_temp_dir)
            return False
    else:
        # Download or page rendering failed; clean up any partial files and signal failure
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return False
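
# Minimal usage sketch: the URL and filename below are hypothetical examples;
# in practice `url` should be the object_url of a document already recorded in
# the MongoDB collection, since process_pdf updates that document in place.
if __name__ == "__main__":
    example_url = "https://example-bucket.s3.amazonaws.com/reports/sample.pdf"  # hypothetical
    ok = process_pdf(example_url, "sample_report")
    print("Table extraction succeeded" if ok else "No tables extracted")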