import base64
from pdf2image import convert_from_path
from extract_table_from_image import process_image_using_llm
from pymongo import MongoClient
from datetime import datetime
import uuid
import os
import re
import csv
import requests
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3

load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)

pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
def cleanup_directory(directory_path):
    # Remove all files from a temp directory once processing is done.
    try:
        for filename in os.listdir(directory_path):
            file_path = os.path.join(directory_path, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"Cleaned up files in {directory_path}")
    except Exception as e:
        print(f"Error cleaning up directory {directory_path}: {e}")
def download_and_split_pdf_to_image(url):
    # Download the PDF from its object URL, then render each page as a PNG image.
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading PDF from object URL: {e}")
        return None
    try:
        images = convert_from_path(pdf_path)
        for i, image in enumerate(images):
            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
            image.save(image_path, 'PNG')
            print(f'Saved image: {image_path}')
        return True
    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return None
def upload_csv_file(file, csv_filename, content_type):
    try:
        # Generate a unique key for the file using UUID
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
        # Upload the CSV to S3
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
        )
        upload_time = datetime.now()
        # Metadata for MongoDB
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }
        return metadata
    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None
def process_pdf(url, filename):
    split = download_and_split_pdf_to_image(url)
    if split:
        # Process pages in order by sorting on the page number embedded in the image filename.
        image_files = sorted(
            os.listdir(image_temp_dir),
            key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
        )
        table_datas = []
        for count, image_name in enumerate(image_files, start=1):
            print(f"Processing page {count} of the PDF")
            image_path = os.path.join(image_temp_dir, image_name)
            with open(image_path, "rb") as image_file:
                image_data = base64.b64encode(image_file.read()).decode("utf-8")
            result = process_image_using_llm(image_data, count, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")
                # Write the extracted rows to an in-memory CSV and upload it to S3.
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
                csv_filename = f"{filename}_pageNumber_{page_number}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
                if s3_metadata:
                    object_url = s3_metadata.get("object_url")
                    s3_url = s3_metadata.get("s3_url")
                    data = {
                        "table_data": table_data,
                        "description": description,
                        "column_summary": column_summary,
                        "page_number": page_number,
                        "csv_object_url": object_url,
                        "csv_s3_url": s3_url,
                        "best_col1": best_col1,
                        "best_col2": best_col2
                    }
                    table_datas.append(data)
            else:
                print(f"No table data found on page {count}")
        if table_datas:
            collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
            cleanup_directory(pdf_temp_dir)
            cleanup_directory(image_temp_dir)
            return True
        else:
            print("Found no table data in the whole PDF")
            cleanup_directory(pdf_temp_dir)
            cleanup_directory(image_temp_dir)
            return False
    else:
        # Download or page conversion failed; nothing to process.
        return False
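
# Example invocation (a minimal sketch; the URL, filename, and the __main__ guard are
# assumptions for illustration and not part of the original script). In this pipeline,
# `url` is expected to be the PDF's public object URL already stored in MongoDB under
# "object_url", and `filename` becomes the prefix of each per-page CSV name.
if __name__ == "__main__":
    sample_url = "https://example-bucket.s3.amazonaws.com/sample_report.pdf"  # hypothetical
    succeeded = process_pdf(sample_url, "sample_report")
    print("Table extraction succeeded" if succeeded else "No tables extracted")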