import base64
import csv
import os
import re
import uuid
from datetime import datetime
from io import BytesIO, StringIO

import boto3
import requests
from dotenv import load_dotenv
from pdf2image import convert_from_path
from pymongo import MongoClient

from extract_table_from_image import process_image_using_llm

load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

# Temporary working directories for the downloaded PDF and its page images
pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')


def cleanup_directory(directory_path):
    """Remove all files in a directory (the directory itself is kept)."""
    try:
        for filename in os.listdir(directory_path):
            file_path = os.path.join(directory_path, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"Cleaned up files in {directory_path}")
    except Exception as e:
        print(f"Error cleaning up directory {directory_path}: {e}")


def download_and_split_pdf_to_image(url):
    """Download the PDF at `url` and render each page as a PNG in image_temp_dir."""
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Fail early on HTTP errors instead of saving an error page as a PDF
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading the PDF from the object URL: {e}")
        return None

    try:
        images = convert_from_path(pdf_path)
        for i, image in enumerate(images):
            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
            image.save(image_path, 'PNG')
            print(f'Saved image: {image_path}')
        return True
    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return None


def upload_csv_file(file, csv_filename, content_type):
    """Upload a file-like object to S3 and return its metadata for MongoDB."""
    try:
        # Generate a unique key for the file using UUID
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'

        # Upload the CSV to S3
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
        )

        upload_time = datetime.now()

        # Metadata for MongoDB
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }
        return metadata
    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None


def process_pdf(url, filename):
    """Extract table data from every page of the PDF at `url`, upload one CSV
    per table to S3, and store the combined results on the document's MongoDB record."""
    split = download_and_split_pdf_to_image(url)
    if not split:
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return False

    # Sort page images numerically (page_2.png before page_10.png) and
    # ignore any stray files that do not match the page_<n> pattern.
    image_files = sorted(
        (f for f in os.listdir(image_temp_dir) if re.search(r'page_(\d+)', f)),
        key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
    )
    table_datas = []
    for count, image_name in enumerate(image_files, start=1):
        print(f"Processing page {count} of the PDF")
        image_path = os.path.join(image_temp_dir, image_name)
        with open(image_path, "rb") as image_file:
            image_data = base64.b64encode(image_file.read()).decode("utf-8")

        # Ask the LLM extractor whether this page contains a table, and if so,
        # return its rows along with descriptive metadata.
        result = process_image_using_llm(image_data, count, 3)
        has_table_data = result.get("has_table_data")
        if has_table_data:
            table_data = result.get("table_data") or []
            if not table_data:
                # Guard against an IndexError on table_data[0] below
                print(f"Table flagged on page {count} but no rows were returned")
                continue
            page_number = result.get("page_number")
            description = result.get("description")
            column_summary = result.get("column_summary")
            best_col1 = result.get("best_col1")
            best_col2 = result.get("best_col2")

            # Serialize the extracted rows to CSV in memory, using the first
            # row's keys as the header
            csv_buffer = StringIO()
            csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
            csv_writer.writeheader()
            csv_writer.writerows(table_data)
            csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))

            csv_filename = f"{filename}_pageNumber_{page_number}.csv"
            s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
            if s3_metadata:
                data = {
                    "table_data": table_data,
                    "description": description,
                    "column_summary": column_summary,
                    "page_number": page_number,
                    "csv_object_url": s3_metadata.get("object_url"),
                    "csv_s3_url": s3_metadata.get("s3_url"),
                    "best_col1": best_col1,
                    "best_col2": best_col2
                }
                table_datas.append(data)
        else:
            print(f"No table data found on page {count}")

    # Always clear the temp directories, whether or not tables were found
    cleanup_directory(pdf_temp_dir)
    cleanup_directory(image_temp_dir)

    if table_datas:
        collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
        return True

    print("No table data found in the entire PDF")
    return False
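

# Example invocation (a minimal sketch): process_pdf expects the object URL of
# a PDF whose document record already exists in MongoDB (the update_one above
# matches on "object_url"), plus a base name used for the per-page CSV files.
# The URL and filename below are hypothetical placeholders, not values from the
# real dataset.
if __name__ == "__main__":
    sample_url = "https://example-bucket.s3.amazonaws.com/sample_report.pdf"  # hypothetical
    if process_pdf(sample_url, "sample_report"):
        print("Table extraction completed and results stored in MongoDB")
    else:
        print("No tables were extracted from the PDF")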