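"""Pipeline for extracting tabular data from PDFs.

Downloads a PDF from an object URL, renders each page to a PNG, runs LLM-based
table extraction on every page, uploads the per-page tables to S3 as CSV files,
and stores the collected results on the matching MongoDB document.
"""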
import base64
from pdf2image import convert_from_path
from extract_table_from_image import process_image_using_llm
from pymongo import MongoClient
from datetime import datetime
import uuid
import os
import re
import csv
import requests
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
load_dotenv()
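# AWS, S3, and MongoDB configuration comes from environment variables
# (typically supplied via a .env file)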
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
s3 = boto3.client(
's3',
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
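# Local scratch directories for the downloaded PDF and its rendered page images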
pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
def cleanup_directory(directory_path):
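    """Delete every regular file in directory_path (subdirectories are left alone)."""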
try:
for filename in os.listdir(directory_path):
file_path = os.path.join(directory_path, filename)
if os.path.isfile(file_path):
os.remove(file_path)
print(f"Cleaned up files in {directory_path}")
except Exception as e:
print(f"Error cleaning up directory {directory_path}: {e}")
def download_and_split_pdf_to_image(url):
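    """Download the PDF at url and save each page as a PNG in image_temp_dir.

    Returns True on success, or None if the download or conversion fails.
    """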
try:
        response = requests.get(url)
        response.raise_for_status()  # fail fast on HTTP errors instead of writing an error page to disk
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading PDF from object URL: {e}")
        return None
try:
images = convert_from_path(pdf_path)
for i, image in enumerate(images):
image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
image.save(image_path, 'PNG')
print(f'Saved image: {image_path}')
return True
except Exception as e:
print(f"error occured in converting pdf pages to image : {e}")
return None
def upload_csv_file(file, csv_filename, content_type):
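    """Upload a file-like object to S3 under a UUID-prefixed key.

    Returns a metadata dict for MongoDB on success, or None on failure.
    """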
try:
# Generate a unique key for the file using UUID
uuid_str = str(uuid.uuid4())
s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
# Upload the CSV to S3
s3.upload_fileobj(
file,
AWS_BUCKET_NAME,
s3_key,
ExtraArgs={'ContentType': content_type} # Set the MIME type of the uploaded file
)
upload_time = datetime.now()
# Metadata for MongoDB
metadata = {
'name': csv_filename,
'type': content_type,
's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
's3_key': s3_key,
'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
'date_uploaded': upload_time.strftime('%Y-%m-%d'),
'time_uploaded': upload_time.strftime('%H:%M:%S')
}
return metadata
except Exception as e:
print(f"An error occurred during upload: {e}")
return None
def process_pdf(url, filename):
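    """Run the full extraction pipeline for one PDF.

    Splits the PDF into page images, extracts table data from each page with
    the LLM, uploads a CSV per table-bearing page to S3, and writes the
    collected results back to the MongoDB document matching object_url.
    Returns True if any table data was stored, False otherwise.
    """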
    split = download_and_split_pdf_to_image(url)
if split:
image_files = sorted(
os.listdir(image_temp_dir),
key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
)
        table_datas = []
for count, image_name in enumerate(image_files, start=1):
print(f"Processing page {count} of the PDF")
image_path = os.path.join(image_temp_dir, image_name)
with open(image_path, "rb") as image_file:
image_data = base64.b64encode(image_file.read()).decode("utf-8")
            result = process_image_using_llm(image_data, count, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                if not table_data:
                    # Defensive guard: the LLM flagged a table but returned no rows
                    print(f"has_table_data was set but table_data is empty on page {count}")
                    continue
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
                csv_filename = f"{filename}_pageNumber_{page_number}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
                if s3_metadata:
                    object_url = s3_metadata.get("object_url")
                    s3_url = s3_metadata.get("s3_url")
                    data = {
                        "table_data": table_data,
                        "description": description,
                        "column_summary": column_summary,
                        "page_number": page_number,
                        "csv_object_url": object_url,
                        "csv_s3_url": s3_url,
                        "best_col1": best_col1,
                        "best_col2": best_col2
                    }
                    table_datas.append(data)
                else:
                    print(f"CSV upload failed for page {page_number}; skipping this page's record")
else:
print(f"no table data found at page {count}")
if table_datas:
            collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
cleanup_directory(pdf_temp_dir)
cleanup_directory(image_temp_dir)
return True
        else:
            print("No table data found in the whole PDF")
            cleanup_directory(pdf_temp_dir)
            cleanup_directory(image_temp_dir)
            return False
    else:
        # Download or page conversion failed; clean up any partial files
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return False
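# Minimal usage sketch. Assumptions: a document with this "object_url" already
# exists in the MongoDB collection, and the URL below is purely hypothetical.
if __name__ == "__main__":
    sample_url = "https://example-bucket.s3.amazonaws.com/MoSPI_pdf_files/sample.pdf"  # hypothetical
    success = process_pdf(sample_url, "sample_report")
    print("Table extraction completed" if success else "No tables extracted")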