# NOTE: Hugging Face Spaces file-viewer metadata (runtime status lines, commit
# hashes, and a line-number gutter) was captured along with this file during
# extraction; it is not part of the program source.
import gradio as gr
from tempfile import TemporaryDirectory, NamedTemporaryFile
from pdf2image import convert_from_path
from PIL import Image
import os
from io import BytesIO
import base64
import requests
import pandas as pd
import json
import logging
# --- One-time environment setup -------------------------------------------
# pdf2image shells out to poppler's `pdftoppm`, which is not preinstalled in
# the container, so install it at startup.  `-y` is required: without it,
# apt-get asks for confirmation and aborts in this non-interactive
# environment, leaving poppler missing and PDF conversion broken.
os.system("apt-get update")
os.system("apt-get install -y poppler-utils")

# Log only errors; timestamps help correlate failures with specific pages.
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')
# Load an uploaded document as a list of page images.
def get_images(file_path):
    """Return a list of PIL images for *file_path*.

    PDFs are rasterised one image per page via pdf2image; common raster
    formats are opened directly as a single-element list.  Unrecognised
    extensions yield an empty list rather than raising.
    """
    suffix = os.path.splitext(file_path)[-1].lower()
    if suffix == ".pdf":
        return convert_from_path(file_path)
    if suffix in (".tiff", ".tif", ".png", ".jpg", ".jpeg"):
        return [Image.open(file_path)]
    return []
# Serialise a PIL image to a base64 JPEG string for the API payload.
def encode_image_to_base64(image):
    """Return *image* encoded as JPEG, base64-encoded, as a UTF-8 str.

    Palette ("P") and alpha ("RGBA") images cannot be written as JPEG,
    so those modes are converted to plain RGB first.
    """
    if image.mode in ("P", "RGBA"):
        image = image.convert("RGB")
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    encoded = base64.b64encode(jpeg_buffer.getvalue())
    return encoded.decode("utf-8")
def process_files_fixed(image_path, page_identifier, error_pages):
    """OCR one page image through the OpenAI vision API and return its rows.

    Parameters
    ----------
    image_path : str
        Path of the page image (JPEG) to send to the model.
    page_identifier : int or str
        Page number or file name, used only for error reporting.
    error_pages : list
        Mutated in place: *page_identifier* is appended on every failure.

    Returns
    -------
    list
        The ``data`` records parsed from the model's JSON reply, or an
        empty list on any error (the error is logged and recorded).
    """
    api_key = os.getenv('OPENAI_API_KEY')
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    try:
        image = Image.open(image_path)
        base64_image = encode_image_to_base64(image)
    except Exception as e:
        logging.error(f"Failed to process image at {image_path}: {e}")
        error_pages.append(page_identifier)
        return []
    prompt = """Perform OCR on this image. Analyze the table in the provided image, focusing on the first five columns labeled S.No, Reg #, Roll No. and Marks. Get the marks from the 6th column and write them in fifth column in integers, make sure to check them as well for correct integer number, I don't want any mistakes in the obtained marks. In case the table headers are not visible or not present, assume the mentioned order for the columns. Extract and list the data only from these columns, omitting any additional columns that may be present. But DO NOT skip any row from the table, extract all the rows present in the table. The obtained marks are written in both integral and written format as well. Verify both for better ocr in integers.
Return the response in the following JSON response format:
{
"data": [
{
"S_No": "1",
"Reg": "059287",
"Roll_No": "2345234",
"Marks": "20"
},
...
]
}"""
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        # The chat-completions API expects an object with a
                        # "url" key here; the bare-string form is a deprecated
                        # early format and is rejected by the current API.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                    }
                ]
            }
        ],
        "max_tokens": 4096
    }
    try:
        # A timeout keeps a hung connection from stalling the whole batch;
        # it surfaces as RequestException and is handled below.
        response = requests.post("https://api.openai.com/v1/chat/completions",
                                 headers=headers, json=payload, timeout=120)
        logging.info(f"Full API response: {response.text}")
        if response.status_code == 200:
            json_response = response.json()
            response_content = json_response["choices"][0]["message"]["content"]
            if response_content:
                try:
                    # The model may wrap the JSON in prose/markdown; slice out
                    # the outermost {...} span before parsing.
                    json_string = response_content[response_content.find("{"): response_content.rfind("}") + 1]
                    json_data = json.loads(json_string)
                    if "data" in json_data and json_data["data"]:
                        return json_data["data"]
                    else:
                        logging.error(f"No records found in page/file: {page_identifier}")
                        error_pages.append(page_identifier)
                        return []
                except json.JSONDecodeError:
                    logging.error(f"JSON parsing error in response for page/file {page_identifier}")
                    error_pages.append(page_identifier)
                    return []
            else:
                logging.error(f"No content in JSON response for page/file {page_identifier}")
                error_pages.append(page_identifier)
                return []
        else:
            logging.error(f"Error in API call for page/file {page_identifier}: HTTP {response.status_code} - {response.text}")
            error_pages.append(page_identifier)
            return []
    except requests.exceptions.RequestException as e:
        logging.error(f"Network or API error when processing page/file {page_identifier}: {e}")
        error_pages.append(page_identifier)
        return []
def process_pdf_and_generate_csv(file_path):
    """Run OCR extraction over every page of *file_path* and write a CSV.

    Parameters
    ----------
    file_path : str
        Path to the uploaded PDF or image file.

    Returns
    -------
    tuple
        ``(csv_path, message)``; ``csv_path`` is ``None`` when no rows
        were extracted.
    """
    error_pages = []  # page numbers/files that failed or yielded no rows
    images = get_images(file_path)
    structured_data = []
    for page_number, image in enumerate(images, start=1):
        with TemporaryDirectory() as temp_dir:
            image_path = os.path.join(temp_dir, "image.jpg")
            # JPEG cannot store palette/alpha images (e.g. PNG uploads):
            # saving such a mode to a .jpg path raises OSError, so
            # normalise to RGB before writing the intermediate file.
            if image.mode in ("P", "RGBA"):
                image = image.convert("RGB")
            image.save(image_path)
            rows = process_files_fixed(image_path, page_number, error_pages)
            structured_data.extend(rows or [])
    if not structured_data:
        return None, "No data to save or an error occurred."
    df = pd.DataFrame(structured_data)
    # delete=False so the path outlives this function for Gradio to serve;
    # close the handle explicitly so the file descriptor is not leaked
    # (pandas reopens the path itself when writing).
    tmp_file = NamedTemporaryFile(delete=False, suffix='.csv')
    tmp_file.close()
    df.to_csv(tmp_file.name, index=False)
    return tmp_file.name, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"
def gradio_interface(pdf_file):
    """Gradio callback: run the extraction pipeline on the uploaded file.

    Returns the generated CSV path (or ``None`` on failure) together with
    a status message for the UI.
    """
    csv_path, status = process_pdf_and_generate_csv(pdf_file.name)
    return (csv_path, status) if csv_path else (None, status)
# Build and launch the web UI.  queue() serialises concurrent requests so
# simultaneous uploads do not interleave API calls; share=False keeps the
# app bound to the hosting environment only.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Please upload your PDF file"),
    outputs=[
        gr.File(label="Download the generated CSV file"),
        gr.Textbox(label="Messages"),
    ],
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)
iface.queue().launch(share=False)