File size: 5,596 Bytes
7df83cd
50693de
52c66f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9558ac4
 
52c66f2
 
 
2c52ed6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import gradio as gr
from tempfile import TemporaryDirectory, NamedTemporaryFile
from pdf2image import convert_from_path
from PIL import Image
import os
from io import BytesIO
import base64
import requests
import pandas as pd
import json

# Install the poppler command-line tools at start-up; pdf2image's
# convert_from_path shells out to them to rasterise PDF pages.
# "-y" is required: without it apt-get stops at the interactive
# "Do you want to continue?" prompt and aborts when no terminal is attached.
_apt_update_status = os.system("apt-get update")
_apt_install_status = os.system("apt-get install -y poppler-utils")
if _apt_install_status != 0:
    # Best-effort: image uploads still work without poppler, so only warn.
    print(
        "Warning: could not install poppler-utils "
        f"(apt-get update status {_apt_update_status}, "
        f"install status {_apt_install_status}); PDF conversion may fail."
    )

# Function to convert PDF to images or open a single image
def get_images(file_path):
    """Return the images found in *file_path* as a list.

    The decision is made purely on the (case-insensitive) file extension:

    * ``.pdf``  -> whatever ``convert_from_path`` returns for the file
      (presumably one image per page -- confirm against pdf2image docs).
    * ``.tiff``, ``.tif``, ``.png``, ``.jpg``, ``.jpeg`` -> a one-element
      list holding the image opened with ``Image.open``.
    * anything else -> an empty list.
    """
    suffix = os.path.splitext(file_path)[-1].lower()

    if suffix == ".pdf":
        return convert_from_path(file_path)

    if suffix in (".tiff", ".tif", ".png", ".jpg", ".jpeg"):
        return [Image.open(file_path)]

    # Unsupported extension: nothing to process.
    return []

# Function to encode image to base64
def encode_image_to_base64(image):
    """Encode *image* as JPEG and return the bytes as a base64 string.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert`` and
            ``save``.

    Returns:
        The JPEG data, base64-encoded and decoded to a ``str`` (UTF-8).
    """
    # Modes the JPEG writer can store directly. Everything else (palette,
    # alpha, 16/32-bit integer, float, ...) would make ``save`` raise, so it
    # is converted to RGB first. The original only handled "P" and "RGBA".
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_writable_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

def process_files_fixed(image_path, page_identifier, error_pages, timeout=120):
    """Extract table rows from one page image via the OpenAI chat API.

    The image at *image_path* is base64-encoded as JPEG and posted together
    with a fixed extraction prompt. The reply text is searched for the
    outermost ``{...}`` span, which is parsed as JSON; the list stored under
    its ``"data"`` key is returned.

    Args:
        image_path: Path of the image file to send.
        page_identifier: Label for this page/file; used in log messages and
            appended to *error_pages* when extraction fails.
        error_pages: List that is mutated in place. It receives
            *page_identifier* whenever no rows are returned.
        timeout: Timeout in seconds handed to ``requests.post`` so a stalled
            connection cannot block the run indefinitely.

    Returns:
        The non-empty list found under ``"data"`` in the reply, or ``[]``
        when the key is missing/empty, the API call fails, or any exception
        is raised while requesting or parsing.
    """

    def _fail(message):
        # Log the problem, remember the page, and hand back the empty result.
        print(message)
        error_pages.append(page_identifier)
        return []

    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Without a key the request would be sent as "Bearer None" and be
        # rejected anyway; fail fast instead of spending a network round trip.
        return _fail(f"OPENAI_API_KEY is not set; skipping page/file: {page_identifier}")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Read the image and encode it; the context manager releases the file
    # handle so the caller can delete the temporary file afterwards.
    with Image.open(image_path) as image:
        base64_image = encode_image_to_base64(image)

    # NOTE(review): the prompt says "first five columns" but names only four,
    # and spells "JSoN". Left byte-identical because editing the prompt changes
    # model behaviour -- confirm the intended column count before touching it.
    prompt = """Analyze the table in the provided image, focusing on the first five columns labeled S.No, Reg #, Roll No. and Marks. In case the table headers are not visible or not present, assume the mentioned order for the columns. Extract and list the data only from these columns, omitting any additional columns that may be present. But DO NOT skip any row from the table, extract all the rows present in the table.

Return the response in the following JSoN response format:
```
{
    "data": [
        {
            "S_No": "1",
            "Reg": "059287",
            "Roll_No": "2345234",
            "Marks": "20"
        },
        {
            "S_No": "2",
            "Reg": "059288",
            "Roll_No": "2345235",
            "Marks": "25"
        },
        ...
    ]
}
```"""

    # NOTE(review): confirm this model id is still served by the API.
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high",
                        }
                    }
                ]
            }
        ],
        "max_tokens": 4096,
    }

    try:
        # The request lives inside the try block so that connection errors
        # and timeouts are recorded per page instead of aborting the run.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        if response.status_code != 200:
            return _fail(f"Error in API call for page/file: {page_identifier}")

        json_response = response.json()
        response_content = json_response["choices"][0]["message"]["content"]
        if not response_content:
            return _fail(f"No content in JSON response for page/file: {page_identifier}")

        # The reply may wrap the JSON in prose or code fences; keep only the
        # outermost {...} span. If no braces exist the slice is empty and
        # json.loads raises, which is handled below.
        json_string = response_content[response_content.find("{"): response_content.rfind("}") + 1]
        json_data = json.loads(json_string)
        if "data" in json_data and json_data["data"]:
            return json_data["data"]
        return _fail(f"No records found in page/file: {page_identifier}")
    except Exception as e:
        # Deliberately broad: one bad page must not stop the remaining pages.
        return _fail(f"Exception processing page/file {page_identifier}: {e}")

def process_pdf_and_generate_csv(file_path):
    """Run table extraction over every page of *file_path* and write a CSV.

    Each image returned by ``get_images`` is written to a temporary JPEG,
    passed to ``process_files_fixed`` (numbered from 1), and the returned rows
    are accumulated into one table.

    Args:
        file_path: Path of the uploaded PDF or image file.

    Returns:
        A ``(csv_path, message)`` tuple. ``csv_path`` is the path of a
        temporary ``.csv`` file holding all extracted rows, or ``None`` when
        no rows were extracted. ``message`` summarises the pages that
        produced errors or no records.
    """
    # Modes the JPEG writer can store directly; other modes (e.g. the RGBA or
    # palette images typical of PNG uploads) make ``save`` raise for a ".jpg"
    # target, so they are converted to RGB before being written.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    error_pages = []  # Pages/files that failed or yielded no records
    structured_data = []

    for i, image in enumerate(get_images(file_path), start=1):
        with TemporaryDirectory() as temp_dir:
            image_path = os.path.join(temp_dir, "image.jpg")
            if image.mode not in jpeg_writable_modes:
                image = image.convert("RGB")
            image.save(image_path)
            data = process_files_fixed(image_path, i, error_pages)
            structured_data.extend(data or [])

    if not structured_data:
        return None, "No data to save or an error occurred."

    df = pd.DataFrame(structured_data)
    # Reserve a temporary file name to return through Gradio, then close the
    # handle before pandas writes to that path so no descriptor is leaked.
    with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)
    return csv_path, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"

def gradio_interface(pdf_file):
    """Gradio callback: turn the uploaded file into a CSV path and a message.

    Args:
        pdf_file: The value Gradio passes for the file input. ``None`` when
            nothing was uploaded; otherwise either an object whose ``name``
            attribute is the file path or a plain string path.

    Returns:
        A ``(csv_path_or_None, message)`` tuple matching the two output
        components of the interface.
    """
    if pdf_file is None:
        # Submitting without an upload used to raise AttributeError on `.name`.
        return None, "No file uploaded."

    file_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    result_csv, message = process_pdf_and_generate_csv(file_path)
    # Normalise any falsy path to None so the file output stays empty.
    return (result_csv or None), message

# Build the web UI: one file upload as input; a downloadable CSV plus a
# status text box as outputs, both filled by gradio_interface.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Please upload your PDF file"),
    outputs=[
        gr.File(label="Download the generated CSV file"),
        gr.Textbox(label="Messages"),
    ],
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)

# Start the server immediately; share=False requests no public share link.
iface.launch(share=False)