danial0203 committed on
Commit
52c66f2
·
verified ·
1 Parent(s): 81c8a2b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -0
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import json
import os
from io import BytesIO
from tempfile import NamedTemporaryFile, TemporaryDirectory

import gradio
import gradio as gr
import pandas as pd
import requests
from pdf2image import convert_from_path
from PIL import Image
11
+
12
# Best-effort install of poppler-utils at startup (used for PDF conversion).
# "-y" answers apt's confirmation prompt; without it a non-interactive
# install aborts and the package is never installed.
os.system("apt-get update")
os.system("apt-get install -y poppler-utils")
14
+
15
+ # Function to convert PDF to images or open a single image
16
def get_images(file_path):
    """Load the image(s) contained in ``file_path``.

    The decision is made on the lower-cased file extension:

    * ``.pdf`` -- returns whatever ``convert_from_path`` produces.
    * ``.tiff`` / ``.tif`` / ``.png`` / ``.jpg`` / ``.jpeg`` -- returns a
      one-element list holding the result of ``Image.open``.
    * anything else -- returns an empty list.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        return convert_from_path(file_path)
    if suffix in (".tiff", ".tif", ".png", ".jpg", ".jpeg"):
        return [Image.open(file_path)]
    return []
25
+
26
+ # Function to encode image to base64
27
def encode_image_to_base64(image):
    """Return ``image`` saved as JPEG and encoded as a base64 text string.

    Images whose mode is "P" or "RGBA" are converted to "RGB" before saving;
    every other mode is passed to ``save`` unchanged.
    """
    jpeg_source = image.convert("RGB") if image.mode in ("P", "RGBA") else image

    sink = BytesIO()
    jpeg_source.save(sink, format="JPEG")
    jpeg_bytes = sink.getvalue()

    encoded = base64.b64encode(jpeg_bytes)
    return encoded.decode("utf-8")
34
+
35
def process_files_fixed(image_path, page_identifier, error_pages, request_timeout=120):
    """Extract table rows from one page image via the OpenAI chat completions API.

    The image at ``image_path`` is encoded as base64 JPEG and posted together
    with a prompt asking for the S.No / Reg # / Roll No. / Marks columns in a
    JSON envelope. The JSON object is cut out of the reply text (first ``{``
    to last ``}``) and its ``"data"`` list is returned.

    Args:
        image_path: Path of the image file to analyse.
        page_identifier: Label of the page/file; used in the printed messages
            and appended to ``error_pages`` on failure.
        error_pages: List that is mutated in place; receives
            ``page_identifier`` whenever no records could be obtained.
        request_timeout: Seconds to wait for the HTTP request before giving
            up, so a stalled connection cannot hang the run indefinitely.

    Returns:
        The non-empty list stored under ``"data"`` in the reply, or ``[]``
        when the call fails, the reply is empty/unparseable, or no records
        were found. Failures are reported via ``print`` and ``error_pages``
        rather than raised.
    """

    def _fail(message):
        # Single place for the "report, record, return nothing" failure path.
        print(message)
        error_pages.append(page_identifier)
        return []

    api_key = os.getenv('OPENAI_API_KEY')
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    prompt = """Analyze the table in the provided image, focusing on the first five columns labeled S.No, Reg #, Roll No. and Marks. In case the table headers are not visible or not present, assume the mentioned order for the columns. Extract and list the data only from these columns, omitting any additional columns that may be present. But DO NOT skip any row from the table, extract all the rows present in the table.

Return the response in the following JSoN response format:
```
{
"data": [
{
"S_No": "1",
"Reg": "059287",
"Roll_No": "2345234",
"Marks": "20"
},
{
"S_No": "2",
"Reg": "059288",
"Roll_No": "2345235",
"Marks": "25"
},
...
]
}
```"""

    try:
        # Reading the image and calling the API both happen inside the try so
        # that a missing file or a network error is recorded for this page
        # instead of aborting the remaining pages.
        with Image.open(image_path) as image:
            base64_image = encode_image_to_base64(image)

        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high",
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 4096,
        }

        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=request_timeout,
        )

        if response.status_code != 200:
            return _fail(f"Error in API call for page/file: {page_identifier}")

        json_response = response.json()
        response_content = json_response["choices"][0]["message"]["content"]
        if not response_content:
            return _fail(f"No content in JSON response for page/file: {page_identifier}")

        # The model may wrap the JSON in prose or a code fence; keep only the
        # outermost {...} span before parsing.
        json_string = response_content[response_content.find("{"): response_content.rfind("}") + 1]
        json_data = json.loads(json_string)
        if "data" in json_data and json_data["data"]:
            return json_data["data"]
        return _fail(f"No records found in page/file: {page_identifier}")
    except Exception as e:
        # Deliberately broad: any per-page failure is logged and skipped.
        return _fail(f"Exception processing page/file {page_identifier}: {e}")
120
+
121
error_pages = []  # Initialize the list to track error pages or files
file_path = "/content/testing 2.pdf"  # Replace with your actual file path

# This batch run executes at import time. When the hard-coded sample file is
# not present, skip it (zero images) instead of letting the PDF conversion
# raise and prevent the rest of the module from loading.
images = get_images(file_path) if os.path.exists(file_path) else []
print("Total images found:", len(images))

structured_data = []
for i, image in enumerate(images, start=1):
    print(f"Processing image {i}...")
    with TemporaryDirectory() as temp_dir:
        image_path = os.path.join(temp_dir, "image.jpg")
        image.save(image_path)
        # Must run while the temporary directory (and the image in it) exists.
        data = process_files_fixed(image_path, i, error_pages)  # Pass the page number or identifier
    print(f"{len(data)} records found...")
    structured_data.extend(data)

if structured_data:
    df = pd.DataFrame(structured_data)
    csv_filename = 'results.csv'
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")
else:
    print("No data to save.")

if error_pages:
    print(f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}")
146
def process_pdf_and_generate_csv(file_path):
    """Run table extraction over every page of ``file_path`` and write a CSV.

    Each image returned by ``get_images`` is saved as a temporary JPEG and
    passed to ``process_files_fixed``; the extracted rows of all pages are
    concatenated into one CSV file.

    Args:
        file_path: Path of the uploaded PDF (or single image) to process.

    Returns:
        A ``(csv_path, message)`` tuple. ``csv_path`` is the path of a
        temporary ``.csv`` file that is left on disk for the caller, or
        ``None`` when no rows were extracted. ``message`` summarises the
        pages that failed or yielded no records.
    """
    error_pages = []  # Pages/files that failed or produced no records
    structured_data = []

    for page_number, image in enumerate(get_images(file_path), start=1):
        # JPEG cannot store palette or alpha data; convert first so that
        # saving to "image.jpg" does not raise for such images.
        if image.mode in ("P", "RGBA"):
            image = image.convert("RGB")
        with TemporaryDirectory() as temp_dir:
            image_path = os.path.join(temp_dir, "image.jpg")
            image.save(image_path)
            # Must run while the temporary directory still exists.
            data = process_files_fixed(image_path, page_number, error_pages)
        structured_data.extend(data or [])

    if not structured_data:
        return None, "No data to save or an error occurred."

    # Reserve a file name that outlives this function (delete=False) and close
    # the handle right away so pandas can reopen the path on every platform.
    with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        csv_path = tmp_file.name
    pd.DataFrame(structured_data).to_csv(csv_path, index=False)
    return csv_path, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"
166
+
167
def gradio_interface(pdf_file):
    """Gradio callback: turn the uploaded file into a CSV path and a message.

    Args:
        pdf_file: The value delivered by the file input. It may be ``None``
            (nothing uploaded), an object exposing the path as ``.name``, or
            the path string itself, depending on how the File component is
            configured.

    Returns:
        A ``(csv_path_or_None, message)`` tuple matching the two outputs of
        the interface.
    """
    if pdf_file is None:
        return None, "No file was uploaded."

    # Accept both a file wrapper (``.name``) and a plain path string.
    upload_path = getattr(pdf_file, "name", pdf_file)
    result_csv, message = process_pdf_and_generate_csv(upload_path)
    return result_csv, message
173
+
174
# Build the web UI: one file upload in, a downloadable CSV plus a status
# message out. Components are referenced from the top-level gradio namespace
# (gr.File / gr.Textbox) rather than the legacy gr.inputs / gr.outputs modules.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Please upload your PDF file"),
    outputs=[
        gr.File(label="Download the generated CSV file"),
        gr.Textbox(label="Messages"),
    ],
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)

iface.queue().launch(share=False)