Spaces:
Running
Running
S3 configuration done
Browse files- .gitignore +1 -0
- app.py +98 -44
- requirements.txt +2 -1
- s3_setup.py +44 -0
- sample.py +8 -7
.gitignore
CHANGED
@@ -19,3 +19,4 @@ test_images_folder
|
|
19 |
uploads
|
20 |
pause_space.py
|
21 |
.DS_Store
|
|
|
|
19 |
uploads
|
20 |
pause_space.py
|
21 |
.DS_Store
|
22 |
+
test_s3_client.py
|
app.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Dict
|
|
4 |
import os
|
5 |
import shutil
|
6 |
import logging
|
|
|
7 |
|
8 |
import torch
|
9 |
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
|
@@ -159,57 +160,98 @@ def perform_inference(file_paths: Dict[str, str]):
|
|
159 |
"cheque_file": cheque_model,
|
160 |
"gst_file": gst_model,
|
161 |
}
|
|
|
|
|
|
|
162 |
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
attachemnt_num = 3
|
183 |
|
184 |
-
|
185 |
-
context = pan_model
|
186 |
-
processor = processor_pan
|
187 |
-
name = "pan"
|
188 |
-
attachemnt_num = 2
|
189 |
|
190 |
-
if doc_type == "gst_file":
|
191 |
-
context = gst_model
|
192 |
-
processor = processor_gst
|
193 |
-
name = "gst"
|
194 |
-
attachemnt_num = 4
|
195 |
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
|
|
|
|
|
203 |
|
204 |
-
|
205 |
-
result = handle(inference_batch, context,processor,name)
|
206 |
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
-
return inference_results
|
213 |
|
214 |
# Routes
|
215 |
@app.get("/")
|
@@ -247,18 +289,30 @@ async def aadhar_ocr(
|
|
247 |
f_path = value
|
248 |
preprocessing = doc_processing(name,id_type,doc_type,f_path)
|
249 |
response = preprocessing.process()
|
250 |
-
files[key] = response["output_p"]
|
|
|
251 |
print("response",response)
|
252 |
|
253 |
|
254 |
# Perform inference
|
255 |
result = perform_inference(files)
|
256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
return {"status": "success", "result": result}
|
258 |
|
|
|
259 |
except Exception as e:
|
260 |
logging.error(f"Error processing files: {e}")
|
261 |
# raise HTTPException(status_code=500, detail="Internal Server Error")
|
262 |
-
return {
|
|
|
|
|
|
|
263 |
|
264 |
|
|
|
4 |
import os
|
5 |
import shutil
|
6 |
import logging
|
7 |
+
from s3_setup import s3_client
|
8 |
|
9 |
import torch
|
10 |
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
|
|
|
160 |
"cheque_file": cheque_model,
|
161 |
"gst_file": gst_model,
|
162 |
}
|
163 |
+
try:
|
164 |
+
# Dictionary to store results for each document type
|
165 |
+
inference_results = {}
|
166 |
|
167 |
+
# Loop through the file paths and perform inference
|
168 |
+
for doc_type, file_path in file_paths.items():
|
169 |
+
if doc_type in model_dirs:
|
170 |
+
print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
|
171 |
+
|
172 |
+
# Prepare batch for inference
|
173 |
+
processed_file_p = file_path.split("&&")[0]
|
174 |
+
unprocessed_file_path = file_path.split("&&")[1]
|
175 |
+
|
176 |
+
images_path = [processed_file_p]
|
177 |
+
inference_batch = prepare_batch_for_inference(images_path)
|
178 |
+
|
179 |
+
# Prepare context for the specific document type
|
180 |
+
# context = {"model_dir": model_dirs[doc_type]}
|
181 |
+
#initialize s3 client
|
182 |
+
client = s3_client()
|
183 |
+
|
184 |
+
local_file_path= unprocessed_file_path
|
185 |
+
bucket_name = "edgekycdocs"
|
|
|
186 |
|
187 |
+
file_name = unprocessed_file_path.split("/")[-1]
|
|
|
|
|
|
|
|
|
188 |
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
+
|
191 |
+
|
192 |
+
# context = aadhar_model
|
193 |
+
if doc_type == "aadhar_file":
|
194 |
+
context = aadhar_model
|
195 |
+
processor = processor_aadhar
|
196 |
+
name = "aadhar"
|
197 |
+
attachemnt_num = 3
|
198 |
+
folder_name = "aadhardocs"
|
199 |
+
|
200 |
+
|
201 |
+
if doc_type == "pan_file":
|
202 |
+
context = pan_model
|
203 |
+
processor = processor_pan
|
204 |
+
name = "pan"
|
205 |
+
attachemnt_num = 2
|
206 |
+
folder_name = "pandocs"
|
207 |
+
|
208 |
+
if doc_type == "gst_file":
|
209 |
+
context = gst_model
|
210 |
+
processor = processor_gst
|
211 |
+
name = "gst"
|
212 |
+
attachemnt_num = 4
|
213 |
+
folder_name = "gstdocs"
|
214 |
+
|
215 |
+
if doc_type == "cheque_file":
|
216 |
+
context = cheque_model
|
217 |
+
processor = processor_cheque
|
218 |
+
name = "cheque"
|
219 |
+
attachemnt_num = 8
|
220 |
+
folder_name = "bankchequedocs"
|
221 |
+
|
222 |
|
223 |
|
224 |
+
# upload the document to s3 bucket here
|
225 |
+
|
226 |
|
227 |
+
response = client.upload_file(local_file_path,bucket_name,folder_name,file_name)
|
|
|
228 |
|
229 |
+
print("The file has been uploaded to s3 bucket",response)
|
230 |
+
|
231 |
+
|
232 |
+
# Perform inference (replace `handle` with your actual function)
|
233 |
+
result = handle(inference_batch, context,processor,name)
|
234 |
+
# result["attachment_url": response["url"]]
|
235 |
+
result["attachment_url"] = response["url"]
|
236 |
+
result["detect"] = True
|
237 |
+
|
238 |
+
print("result required",result)
|
239 |
+
|
240 |
+
# if result[""]
|
241 |
+
|
242 |
+
# Store the result
|
243 |
+
inference_results["attachment_{}".format(attachemnt_num)] = result
|
244 |
+
else:
|
245 |
+
print(f"Model directory not found for {doc_type}. Skipping.")
|
246 |
+
# print(Javed)
|
247 |
+
|
248 |
+
return inference_results
|
249 |
+
except:
|
250 |
+
return {
|
251 |
+
"status": "error",
|
252 |
+
"message": "Text extraction failed."
|
253 |
+
}
|
254 |
|
|
|
255 |
|
256 |
# Routes
|
257 |
@app.get("/")
|
|
|
289 |
f_path = value
|
290 |
preprocessing = doc_processing(name,id_type,doc_type,f_path)
|
291 |
response = preprocessing.process()
|
292 |
+
files[key] = response["output_p"] + "&&" + f_path
|
293 |
+
# files["unprocessed_file_path"] = f_path
|
294 |
print("response",response)
|
295 |
|
296 |
|
297 |
# Perform inference
|
298 |
result = perform_inference(files)
|
299 |
|
300 |
+
print("this is the result we got",result)
|
301 |
+
if "status" in list(result.keys()):
|
302 |
+
raise Exception("Custom error message")
|
303 |
+
# if result["status"] == "error":
|
304 |
+
|
305 |
+
|
306 |
+
|
307 |
return {"status": "success", "result": result}
|
308 |
|
309 |
+
|
310 |
except Exception as e:
|
311 |
logging.error(f"Error processing files: {e}")
|
312 |
# raise HTTPException(status_code=500, detail="Internal Server Error")
|
313 |
+
return {
|
314 |
+
"status": 400,
|
315 |
+
"message": "Text extraction failed."
|
316 |
+
}
|
317 |
|
318 |
|
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ pillow
|
|
8 |
google-cloud-vision
|
9 |
python-dotenv
|
10 |
pymupdf
|
11 |
-
pillow
|
|
|
|
8 |
google-cloud-vision
|
9 |
python-dotenv
|
10 |
pymupdf
|
11 |
+
pillow
|
12 |
+
boto3
|
s3_setup.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import boto3
|
2 |
+
# AWS credentials (if not set in environment variables or AWS CLI config)
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
|
7 |
+
from utils import doc_processing
|
8 |
+
|
9 |
+
# Load variables from the .env file so the os.getenv() lookups below can see them.
load_dotenv()

# AWS credentials read from the environment; os.getenv returns None when a
# variable is not set.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

# Report only whether each credential is present. The values themselves must
# never be written to stdout, because container/Space logs are routinely
# shared and retained.
print("AWS_ACCESS_KEY_ID", "set" if AWS_ACCESS_KEY_ID else "missing")
print("AWS_SECRET_ACCESS_KEY", "set" if AWS_SECRET_ACCESS_KEY else "missing")
|
16 |
+
# Initialize S3 client
|
17 |
+
|
18 |
+
class s3_client:
    """Small wrapper that uploads local files to S3 through boto3.

    The credentials are copied from the module-level ``AWS_ACCESS_KEY_ID`` and
    ``AWS_SECRET_ACCESS_KEY`` values when the instance is created.
    """

    # Region used when composing the object URL returned by ``upload_file``.
    # NOTE(review): assumes every target bucket lives in this region - confirm.
    region_name = "eu-north-1"

    def __init__(self):
        self.aws_access_key_id = AWS_ACCESS_KEY_ID
        self.aws_secret_access_key = AWS_SECRET_ACCESS_KEY

    def initialize(self):
        """Return a new boto3 S3 client built from the stored credentials."""
        return boto3.client(
            "s3",
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
        )

    def upload_file(self, local_file_path, bucket_name, folder_name, file_name):
        """Upload a local file to ``<bucket_name>/<folder_name>/<file_name>``.

        Args:
            local_file_path: Path of the file on disk to upload.
            bucket_name: Name of the destination S3 bucket.
            folder_name: Key prefix ("folder") inside the bucket.
            file_name: Final component of the object key.

        Returns:
            On success: ``{"status": 200, "message": ..., "url": ...}`` where
            ``url`` points at the uploaded object in ``bucket_name``.
            On failure: ``{"status": 400, "message": <error text>}``. No
            ``"url"`` key is present, so callers must check ``status`` first.
        """
        object_key = f"{folder_name}/{file_name}"
        try:
            client = self.initialize()
            client.upload_file(local_file_path, bucket_name, object_key)
            print(f"File uploaded successfully to {bucket_name}/{object_key}")
            # Build the URL from the bucket that was actually written to,
            # instead of a hard-coded bucket name.
            # NOTE(review): the key is not URL-encoded; file names containing
            # spaces or reserved characters will yield an invalid link.
            url = (
                f"https://{bucket_name}.s3.{self.region_name}"
                f".amazonaws.com/{object_key}"
            )
            print("file url", url)
            return {
                "status": 200,
                "message": "file uploaded successfully",
                "url": url,
            }
        except Exception as e:
            # Best-effort boundary: report the failure to the caller as data.
            # The message is stringified so the dict stays JSON-serializable.
            print("Error uploading file:", e)
            return {"status": 400, "message": str(e)}
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
|
sample.py
CHANGED
@@ -2,17 +2,18 @@ import requests
|
|
2 |
import sys
|
3 |
|
4 |
# Define the API endpoint
|
5 |
-
# url = "http://
|
6 |
-
test_url = "http://
|
7 |
|
8 |
-
response = requests.get(
|
9 |
|
10 |
-
post_url = "http://127.0.0.1:7860/api/aadhar_ocr"
|
11 |
-
print("Status Code:", response.status_code)
|
12 |
-
print("Response Text:", response.text)
|
13 |
|
14 |
-
|
|
|
|
|
|
|
15 |
|
|
|
16 |
|
17 |
# response = requests.get(url)
|
18 |
|
|
|
2 |
import sys
|
3 |
|
4 |
# Define the API endpoint
|
5 |
+
# url = "http://localhost:7680/"
|
6 |
+
# # test_url = "http://localhost:7860/"
|
7 |
|
8 |
+
# response = requests.get(url)
|
9 |
|
|
|
|
|
|
|
10 |
|
11 |
+
# print("Status Code:", response.status_code)
|
12 |
+
# print("Response Text:", response.text)
|
13 |
+
|
14 |
+
# sys.exit()
|
15 |
|
16 |
+
post_url = "http://localhost:7680/api/aadhar_ocr"
|
17 |
|
18 |
# response = requests.get(url)
|
19 |
|