AuditEdge committed
Commit 3177dbb · 1 Parent(s): e709d2a

added other api

Files changed (1)
  1. app.py +472 -141
app.py CHANGED
@@ -1,17 +1,348 @@
  from fastapi import FastAPI, File, UploadFile, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from typing import Dict
  import os
  import shutil
  import logging
  from s3_setup import s3_client
-
- import torch
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
-
  from dotenv import load_dotenv
- import os
-
  from utils import doc_processing

  # Load .env file
@@ -21,7 +352,6 @@ load_dotenv()
  dummy_key = os.getenv("dummy_key")
  HUGGINGFACE_AUTH_TOKEN = dummy_key

-
  # Hugging Face model and token
  aadhar_model = "AuditEdge/doc_ocr_a" # Replace with your fine-tuned model if applicable
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -29,12 +359,10 @@ print(f"Using device: {device}")

  # Load the processor (tokenizer + image processor)
  processor_aadhar = LayoutLMv3Processor.from_pretrained(
-     aadhar_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  aadhar_model = LayoutLMv3ForTokenClassification.from_pretrained(
-     aadhar_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )


@@ -46,57 +374,50 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")


-
  # Load the processor (tokenizer + image processor)
  processor_pan = LayoutLMv3Processor.from_pretrained(
-     pan_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  pan_model = LayoutLMv3ForTokenClassification.from_pretrained(
-     pan_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  pan_model = pan_model.to(device)

  #
  # gst model
- gst_model = "AuditEdge/doc_ocr_new_g" # Replace with your fine-tuned model if applicable
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")

  # Load the processor (tokenizer + image processor)
  processor_gst = LayoutLMv3Processor.from_pretrained(
-     gst_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  gst_model = LayoutLMv3ForTokenClassification.from_pretrained(
-     gst_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  gst_model = gst_model.to(device)

- #cheque model

- cheque_model = "AuditEdge/doc_ocr_new_c" # Replace with your fine-tuned model if applicable
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")

  # Load the processor (tokenizer + image processor)
  processor_cheque = LayoutLMv3Processor.from_pretrained(
-     cheque_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  cheque_model = LayoutLMv3ForTokenClassification.from_pretrained(
-     cheque_model,
-     use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  cheque_model = cheque_model.to(device)


-
-
-
-
  # Verify model and processor are loaded
  print("Model and processor loaded successfully!")
  print(f"Model is on device: {next(aadhar_model.parameters()).device}")
@@ -119,140 +440,95 @@ app.add_middleware(
  )

  # Configure directories
- UPLOAD_FOLDER = './uploads/'
  processing_folder = "./processed_images"
  os.makedirs(UPLOAD_FOLDER, exist_ok=True) # Ensure the main upload folder exists
- os.makedirs(processing_folder,exist_ok=True)

  UPLOAD_DIRS = {
-     "aadhar_file": "uploads/aadhar/",
      "pan_file": "uploads/pan/",
-     "cheque_file": "uploads/cheque/",
      "gst_file": "uploads/gst/",
  }

  process_dirs = {
      "aadhar_file": "processed_images/aadhar/",
      "pan_file": "processed_images/pan/",
      "cheque_file": "processed_images/cheque/",
      "gst_file": "processed_images/gst/",
-
  }

  # Ensure individual directories exist
  for dir_path in UPLOAD_DIRS.values():
      os.makedirs(dir_path, exist_ok=True)
-
  for dir_path in process_dirs.values():
      os.makedirs(dir_path, exist_ok=True)
-
-

  # Logger configuration
  logging.basicConfig(level=logging.INFO)

- # Perform Inference
- def perform_inference(file_paths: Dict[str, str]):
-     # Dictionary to map document types to their respective model directories
      model_dirs = {
-         "aadhar_file": aadhar_model,
          "pan_file": pan_model,
-         "cheque_file": cheque_model,
          "gst_file": gst_model,
      }
-     try:
-         # Dictionary to store results for each document type
          inference_results = {}

-         # Loop through the file paths and perform inference
          for doc_type, file_path in file_paths.items():
              if doc_type in model_dirs:
                  print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")

-                 # Prepare batch for inference
                  processed_file_p = file_path.split("&&")[0]
                  unprocessed_file_path = file_path.split("&&")[1]
-
                  images_path = [processed_file_p]
                  inference_batch = prepare_batch_for_inference(images_path)

-                 # Prepare context for the specific document type
-                 # context = {"model_dir": model_dirs[doc_type]}
-                 #initialize s3 client
-                 client = s3_client()
-
-                 local_file_path= unprocessed_file_path
-                 bucket_name = "edgekycdocs"
-
-                 file_name = unprocessed_file_path.split("/")[-1]
-
-
-
-
-                 # context = aadhar_model
-                 if doc_type == "aadhar_file":
-                     context = aadhar_model
-                     processor = processor_aadhar
-                     name = "aadhar"
-                     attachemnt_num = 3
-                     folder_name = "aadhardocs"
-
-
-                 if doc_type == "pan_file":
-                     context = pan_model
-                     processor = processor_pan
-                     name = "pan"
-                     attachemnt_num = 2
-                     folder_name = "pandocs"
-
-                 if doc_type == "gst_file":
-                     context = gst_model
-                     processor = processor_gst
-                     name = "gst"
-                     attachemnt_num = 4
-                     folder_name = "gstdocs"
-
-                 if doc_type == "cheque_file":
-                     context = cheque_model
-                     processor = processor_cheque
-                     name = "cheque"
-                     attachemnt_num = 8
-                     folder_name = "bankchequedocs"
-
-
-
-                 # upload the document to s3 bucket here
-
-
-                 print("this is folder name",folder_name)
-
-                 response = client.upload_file(local_file_path,bucket_name,folder_name,file_name)
-
-                 print("The file has been uploaded to s3 bucket",response)
-
-
-                 # Perform inference (replace `handle` with your actual function)
-                 result = handle(inference_batch, context,processor,name)
-                 # result["attachment_url": response["url"]]
-                 result["attachment_url"] = response["url"]
                  result["detect"] = True

-                 print("result required",result)
-
-                 # if result[""]
-
-                 # Store the result
-                 inference_results["attachment_{}".format(attachemnt_num)] = result
              else:
                  print(f"Model directory not found for {doc_type}. Skipping.")
-                 # print(Javed)

-         return inference_results
      except:
-         return {
-             "status": "error",
-             "message": "Text extraction failed."
-         }


  # Routes
@@ -260,15 +536,19 @@ def perform_inference(file_paths: Dict[str, str]):
  def greet_json():
      return {"Hello": "World!"}

  @app.post("/api/aadhar_ocr")
  async def aadhar_ocr(
      aadhar_file: UploadFile = File(None),
      pan_file: UploadFile = File(None),
      cheque_file: UploadFile = File(None),
      gst_file: UploadFile = File(None),
  ):
      # try:
-     # Handle file uploads
      file_paths = {}
      for file_type, folder in UPLOAD_DIRS.items():
          file = locals()[file_type] # Dynamically access the file arguments
@@ -276,15 +556,15 @@ async def aadhar_ocr(
              # Save the file in the respective directory
              file_path = os.path.join(folder, file.filename)

-             print("this is the filename",file.filename)
              with open(file_path, "wb") as buffer:
                  shutil.copyfileobj(file.file, buffer)
              file_paths[file_type] = file_path

      # Log received files
      logging.info(f"Received files: {list(file_paths.keys())}")
-     print("file_paths",file_paths)
-
      files = {}
      for key, value in file_paths.items():
          name = value.split("/")[-1].split(".")[0]
@@ -292,36 +572,87 @@ async def aadhar_ocr(
          doc_type = value.split("/")[-1].split(".")[-1]
          f_path = value

-         print("variables required",name,id_type,doc_type,f_path)
-         preprocessing = doc_processing(name,id_type,doc_type,f_path)
          response = preprocessing.process()

-         print("response after preprocessing",response)

          files[key] = response["output_p"] + "&&" + f_path
          # files["unprocessed_file_path"] = f_path
-         print("response",response)

-
      # Perform inference
-     result = perform_inference(files)

-     print("this is the result we got",result)
      if "status" in list(result.keys()):
          raise Exception("Custom error message")
      # if result["status"] == "error":
-
-

      return {"status": "success", "result": result}


-     # except Exception as e:
-     #     logging.error(f"Error processing files: {e}")
-     #     # raise HTTPException(status_code=500, detail="Internal Server Error")
-     #     return {
-     #         "status": 400,
-     #         "message": "Text extraction failed."
-     #     }
-
-
+ # from fastapi import FastAPI, File, UploadFile, HTTPException
+ # from fastapi.middleware.cors import CORSMiddleware
+ # from typing import Dict
+ # import os
+ # import shutil
+ # import logging
+ # from s3_setup import s3_client
+
+ # import torch
+ # from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
+
+ # from dotenv import load_dotenv
+ # import os
+
+ # from utils import doc_processing
+
+ # # Load .env file
+ # load_dotenv()
+
+ # # Access variables
+ # dummy_key = os.getenv("dummy_key")
+ # HUGGINGFACE_AUTH_TOKEN = dummy_key
+
+
+ # # Hugging Face model and token
+ # aadhar_model = "AuditEdge/doc_ocr_a" # Replace with your fine-tuned model if applicable
+ # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # print(f"Using device: {device}")
+
+ # # Load the processor (tokenizer + image processor)
+ # processor_aadhar = LayoutLMv3Processor.from_pretrained(
+ #     aadhar_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+ # aadhar_model = LayoutLMv3ForTokenClassification.from_pretrained(
+ #     aadhar_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+
+
+ # aadhar_model = aadhar_model.to(device)
+
+ # # pan model
+ # pan_model = "AuditEdge/doc_ocr_p" # Replace with your fine-tuned model if applicable
+ # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # print(f"Using device: {device}")
+
+
+
+ # # Load the processor (tokenizer + image processor)
+ # processor_pan = LayoutLMv3Processor.from_pretrained(
+ #     pan_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+ # pan_model = LayoutLMv3ForTokenClassification.from_pretrained(
+ #     pan_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+ # pan_model = pan_model.to(device)
+
+ # #
+ # # gst model
+ # gst_model = "AuditEdge/doc_ocr_new_g" # Replace with your fine-tuned model if applicable
+ # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # print(f"Using device: {device}")
+
+ # # Load the processor (tokenizer + image processor)
+ # processor_gst = LayoutLMv3Processor.from_pretrained(
+ #     gst_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+ # gst_model = LayoutLMv3ForTokenClassification.from_pretrained(
+ #     gst_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+ # gst_model = gst_model.to(device)
+
+ # #cheque model
+
+ # cheque_model = "AuditEdge/doc_ocr_new_c" # Replace with your fine-tuned model if applicable
+ # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # print(f"Using device: {device}")
+
+ # # Load the processor (tokenizer + image processor)
+ # processor_cheque = LayoutLMv3Processor.from_pretrained(
+ #     cheque_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+ # cheque_model = LayoutLMv3ForTokenClassification.from_pretrained(
+ #     cheque_model,
+ #     use_auth_token=HUGGINGFACE_AUTH_TOKEN
+ # )
+ # cheque_model = cheque_model.to(device)
+
+
+
+
+
+
+ # # Verify model and processor are loaded
+ # print("Model and processor loaded successfully!")
+ # print(f"Model is on device: {next(aadhar_model.parameters()).device}")
+
+
+ # # Import inference modules
+ # from layoutlmv3FineTuning.Layoutlm_inference.ocr import prepare_batch_for_inference
+ # from layoutlmv3FineTuning.Layoutlm_inference.inference_handler import handle
+
+ # # Create FastAPI instance
+ # app = FastAPI(debug=True)
+
+ # # Enable CORS
+ # app.add_middleware(
+ #     CORSMiddleware,
+ #     allow_origins=["*"],
+ #     allow_credentials=True,
+ #     allow_methods=["*"],
+ #     allow_headers=["*"],
+ # )
+
+ # # Configure directories
+ # UPLOAD_FOLDER = './uploads/'
+ # processing_folder = "./processed_images"
+ # os.makedirs(UPLOAD_FOLDER, exist_ok=True) # Ensure the main upload folder exists
+ # os.makedirs(processing_folder,exist_ok=True)
+
+ # UPLOAD_DIRS = {
+ #     "aadhar_file": "uploads/aadhar/",
+ #     "pan_file": "uploads/pan/",
+ #     "cheque_file": "uploads/cheque/",
+ #     "gst_file": "uploads/gst/",
+ # }
+
+ # process_dirs = {
+ #     "aadhar_file": "processed_images/aadhar/",
+ #     "pan_file": "processed_images/pan/",
+ #     "cheque_file": "processed_images/cheque/",
+ #     "gst_file": "processed_images/gst/",
+
+ # }
+
+ # # Ensure individual directories exist
+ # for dir_path in UPLOAD_DIRS.values():
+ #     os.makedirs(dir_path, exist_ok=True)
+
+ # for dir_path in process_dirs.values():
+ #     os.makedirs(dir_path, exist_ok=True)
+
+
+
+ # # Logger configuration
+ # logging.basicConfig(level=logging.INFO)
+
+ # # Perform Inference
+ # def perform_inference(file_paths: Dict[str, str]):
+ #     # Dictionary to map document types to their respective model directories
+ #     model_dirs = {
+ #         "aadhar_file": aadhar_model,
+ #         "pan_file": pan_model,
+ #         "cheque_file": cheque_model,
+ #         "gst_file": gst_model,
+ #     }
+ #     try:
+ #         # Dictionary to store results for each document type
+ #         inference_results = {}
+
+ #         # Loop through the file paths and perform inference
+ #         for doc_type, file_path in file_paths.items():
+ #             if doc_type in model_dirs:
+ #                 print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
+
+ #                 # Prepare batch for inference
+ #                 processed_file_p = file_path.split("&&")[0]
+ #                 unprocessed_file_path = file_path.split("&&")[1]
+
+ #                 images_path = [processed_file_p]
+ #                 inference_batch = prepare_batch_for_inference(images_path)
+
+ #                 # Prepare context for the specific document type
+ #                 # context = {"model_dir": model_dirs[doc_type]}
+ #                 #initialize s3 client
+ #                 client = s3_client()
+
+ #                 local_file_path= unprocessed_file_path
+ #                 bucket_name = "edgekycdocs"
+
+ #                 file_name = unprocessed_file_path.split("/")[-1]
+
+
+
+
+ #                 # context = aadhar_model
+ #                 if doc_type == "aadhar_file":
+ #                     context = aadhar_model
+ #                     processor = processor_aadhar
+ #                     name = "aadhar"
+ #                     attachemnt_num = 3
+ #                     folder_name = "aadhardocs"
+
+
+ #                 if doc_type == "pan_file":
+ #                     context = pan_model
+ #                     processor = processor_pan
+ #                     name = "pan"
+ #                     attachemnt_num = 2
+ #                     folder_name = "pandocs"
+
+ #                 if doc_type == "gst_file":
+ #                     context = gst_model
+ #                     processor = processor_gst
+ #                     name = "gst"
+ #                     attachemnt_num = 4
+ #                     folder_name = "gstdocs"
+
+ #                 if doc_type == "cheque_file":
+ #                     context = cheque_model
+ #                     processor = processor_cheque
+ #                     name = "cheque"
+ #                     attachemnt_num = 8
+ #                     folder_name = "bankchequedocs"
+
+
+
+ #                 # upload the document to s3 bucket here
+
+
+ #                 print("this is folder name",folder_name)
+
+ #                 response = client.upload_file(local_file_path,bucket_name,folder_name,file_name)
+
+ #                 print("The file has been uploaded to s3 bucket",response)
+
+
+ #                 # Perform inference (replace `handle` with your actual function)
+ #                 result = handle(inference_batch, context,processor,name)
+ #                 # result["attachment_url": response["url"]]
+ #                 result["attachment_url"] = response["url"]
+ #                 result["detect"] = True
+
+ #                 print("result required",result)
+
+ #                 # if result[""]
+
+ #                 # Store the result
+ #                 inference_results["attachment_{}".format(attachemnt_num)] = result
+ #             else:
+ #                 print(f"Model directory not found for {doc_type}. Skipping.")
+ #                 # print(Javed)
+
+ #         return inference_results
+ #     except:
+ #         return {
+ #             "status": "error",
+ #             "message": "Text extraction failed."
+ #         }
+
+
+ # # Routes
+ # @app.get("/")
+ # def greet_json():
+ #     return {"Hello": "World!"}
+
+ # @app.post("/api/aadhar_ocr")
+ # async def aadhar_ocr(
+ #     aadhar_file: UploadFile = File(None),
+ #     pan_file: UploadFile = File(None),
+ #     cheque_file: UploadFile = File(None),
+ #     gst_file: UploadFile = File(None),
+ # ):
+ #     # try:
+ #     # Handle file uploads
+ #     file_paths = {}
+ #     for file_type, folder in UPLOAD_DIRS.items():
+ #         file = locals()[file_type] # Dynamically access the file arguments
+ #         if file:
+ #             # Save the file in the respective directory
+ #             file_path = os.path.join(folder, file.filename)
+
+ #             print("this is the filename",file.filename)
+ #             with open(file_path, "wb") as buffer:
+ #                 shutil.copyfileobj(file.file, buffer)
+ #             file_paths[file_type] = file_path
+
+ #     # Log received files
+ #     logging.info(f"Received files: {list(file_paths.keys())}")
+ #     print("file_paths",file_paths)
+
+ #     files = {}
+ #     for key, value in file_paths.items():
+ #         name = value.split("/")[-1].split(".")[0]
+ #         id_type = key.split("_")[0]
+ #         doc_type = value.split("/")[-1].split(".")[-1]
+ #         f_path = value
+
+ #         print("variables required",name,id_type,doc_type,f_path)
+ #         preprocessing = doc_processing(name,id_type,doc_type,f_path)
+ #         response = preprocessing.process()
+
+ #         print("response after preprocessing",response)
+
+ #         files[key] = response["output_p"] + "&&" + f_path
+ #         # files["unprocessed_file_path"] = f_path
+ #         print("response",response)
+
+
+ #     # Perform inference
+ #     result = perform_inference(files)
+
+ #     print("this is the result we got",result)
+ #     if "status" in list(result.keys()):
+ #         raise Exception("Custom error message")
+ #     # if result["status"] == "error":
+
+
+
+ #     return {"status": "success", "result": result}
+
+
+ #     # except Exception as e:
+ #     #     logging.error(f"Error processing files: {e}")
+ #     #     # raise HTTPException(status_code=500, detail="Internal Server Error")
+ #     #     return {
+ #     #         "status": 400,
+ #     #         "message": "Text extraction failed."
+ #     #     }
+
+
+
+
+
+
+
  from fastapi import FastAPI, File, UploadFile, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from typing import Dict
  import os
  import shutil
+ import torch
  import logging
  from s3_setup import s3_client
+ import requests
+ from fastapi import FastAPI, HTTPException, Request
  from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
  from dotenv import load_dotenv
+ import urllib.parse
  from utils import doc_processing

  # Load .env file

  dummy_key = os.getenv("dummy_key")
  HUGGINGFACE_AUTH_TOKEN = dummy_key

  # Hugging Face model and token
  aadhar_model = "AuditEdge/doc_ocr_a" # Replace with your fine-tuned model if applicable
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # Load the processor (tokenizer + image processor)
  processor_aadhar = LayoutLMv3Processor.from_pretrained(
+     aadhar_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  aadhar_model = LayoutLMv3ForTokenClassification.from_pretrained(
+     aadhar_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )


  print(f"Using device: {device}")


  # Load the processor (tokenizer + image processor)
  processor_pan = LayoutLMv3Processor.from_pretrained(
+     pan_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  pan_model = LayoutLMv3ForTokenClassification.from_pretrained(
+     pan_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  pan_model = pan_model.to(device)

  #
  # gst model
+ gst_model = (
+     "AuditEdge/doc_ocr_new_g"  # Replace with your fine-tuned model if applicable
+ )
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")

  # Load the processor (tokenizer + image processor)
  processor_gst = LayoutLMv3Processor.from_pretrained(
+     gst_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  gst_model = LayoutLMv3ForTokenClassification.from_pretrained(
+     gst_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  gst_model = gst_model.to(device)

+ # cheque model

+ cheque_model = (
+     "AuditEdge/doc_ocr_new_c"  # Replace with your fine-tuned model if applicable
+ )
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Using device: {device}")

  # Load the processor (tokenizer + image processor)
  processor_cheque = LayoutLMv3Processor.from_pretrained(
+     cheque_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  cheque_model = LayoutLMv3ForTokenClassification.from_pretrained(
+     cheque_model, use_auth_token=HUGGINGFACE_AUTH_TOKEN
  )
  cheque_model = cheque_model.to(device)


  # Verify model and processor are loaded
  print("Model and processor loaded successfully!")
  print(f"Model is on device: {next(aadhar_model.parameters()).device}")

  )

  # Configure directories
+ UPLOAD_FOLDER = "./uploads/"
  processing_folder = "./processed_images"
  os.makedirs(UPLOAD_FOLDER, exist_ok=True) # Ensure the main upload folder exists
+ os.makedirs(processing_folder, exist_ok=True)
+

  UPLOAD_DIRS = {
      "pan_file": "uploads/pan/",
+     "aadhar_file": "uploads/aadhar/",
      "gst_file": "uploads/gst/",
+     "msme_file": "uploads/msme/",
+     "cin_llpin_file": "uploads/cin_llpin/",
+     "cheque_file": "uploads/cheque/",
  }

+
  process_dirs = {
      "aadhar_file": "processed_images/aadhar/",
      "pan_file": "processed_images/pan/",
      "cheque_file": "processed_images/cheque/",
      "gst_file": "processed_images/gst/",
  }

  # Ensure individual directories exist
  for dir_path in UPLOAD_DIRS.values():
      os.makedirs(dir_path, exist_ok=True)
+
  for dir_path in process_dirs.values():
      os.makedirs(dir_path, exist_ok=True)
+

  # Logger configuration
  logging.basicConfig(level=logging.INFO)

+
+ # Perform Inference with optional S3 upload
+ def perform_inference(file_paths: Dict[str, str], upload_to_s3: bool):
      model_dirs = {
          "pan_file": pan_model,
          "gst_file": gst_model,
+         "cheque_file": cheque_model,
      }
+     try:
          inference_results = {}

          for doc_type, file_path in file_paths.items():
              if doc_type in model_dirs:
                  print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")

                  processed_file_p = file_path.split("&&")[0]
                  unprocessed_file_path = file_path.split("&&")[1]
                  images_path = [processed_file_p]
                  inference_batch = prepare_batch_for_inference(images_path)

+                 context = model_dirs[doc_type]
+                 processor = globals()[f"processor_{doc_type.split('_')[0]}"]
+                 name = doc_type.split("_")[0]
+                 attachemnt_num = {
+                     "pan_file": 2,
+                     "gst_file": 4,
+                     "msme_file": 5,
+                     "cin_llpin_file": 6,
+                     "cheque_file": 8,
+                 }[doc_type]
+
+                 if upload_to_s3:
+                     client = s3_client()
+                     bucket_name = "edgekycdocs"
+                     folder_name = f"{name}docs"
+                     file_name = unprocessed_file_path.split("/")[-1]
+                     response = client.upload_file(
+                         unprocessed_file_path, bucket_name, folder_name, file_name
+                     )
+                     print("The file has been uploaded to S3 bucket", response)
+                     attachment_url = response["url"]
+                 else:
+                     attachment_url = None
+
+                 result = handle(inference_batch, context, processor, name)
+                 result["attachment_url"] = attachment_url
                  result["detect"] = True

+                 inference_results[f"attachment_{attachemnt_num}"] = result
              else:
                  print(f"Model directory not found for {doc_type}. Skipping.")

+         return inference_results
      except:
+         return {"status": "error", "message": "Text extraction failed."}


  # Routes

  def greet_json():
      return {"Hello": "World!"}

+
  @app.post("/api/aadhar_ocr")
  async def aadhar_ocr(
      aadhar_file: UploadFile = File(None),
      pan_file: UploadFile = File(None),
      cheque_file: UploadFile = File(None),
      gst_file: UploadFile = File(None),
+     msme_file: UploadFile = File(None),
+     cin_llpin_file: UploadFile = File(None),
+     upload_to_s3: bool = True,
  ):
      # try:
+     # Handle file uploads
      file_paths = {}
      for file_type, folder in UPLOAD_DIRS.items():
          file = locals()[file_type] # Dynamically access the file arguments

              # Save the file in the respective directory
              file_path = os.path.join(folder, file.filename)

+             print("this is the filename", file.filename)
              with open(file_path, "wb") as buffer:
                  shutil.copyfileobj(file.file, buffer)
              file_paths[file_type] = file_path

      # Log received files
      logging.info(f"Received files: {list(file_paths.keys())}")
+     print("file_paths", file_paths)
+
      files = {}
      for key, value in file_paths.items():
          name = value.split("/")[-1].split(".")[0]

          doc_type = value.split("/")[-1].split(".")[-1]
          f_path = value

+         print("variables required", name, id_type, doc_type, f_path)
+         preprocessing = doc_processing(name, id_type, doc_type, f_path)
          response = preprocessing.process()

+         print("response after preprocessing", response)

          files[key] = response["output_p"] + "&&" + f_path
          # files["unprocessed_file_path"] = f_path
+         print("response", response)

      # Perform inference
+     result = perform_inference(files, upload_to_s3)

+     print("this is the result we got", result)
      if "status" in list(result.keys()):
          raise Exception("Custom error message")
      # if result["status"] == "error":

      return {"status": "success", "result": result}


+ @app.post("/api/document_ocr")
+ async def document_ocr_s3(request: Request):
+     try:
+         body = await request.json()  # Read JSON body
+         logging.info(f"Received request body: {body}")
+     except Exception as e:
+         logging.error(f"Failed to parse JSON request: {e}")
+         raise HTTPException(status_code=400, detail="Invalid JSON payload")
+
+     # Extract file URLs
+     url_mapping = {
+         "pan_file": body.get("pan_file"),
+         "gst_file": body.get("gst_file"),
+         "msme_file": body.get("msme_file"),
+         "cin_llpin_file": body.get("cin_llpin_file"),
+         "cheque_file": body.get("cheque_file"),
+     }
+     upload_to_s3 = body.get("upload_to_s3", False)
+     logging.info(f"URL Mapping: {url_mapping}")
+     file_paths = {}
+     for file_type, url in url_mapping.items():
+         if url:
+             # local_filename = url.split("/")[-1]
+             local_filename = urllib.parse.unquote(url.split("/")[-1]).replace(" ", "_")
+             file_path = os.path.join(UPLOAD_DIRS[file_type], local_filename)
+
+             try:
+                 logging.info(f"Attempting to download {url} for {file_type}...")
+                 response = requests.get(url, stream=True)
+                 response.raise_for_status()
+
+                 with open(file_path, "wb") as buffer:
+                     shutil.copyfileobj(response.raw, buffer)
+
+                 file_paths[file_type] = file_path
+                 logging.info(f"Successfully downloaded {file_type} to {file_path}")
+
+             except requests.exceptions.RequestException as e:
+                 logging.error(f"Failed to download {url}: {e}")
+                 raise HTTPException(
+                     status_code=400, detail=f"Failed to download file from {url}"
+                 )
+
+     logging.info(f"Downloaded files: {list(file_paths.keys())}")
+
+     files = {}
+     for key, value in file_paths.items():
+         name = value.split("/")[-1].split(".")[0]
+         id_type = key.split("_")[0]
+         doc_type = value.split("/")[-1].split(".")[-1]
+         f_path = value
+
+         preprocessing = doc_processing(name, id_type, doc_type, f_path)
+         response = preprocessing.process()
+
+         files[key] = response["output_p"] + "&&" + f_path
+
+     result = perform_inference(files, upload_to_s3)
+
+     if "status" in list(result.keys()):
+         raise HTTPException(status_code=500, detail="Custom error message")
+
+     return {"status": "success", "result": result}
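
Below is a minimal client-side sketch of how the two endpoints touched by this commit could be called. It is illustrative only: the base URL, file names, and document URLs are assumptions, not part of the commit; the endpoint paths, JSON field names, and the upload_to_s3 flag come from the diff above.

# client_example.py — hypothetical client, not part of this repo
import requests

BASE_URL = "http://localhost:7860"  # assumption: adjust to wherever app.py is served

# New /api/document_ocr endpoint: documents are pulled from URLs in a JSON body;
# upload_to_s3 (defaults to False on this route) controls the S3 re-upload branch.
payload = {
    "pan_file": "https://example.com/docs/pan.jpg",  # placeholder URL
    "gst_file": "https://example.com/docs/gst.jpg",  # placeholder URL
    "upload_to_s3": False,
}
resp = requests.post(f"{BASE_URL}/api/document_ocr", json=payload)
resp.raise_for_status()
print(resp.json())  # expected shape: {"status": "success", "result": {"attachment_2": {...}, ...}}

# Existing /api/aadhar_ocr endpoint, now accepting msme_file/cin_llpin_file uploads.
# upload_to_s3 is a plain bool in the signature, so FastAPI reads it from the query string.
with open("pan.jpg", "rb") as f:  # placeholder local file
    resp = requests.post(
        f"{BASE_URL}/api/aadhar_ocr",
        files={"pan_file": ("pan.jpg", f, "image/jpeg")},
        params={"upload_to_s3": "false"},
    )
print(resp.json())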