Omkar008 committed on
Commit
d244902
·
verified ·
1 Parent(s): 54b757d

Update extract_and_store_supabase.py

Browse files
Files changed (1) hide show
  1. extract_and_store_supabase.py +230 -113
extract_and_store_supabase.py CHANGED
@@ -5,8 +5,8 @@ import requests
5
  from supabase_models import Supabase_Client
6
  from authenticate import get_access_token_v1
7
 
8
- def extract_structure_store_message(user_id:str,message_id:str , attachment_id:str,attachment_extension:str,email:str):
9
 
 
10
  if attachment_id and message_id:
11
  project_id = os.getenv('PROJECT_ID')
12
  processor_id = os.getenv('PROCESSOR_ID')
@@ -15,125 +15,242 @@ def extract_structure_store_message(user_id:str,message_id:str , attachment_id:s
15
  file_name = f"{message_id}_{attachment_id}.{attachment_extension}"
16
  print(f"file_name: {file_name}")
17
  supabase = Supabase_Client().instance
18
- try:
19
- response = supabase.storage.from_("receipt_radar").download(
20
- file_name
21
- )
22
- base64_data = urlsafe_b64encode(response).decode('utf-8')
23
-
24
- payload = {
25
- "skipHumanReview": True,
26
- "rawDocument": {
27
- "mimeType": f"application/{attachment_extension}",
28
- "content": base64_data
29
- }
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- access_token = get_access_token_v1()
33
- print(access_token)
34
 
35
- headers = {
36
- 'Authorization': f'Bearer {access_token}',
37
- 'Content-Type': 'application/json; charset=utf-8'
38
- }
39
 
40
- response = requests.post(
41
- f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
42
- headers=headers,
43
- json=payload
44
- )
45
- response_json = response.json()
46
- allowed_entities = [
47
- "due_date",
48
- "invoice_date",
49
- "total_amount",
50
- "total_tax_amount",
51
- "receiver_name",
52
- "invoice_id",
53
- "currency",
54
- "receiver_address",
55
- "invoice_type",
56
- "supplier_name",
57
- "payment_terms",
58
- "line_item",
59
- "line_item/description",
60
- "line_item/quantity",
61
- "line_item/amount",
62
- "line_item/unit_price"
63
- ]
64
- raw_text = response_json.get('document').get('text' , None)
65
- entities = response_json.get('document').get('entities' , None)
66
- document_entities['user_id'] = user_id
67
- # insert_ocr_data_response = (
68
- # supabase.table("receipt_ocr_data")
69
- # .insert({'user_id':user_id , 'message_id':message_id,'receipt_text':raw_text ,'email':email,'file_type':attachment_extension})
70
- # .execute()
71
- # )
72
 
73
- print('Printing entities')
74
- print(entities)
75
- # if entities is not None:
76
- # for ent in entities:
77
- # if ent.get('type') is not None:
78
- # if ent.get('type') in allowed_entities:
79
- # mention_text = ent.get('mentionText')
80
- # normalised_values = ent.get('normalizedValue') if 'normalizedValue' in ent else None
81
- # document_entities[ent.get('type')] = {"mention_text":mention_text,"normalizedValue":normalised_values}
82
- if entities is not None:
83
- for ent in entities:
84
- if ent.get('type') is not None:
85
- entity_type = ent.get('type') or ""
86
 
87
- # Check if the entity type is in the allowed list
88
- if entity_type in allowed_entities:
89
- mention_text = ent.get('mentionText') or ""
90
- normalized_values = ent.get('normalizedValue') or ""
91
 
92
- # Initialize a list for the entity type if not already present
93
- if entity_type not in document_entities:
94
- document_entities[entity_type] = []
95
 
96
- # Append the entity data to the list
97
- document_entities[entity_type].append({
98
- "mention_text": mention_text,
99
- "normalizedValue": normalized_values
100
- })
101
 
102
- # Handling 'line_item' and its properties (line_item/description, line_item/quantity, etc.)
103
- if entity_type == 'line_item' and 'properties' in ent:
104
- for prop in ent['properties']:
105
- prop_type = prop.get('type') or ""
106
- if prop_type in allowed_entities:
107
- mention_text = prop.get('mentionText') or ""
108
- normalized_values = prop.get('normalizedValue') or ""
109
 
110
- # Initialize a list for the property type if not already present
111
- if prop_type not in document_entities:
112
- document_entities[prop_type] = []
113
 
114
- # Append the property data to the list
115
- document_entities[prop_type].append({
116
- "mention_text": mention_text,
117
- "normalizedValue": normalized_values
118
- })
119
- document_entities['line_item_description'] = document_entities['line_item/description']
120
- document_entities['line_item_quantity'] = document_entities['line_item/quantity']
121
- document_entities['line_item_amount'] = document_entities['line_item/amount']
122
- document_entities['line_item_unit_price'] = document_entities['line_item/unit_price']
123
- document_entities.pop('line_item/description', None)
124
- document_entities.pop('line_item/quantity', None)
125
- document_entities.pop('line_item/amount', None)
126
- document_entities.pop('line_item/unit_price', None)
127
- document_entities['email'] = email
128
- document_entities['message_id'] = message_id
129
- print("Printing parsed json")
130
- print(document_entities)
131
- # insert_data_response = (
132
- # supabase.table("document_ai_entities")
133
- # .insert(document_entities)
134
- # .execute()
135
- # )
136
- # print(insert_data_response)
137
- return document_entities
138
- except Exception as e:
139
- print(f"Error downloading or encoding file: {e}")
 
5
  from supabase_models import Supabase_Client
6
  from authenticate import get_access_token_v1
7
 
 
8
 
9
def extract_structure_store_message(user_id: str, message_id: str, attachment_id: str, attachment_extension: str, email: str):
    """Run a stored attachment through Document AI and group its entities.

    Downloads ``{message_id}_{attachment_id}.{attachment_extension}`` from the
    ``receipt_radar`` Supabase storage bucket, posts it to the ``:process``
    endpoint of the Document AI processor identified by the ``PROJECT_ID`` and
    ``PROCESSOR_ID`` environment variables, and collects the returned entities
    by type. Despite the name, nothing is written to a database here; the
    parsed result is only printed and returned.

    Args:
        user_id: Copied verbatim into the result under ``'user_id'``.
        message_id: Message identifier; first half of the stored file name.
        attachment_id: Attachment identifier; second half of the file name.
        attachment_extension: File extension, also used to build the MIME
            type sent to Document AI (``application/<extension>``).
        email: Copied verbatim into the result under ``'email'``.

    Returns:
        ``None`` when ``attachment_id`` or ``message_id`` is empty. Otherwise
        a dict holding ``user_id``, ``email``, ``message_id``, one list per
        allowed entity type that was found (items have the keys
        ``mention_text`` and ``normalizedValue``), and the four
        ``line_item_description`` / ``line_item_quantity`` /
        ``line_item_amount`` / ``line_item_unit_price`` lists, which are
        always present and empty when the document has no such properties.

    Raises:
        requests.HTTPError: If Document AI answers with an error status.
    """
    if not (attachment_id and message_id):
        return None

    project_id = os.getenv('PROJECT_ID')
    processor_id = os.getenv('PROCESSOR_ID')
    document_entities = {}

    file_name = f"{message_id}_{attachment_id}.{attachment_extension}"
    print(f"file_name: {file_name}")
    supabase = Supabase_Client().instance

    file_bytes = supabase.storage.from_("receipt_radar").download(file_name)
    base64_data = urlsafe_b64encode(file_bytes).decode('utf-8')

    # NOTE(review): "application/<ext>" is only a real MIME type for
    # extensions such as pdf; image attachments would presumably need
    # "image/<ext>" -- confirm which extensions callers pass.
    payload = {
        "skipHumanReview": True,
        "rawDocument": {
            "mimeType": f"application/{attachment_extension}",
            "content": base64_data,
        },
    }

    # The bearer token is a credential: it is deliberately NOT printed.
    access_token = get_access_token_v1()
    headers = {
        'Authorization': f'Bearer {access_token}',
        'Content-Type': 'application/json; charset=utf-8',
    }

    response = requests.post(
        f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
        headers=headers,
        json=payload,
    )
    # Fail with a descriptive HTTP error instead of an AttributeError on a
    # missing 'document' key further down.
    response.raise_for_status()
    response_json = response.json()

    allowed_entities = {
        "due_date",
        "invoice_date",
        "total_amount",
        "total_tax_amount",
        "receiver_name",
        "invoice_id",
        "currency",
        "receiver_address",
        "invoice_type",
        "supplier_name",
        "payment_terms",
        "line_item",
        "line_item/description",
        "line_item/quantity",
        "line_item/amount",
        "line_item/unit_price",
    }
    # Slash-separated property types are renamed to underscore keys in the
    # returned dict; insertion order fixes the order of the output keys.
    line_item_key_names = {
        "line_item/description": "line_item_description",
        "line_item/quantity": "line_item_quantity",
        "line_item/amount": "line_item_amount",
        "line_item/unit_price": "line_item_unit_price",
    }

    document = response_json.get('document') or {}
    entities = document.get('entities', None)
    document_entities['user_id'] = user_id

    print('Printing entities')
    print(entities)

    for ent in entities or []:
        entity_type = ent.get('type') or ""
        if entity_type not in allowed_entities:
            continue

        # Every allowed type maps to a list so repeated entities are kept.
        document_entities.setdefault(entity_type, []).append({
            "mention_text": ent.get('mentionText') or "",
            "normalizedValue": ent.get('normalizedValue') or "",
        })

        # A line_item carries its description/quantity/amount/unit_price as
        # nested properties; collect those under their own type keys.
        if entity_type == 'line_item':
            for prop in ent.get('properties') or []:
                prop_type = prop.get('type') or ""
                if prop_type in allowed_entities:
                    document_entities.setdefault(prop_type, []).append({
                        "mention_text": prop.get('mentionText') or "",
                        "normalizedValue": prop.get('normalizedValue') or "",
                    })

    # pop() with a default keeps documents without line items from raising
    # KeyError and gives the result a stable set of line_item_* keys.
    for source_key, target_key in line_item_key_names.items():
        document_entities[target_key] = document_entities.pop(source_key, [])

    document_entities['email'] = email
    document_entities['message_id'] = message_id
    print("Printing parsed json")
    print(document_entities)

    return document_entities
121
+
122
+
123
+
124
+
125
+ # def extract_structure_store_message(user_id:str,message_id:str , attachment_id:str,attachment_extension:str,email:str):
126
+
127
+ # if attachment_id and message_id:
128
+ # project_id = os.getenv('PROJECT_ID')
129
+ # processor_id = os.getenv('PROCESSOR_ID')
130
+ # document_entities = {}
131
+
132
+ # file_name = f"{message_id}_{attachment_id}.{attachment_extension}"
133
+ # print(f"file_name: {file_name}")
134
+ # supabase = Supabase_Client().instance
135
+ # try:
136
+ # response = supabase.storage.from_("receipt_radar").download(
137
+ # file_name
138
+ # )
139
+ # base64_data = urlsafe_b64encode(response).decode('utf-8')
140
+
141
+ # payload = {
142
+ # "skipHumanReview": True,
143
+ # "rawDocument": {
144
+ # "mimeType": f"application/{attachment_extension}",
145
+ # "content": base64_data
146
+ # }
147
+ # }
148
 
149
+ # access_token = get_access_token_v1()
150
+ # print(access_token)
151
 
152
+ # headers = {
153
+ # 'Authorization': f'Bearer {access_token}',
154
+ # 'Content-Type': 'application/json; charset=utf-8'
155
+ # }
156
 
157
+ # response = requests.post(
158
+ # f'https://us-documentai.googleapis.com/v1/projects/{project_id}/locations/us/processors/{processor_id}:process',
159
+ # headers=headers,
160
+ # json=payload
161
+ # )
162
+ # response_json = response.json()
163
+ # allowed_entities = [
164
+ # "due_date",
165
+ # "invoice_date",
166
+ # "total_amount",
167
+ # "total_tax_amount",
168
+ # "receiver_name",
169
+ # "invoice_id",
170
+ # "currency",
171
+ # "receiver_address",
172
+ # "invoice_type",
173
+ # "supplier_name",
174
+ # "payment_terms",
175
+ # "line_item",
176
+ # "line_item/description",
177
+ # "line_item/quantity",
178
+ # "line_item/amount",
179
+ # "line_item/unit_price"
180
+ # ]
181
+ # raw_text = response_json.get('document').get('text' , None)
182
+ # entities = response_json.get('document').get('entities' , None)
183
+ # document_entities['user_id'] = user_id
184
+ # # insert_ocr_data_response = (
185
+ # # supabase.table("receipt_ocr_data")
186
+ # # .insert({'user_id':user_id , 'message_id':message_id,'receipt_text':raw_text ,'email':email,'file_type':attachment_extension})
187
+ # # .execute()
188
+ # # )
189
 
190
+ # print('Printing entities')
191
+ # print(entities)
192
+ # # if entities is not None:
193
+ # # for ent in entities:
194
+ # # if ent.get('type') is not None:
195
+ # # if ent.get('type') in allowed_entities:
196
+ # # mention_text = ent.get('mentionText')
197
+ # # normalised_values = ent.get('normalizedValue') if 'normalizedValue' in ent else None
198
+ # # document_entities[ent.get('type')] = {"mention_text":mention_text,"normalizedValue":normalised_values}
199
+ # if entities is not None:
200
+ # for ent in entities:
201
+ # if ent.get('type') is not None:
202
+ # entity_type = ent.get('type') or ""
203
 
204
+ # # Check if the entity type is in the allowed list
205
+ # if entity_type in allowed_entities:
206
+ # mention_text = ent.get('mentionText') or ""
207
+ # normalized_values = ent.get('normalizedValue') or ""
208
 
209
+ # # Initialize a list for the entity type if not already present
210
+ # if entity_type not in document_entities:
211
+ # document_entities[entity_type] = []
212
 
213
+ # # Append the entity data to the list
214
+ # document_entities[entity_type].append({
215
+ # "mention_text": mention_text,
216
+ # "normalizedValue": normalized_values
217
+ # })
218
 
219
+ # # Handling 'line_item' and its properties (line_item/description, line_item/quantity, etc.)
220
+ # if entity_type == 'line_item' and 'properties' in ent:
221
+ # for prop in ent['properties']:
222
+ # prop_type = prop.get('type') or ""
223
+ # if prop_type in allowed_entities:
224
+ # mention_text = prop.get('mentionText') or ""
225
+ # normalized_values = prop.get('normalizedValue') or ""
226
 
227
+ # # Initialize a list for the property type if not already present
228
+ # if prop_type not in document_entities:
229
+ # document_entities[prop_type] = []
230
 
231
+ # # Append the property data to the list
232
+ # document_entities[prop_type].append({
233
+ # "mention_text": mention_text,
234
+ # "normalizedValue": normalized_values
235
+ # })
236
+ # document_entities['line_item_description'] = document_entities['line_item/description']
237
+ # document_entities['line_item_quantity'] = document_entities['line_item/quantity']
238
+ # document_entities['line_item_amount'] = document_entities['line_item/amount']
239
+ # document_entities['line_item_unit_price'] = document_entities['line_item/unit_price']
240
+ # document_entities.pop('line_item/description', None)
241
+ # document_entities.pop('line_item/quantity', None)
242
+ # document_entities.pop('line_item/amount', None)
243
+ # document_entities.pop('line_item/unit_price', None)
244
+ # document_entities['email'] = email
245
+ # document_entities['message_id'] = message_id
246
+ # print("Printing parsed json")
247
+ # print(document_entities)
248
+ # # insert_data_response = (
249
+ # # supabase.table("document_ai_entities")
250
+ # # .insert(document_entities)
251
+ # # .execute()
252
+ # # )
253
+ # # print(insert_data_response)
254
+ # return document_entities
255
+ # except Exception as e:
256
+ # print(f"Error downloading or encoding file: {e}")