Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdf2image import convert_from_path
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
from google.colab.patches import cv2_imshow
|
5 |
+
import numpy as np
|
6 |
+
from PIL import Image
|
7 |
+
import json
|
8 |
+
from anthropic import Anthropic, Client
|
9 |
+
import gradio as gr
|
10 |
+
|
11 |
+
## Process pdf
|
12 |
+
|
13 |
+
def convert_pdf_to_image(pdf_path, dpi=400):
    """Rasterize every page of a PDF and save each page as a PNG file.

    Each page is written to the current working directory as
    ``page_<index>.png`` (zero-based index), overwriting any existing file
    with the same name.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
            Defaults to 400, the value that used to be hard-coded here.

    Returns:
        The sequence of page images returned by ``convert_from_path``.
    """
    pages = convert_from_path(pdf_path, dpi=dpi)

    # Persist the pages so later steps can load them by file name.
    for index, page in enumerate(pages):
        page.save(f'page_{index}.png', 'PNG')

    print(f"Converted {len(pages)} pages to images.")
    return pages
|
23 |
+
|
24 |
+
|
25 |
+
## Image process Subprocess - De-stamp
|
26 |
+
def destamp_image(img_path):
    """Keep only the dark pixels of an image and save the result as a PNG.

    The image is converted to HSV and every pixel whose V (brightness)
    channel is at most 120 is kept as black; all other pixels become white.
    The single-channel result is written to ``result_img_0.png`` in the
    current working directory.

    Args:
        img_path: Path of the image file to process.

    Returns:
        The path of the written result file (``"result_img_0.png"``).

    Raises:
        OSError: If ``cv2.imread`` could not load ``img_path``.
    """
    bgr_img = cv2.imread(img_path)
    if bgr_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {img_path}")
    hsv_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2HSV)

    # HSV range selected as "black" text:
    #   H: 0-180, S: 0-255, V: 0-120
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([180, 255, 120])
    mask = cv2.inRange(hsv_img, lower_black, upper_black)

    # inRange marks matching pixels with 255; invert so that the dark pixels
    # end up black (0) on a white (255) background.
    destamped_img = ~mask  # single-channel image

    _, threshold_img = cv2.threshold(
        destamped_img, 120, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )

    # Add an explicit channel axis: (height, width) -> (height, width, 1).
    result_img = threshold_img.reshape(
        threshold_img.shape[0], threshold_img.shape[1], 1
    )
    print(f"result_img.shape: {result_img.shape}")

    result_filepath = "result_img_0.png"
    cv2.imwrite(result_filepath, result_img)

    return result_filepath
|
66 |
+
|
67 |
+
|
68 |
+
def extract_image_table(image_path):
    """Extract the table in an image and return it as parsed JSON data.

    Sends the image through ``extract_table_info``, prints diagnostics via
    ``check_response``, then parses the reply with ``extract_json``.

    Args:
        image_path: Path of the image containing the table.

    Returns:
        Whatever ``extract_json`` returns for the API response.
    """
    api_response = extract_table_info(image_path)

    # Diagnostic output only; the return value is not used.
    check_response(api_response)

    table_json = extract_json(api_response)
    print(f"json_data: {table_json}")
    return table_json
|
80 |
+
|
81 |
+
|
82 |
+
## Extract Table Information
|
83 |
+
# Prompt sent together with the image. The wording is part of the program's
# behaviour: it defines the <json>...</json> envelope that extract_json parses.
_TABLE_EXTRACTION_PROMPT = """
Please extract the table information of the image, keep the context in Traditional Chinese without translation.
if you can not recognize the value precisely, please infer it and try to make a best guess.
If you can not make the best guess, please return “UNK”.
Create a structured set of data in json format providing key information about a table.
Keep the section titles in the table as a parts of json.
Be sure to extract the information of "代碼", and save them as part of json.
All the value extracted are string, including the "代碼".
Do not do any sort operation with all the rows.
Extract the text information of each cell precisely. Do not make inference between "代碼" and "項目" if you can not extract it precisely.
JSON fields must be labelled as:
Example json structure is:
<json>
{
  "table meta": [
    {"企業名稱": },
    {"表頭名稱": },
    {"報表日期": },
    {"幣別": },
    ...
    ...
    ...
  ],
  "table detail": [
    {
      ...
      ...
      ...
    },
    {
      ...
      ...
      ...
    },
    ...
    ...
    ...
  ]
}

</json>


Output the json structure as a string starting with <json> and ending with </json> XML tags.
Do not return any narrative language. Look at the images in detail.
Do not insert and control code, like line feed, tab indent: "\n"

IF YOU COULD NOT FIND THE RIGHT INFORMATION JUST RETURN THIS VALUE “UNK”.

Example:

<json>
{
  "table meta": [
    {"企業名稱": "台灣水泥股份有限公司"},
    {"表頭名稱": "個體資產負債表"},
    {"報表日期": "民國 112 年及 111 年 12 月 31 日"},
    {"幣別": "新台幣仟元"},
    ...
    ...
    ...

  ],
  "table detail": [
    {
      "資產": [
        { "流動資產":
          [
            {
              "代碼": "1100",
              "項目": "現金及約當現金(附註四及六)",
              "112年12月31日金額": "1,516,633",
              "112年12月31日%": "-",
              "111年12月31日金額": "4,243,295",
              "111年12月31日%": "1"
            },
            {
              "代碼": "1110",
              "項目": "透過損益按公允價值衡量之金融資產(附註四、七及二六)",
              "112年12月31日金額": "341,056",
              "112年12月31日%": "-",
              "111年12月31日金額": "259,919",
              "111年12月31日%": "-"
            },
            {
              "代碼": "1120",
              "項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
              "112年12月31日金額": "4,333,594",
              "112年12月31日%": "1",
              "111年12月31日金額": "3,607,819",
              "111年12月31日%": "1"
            },
            {
              "代碼": "1150",
              "項目": "應收票據及帳款淨額(附註四及九)",
              "112年12月31日金額": "5,801,135",
              "112年12月31日%": "2",
              "111年12月31日金額": "5,319,368",
              "111年12月31日%": "1"
            },
            {
              "代碼": "1180",
              "項目": "應收票據及帳款-關係人(附註四及二七)",
              "112年12月31日金額": "572,118",
              "112年12月31日%": "-",
              "111年12月31日金額": "681,793",
              "111年12月31日%": "-"
            },
            {
              "代碼": "130X",
              "項目": "存貨(附註四及十)",
              "112年12月31日金額": "1,782,735",
              "112年12月31日%": "1",
              "111年12月31日金額": "2,321,850",
              "111年12月31日%": "1"
            },
            {
              "代碼": "1470",
              "項目": "其他流動資產(附註二一及二七)",
              "112年12月31日金額": "411,540",
              "112年12月31日%": "-",
              "111年12月31日金額": "248,683",
              "111年12月31日%": "-"
            },
            {
              "代碼": "11XX",
              "項目": "流動資產總計",
              "112年12月31日金額": "14,758,811",
              "112年12月31日%": "4",
              "111年12月31日金額": "16,682,727",
              "111年12月31日%": "4"
            }
          ]
        },
        {
          "非流動資產": [
            {
              "代碼": "1517",
              "項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
              "112年12月31日金額": "9,638,255",
              "112年12月31日%": "3",
              "111年12月31日金額": "7,633,603",
              "111年12月31日%": "2"
            },
            {
              "代碼": "1550",
              "項目": "採用權益法之投資(附註四、五及十一)",
              "112年12月31日金額": "312,351,291",
              "112年12月31日%": "82",
              "111年12月31日金額": "307,101,709",
              "111年12月31日%": "82"
            },
            {
              "代碼": "1600",
              "項目": "不動產、廠房及設備(附註四、五、十二、十三及二八)",
              "112年12月31日金額": "28,052,603",
              "112年12月31日%": "7",
              "111年12月31日金額": "35,583,596",
              "111年12月31日%": "10"
            },
            {
              "代碼": "1755",
              "項目": "使用權資產(附註四、十五、二十、二七)",
              "112年12月31日金額": "1,797,820",
              "112年12月31日%": "1",
              "111年12月31日金額": "1,788,972",
              "111年12月31日%": "1"
            },
            {
              "代碼": "1760",
              "項目": "投資性不動產(附註四、十四及二十)",
              "112年12月31日金額": "13,042,677",
              "112年12月31日%": "3",
              "111年12月31日金額": "2,436,675",
              "111年12月31日%": "-"
            },
            {
              "代碼": "1821",
              "項目": "無形資產(附註四及二十)",
              "112年12月31日金額": "58,840",
              "112年12月31日%": "-",
              "111年12月31日金額": "64,956",
              "111年12月31日%": "-"
            },
            {
              "代碼": "1915",
              "項目": "預付設備款",
              "112年12月31日金額": "600,042",
              "112年12月31日%": "-",
              "111年12月31日金額": "682,765",
              "111年12月31日%": "-"
            },
            {
              "代碼": "1975",
              "項目": "淨確定福利資產(附註四及十八)",
              "112年12月31日金額": "1,507,153",
              "112年12月31日%": "-",
              "111年12月31日金額": "1,526,546",
              "111年12月31日%": "-"
            },
            {
              "代碼": "1990",
              "項目": "其他非流動資產(附註四、六、二一及二八)",
              "112年12月31日金額": "827,628",
              "112年12月31日%": "-",
              "111年12月31日金額": "840,688",
              "111年12月31日%": "1"
            },
            {
              "代碼": "15XX",
              "項目": "非流動資產總計",
              "112年12月31日金額": "367,876,309",
              "112年12月31日%": "96",
              "111年12月31日金額": "357,659,510",
              "111年12月31日%": "96"
            }]
        },
        {
          "代碼": "1XXX",
          "項目": "資產總計",
          "112年12月31日金額": "382,635,120",
          "112年12月31日%": "100",
          "111年12月31日金額": "374,342,237",
          "111年12月31日%": "100"
        }
      ]
    },
    {
      "負債": [
        ...
        ...
        ...
      ]
    },
    ...
    ...
    ...
  ]
}
</json>

"""


def _encode_image_base64(image_path):
    """Return the bytes of the file at *image_path* as a base64 text string."""
    import base64

    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def extract_table_info(image_path):
    """Ask Claude to extract the table shown in an image.

    The image is sent as a base64-encoded PNG together with a prompt asking
    for the table content as JSON wrapped in ``<json>...</json>`` tags.

    Args:
        image_path: Path of the PNG image containing the table.

    Returns:
        The response object returned by ``client.messages.create``.

    Raises:
        RuntimeError: If the ``ANTHROPIC_API_KEY`` environment variable is
            not set or is empty.
    """
    import os

    # The key is read from the environment so that it never lives in the
    # source file.
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError(
            "ANTHROPIC_API_KEY environment variable is not set."
        )

    client = Anthropic(api_key=api_key)
    MODEL_NAME = "claude-3-5-sonnet-20240620"

    message_list = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": _encode_image_base64(image_path),
                    },
                },
                {
                    "type": "text",
                    "text": _TABLE_EXTRACTION_PROMPT,
                },
            ],
        }
    ]

    response = client.messages.create(
        model=MODEL_NAME,
        max_tokens=8192,  # upper bound on the length of the reply
        messages=message_list,
        temperature=0.7,
        # NOTE(review): beta header presumably required for the 8192-token
        # output limit on this model; confirm against the Anthropic docs.
        extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"},
    )
    tokens = response.usage.output_tokens
    print(f"Generated Tokens: {tokens}")
    return response
|
357 |
+
|
358 |
+
## Check Response
|
359 |
+
def check_response(response):
    """Print diagnostics about an API response's ``content`` attribute.

    Prints the type and value of ``response.content``. If it is not a
    non-empty list, an "unexpected format" message is printed as well.

    Args:
        response: Object exposing a ``content`` attribute.

    Returns:
        Always ``None``.
    """
    blocks = response.content
    print(type(blocks))
    print(blocks)

    if not (isinstance(blocks, list) and blocks):
        print("Unexpected response format. Unable to extract text.")
        return None

    # The text itself is not used; the attribute is still read so that a
    # first block without ``.text`` raises AttributeError here.
    _ = blocks[0].text
    return None
|
371 |
+
|
372 |
+
## Extract Json data
|
373 |
+
def extract_json(response):
    """Parse the JSON wrapped in ``<json>...</json>`` tags in a model reply.

    The payload is the text between the first ``<json>`` tag and the last
    ``</json>`` tag of ``response.content[0].text``.

    Args:
        response: API response whose ``content[0].text`` holds the reply.

    Returns:
        The decoded JSON value, or ``None`` when the tags are missing, out of
        order, enclose nothing, or enclose text that is not valid JSON. A
        diagnostic message is printed in every failure case.
    """
    open_tag = "<json>"
    close_tag = "</json>"
    response_text = response.content[0].text  # 'text' attribute of the first block

    # Check the result of find() BEFORE adding the tag length; otherwise a
    # missing tag (-1) turns into a valid-looking positive offset.
    tag_start = response_text.find(open_tag)
    json_start = tag_start + len(open_tag)
    json_end = response_text.rfind(close_tag)

    if tag_start < 0 or json_end <= json_start:
        print("Could not find valid JSON object in response.")
        return None

    payload = response_text[json_start:json_end]
    try:
        return json.loads(payload)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Problematic JSON string: {payload}")
        return None
|
393 |
+
|
394 |
+
## Convert json to Dataframe
|
395 |
+
|
396 |
+
## Convert to csv
|
397 |
+
|
398 |
+
## Process PDF
|
399 |
+
def pipeline(pdf_path):
    """Run the full PDF-to-JSON flow on the first page of a PDF.

    Steps: convert the PDF to page images, de-stamp ``page_0.png``, send the
    de-stamped image for table extraction, and parse the JSON in the reply.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A tuple ``(page_count, destamped_image_path, json_data)``.
    """
    page_images = convert_pdf_to_image(pdf_path)
    print(f"pages: {page_images}")

    # Only the first page is processed; it is loaded by its saved file name.
    destamped_path = destamp_image("page_0.png")

    api_response = extract_table_info(destamped_path)
    check_response(api_response)
    table_json = extract_json(api_response)

    return len(page_images), destamped_path, table_json
|
408 |
+
|
409 |
+
## Gradio Interface
|
410 |
+
# Text shown at the top of the Gradio page.
title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
description = (
    "Demo pdf, either editable or scanned image, information extraction "
    "for Traditional Chinese without OCR"
)
examples = [['text_pdf.pdf'], ['image_pdf.pdf']]

# Input component: the uploaded PDF is handed to pipeline() as a file path.
pdf_file = gr.File(label="Upload PDF", type="filepath")
# NOTE: this component is created but not wired into the interface below.
pages = gr.File(label="Pages", type="filepath")
# Output components, in the same order as the tuple returned by pipeline().
num_pages = gr.Number(label="Number of Pages")
destamp_img = gr.Image(type="numpy", label="De-stamped Image")
json_data = gr.JSON(label="JSON Data")

app = gr.Interface(
    fn=pipeline,
    inputs=pdf_file,
    outputs=[num_pages, destamp_img, json_data],
    title=title,
    description=description,
    examples=examples,
)
app.queue()
app.launch(debug=True, share=True)
|
429 |
+
#app.launch()
|