Spaces:
Runtime error
Runtime error
import base64
import json
import os

import cv2
import gradio as gr
import numpy as np
from anthropic import Anthropic, Client
from pdf2image import convert_from_path
from PIL import Image
def get_base64_encorded_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result can be embedded in a JSON payload.

    Requires the module-level ``import base64``; the original file used
    ``base64`` without importing it, which raised ``NameError`` on first call.

    Args:
        image_path: Path of the file to encode.

    Returns:
        The base64 text of the file's bytes (``""`` for an empty file).
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
## Process pdf | |
def convert_pdf_to_image(pdf_path):
    """Render every page of a PDF at 400 dpi and save each one as a PNG.

    Pages are written to the current working directory as ``page_<i>.png``,
    where ``<i>`` is the zero-based page index.

    Args:
        pdf_path: Path of the PDF passed to ``convert_from_path``.

    Returns:
        The list of page images returned by ``convert_from_path``.
    """
    rendered_pages = convert_from_path(pdf_path, dpi=400)

    # Persist each rendered page so later steps can load it by file name.
    for page_index, page_image in enumerate(rendered_pages):
        png_name = f'page_{page_index}.png'
        page_image.save(png_name, 'PNG')

    page_count = len(rendered_pages)
    print(f"Converted {page_count} pages to images.")
    return rendered_pages
## Image process Subprocess - De-stamp | |
def destamp_image(img_path, result_filepath="result_img_0.png"):
    """Remove bright/coloured marks (e.g. stamps) from a page image.

    Pixels whose HSV value is at most 120 (any hue, any saturation) are
    treated as dark text and end up black (0) in the output; every other
    pixel becomes white (255). The result is written to *result_filepath*
    as a single-channel image.

    Changes from the previous version:
      * the ``cv2.imshow(result_img)`` call was removed - it lacked the
        required window-name argument and has no place in a server process;
      * an unreadable input and a failed write are now reported explicitly;
      * the unused grayscale conversion was dropped;
      * the output path is a parameter (default unchanged).

    Args:
        img_path: Path of the image to process.
        result_filepath: Where to write the processed image.

    Returns:
        ``result_filepath``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    bgr_img = cv2.imread(img_path)
    if bgr_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image file: {img_path!r}")

    hsv_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2HSV)

    # HSV range regarded as black characters:
    #   H: 0-180, S: 0-255, V: 0-120
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([180, 255, 120])
    mask = cv2.inRange(hsv_img, lower_black, upper_black)

    # inRange marks dark pixels with 255; invert so text is black on white.
    inverted_mask = ~mask  # single-channel image

    _, threshold_img = cv2.threshold(
        inverted_mask, 120, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )

    # Add an explicit channel axis: (rows, cols) -> (rows, cols, 1).
    result_img = np.expand_dims(threshold_img, axis=-1)
    print(f"result_img.shape: {result_img.shape}")

    # cv2.imwrite returns False on failure rather than raising.
    if not cv2.imwrite(result_filepath, result_img):
        raise OSError(f"Could not write image file: {result_filepath!r}")
    return result_filepath
def extract_image_table(image_path):
    """Extract the table in an image and return it as parsed JSON.

    Runs the three steps in order: request the extraction with
    ``extract_table_info``, print diagnostics with ``check_response``, then
    parse the reply with ``extract_json``.

    Args:
        image_path: Path of the image forwarded to ``extract_table_info``.

    Returns:
        Whatever ``extract_json`` returns for the model response.
    """
    api_response = extract_table_info(image_path)
    check_response(api_response)

    table_json = extract_json(api_response)
    print(f"json_data: {table_json}")
    return table_json
## Extract Table Information | |
# Instruction text sent together with the page image.
# NOTE: this is deliberately NOT a raw string, exactly as in the original
# inline literal: the backslash-n inside the "control code" sentence is
# therefore sent to the model as a real line break.
_TABLE_EXTRACTION_PROMPT = """
Please extract the table information of the image, keep the context in Traditional Chinese without translation.
if you can not recognize the value precisely, please infer it and try to make a best guess.
If you can not make the best guess, please return “UNK”.
Create a structured set of data in json format providing key information about a table.
Keep the section titles in the table as a parts of json.
Be sure to extract the information of "代碼", and save them as part of json.
All the value extracted are string, including the "代碼".
Do not do any sort operation with all the rows.
Extract the text information of each cell precisely. Do not make inference between "代碼" and "項目" if you can not extract it precisely.
JSON fields must be labelled as:
Example json structure is:
<json>
{
"table meta": [
{"企業名稱": },
{"表頭名稱": },
{"報表日期": },
{"幣別": },
...
...
...
],
"table detail": [
{
...
...
...
},
{
...
...
...
},
...
...
...
]
}
</json>
Output the json structure as a string starting with <json> and ending with </json> XML tags.
Do not return any narrative language. Look at the images in detail.
Do not insert and control code, like line feed, tab indent: "\n"
IF YOU COULD NOT FIND THE RIGHT INFORMATION JUST RETURN THIS VALUE “UNK”.
Example:
<json>
{
"table meta": [
{"企業名稱": "台灣水泥股份有限公司"},
{"表頭名稱": "個體資產負債表"},
{"報表日期": "民國 112 年及 111 年 12 月 31 日"},
{"幣別": "新台幣仟元"},
...
...
...
],
"table detail": [
{
"資產": [
{ "流動資產":
[
{
"代碼": "1100",
"項目": "現金及約當現金(附註四及六)",
"112年12月31日金額": "1,516,633",
"112年12月31日%": "-",
"111年12月31日金額": "4,243,295",
"111年12月31日%": "1"
},
{
"代碼": "1110",
"項目": "透過損益按公允價值衡量之金融資產(附註四、七及二六)",
"112年12月31日金額": "341,056",
"112年12月31日%": "-",
"111年12月31日金額": "259,919",
"111年12月31日%": "-"
},
{
"代碼": "1120",
"項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
"112年12月31日金額": "4,333,594",
"112年12月31日%": "1",
"111年12月31日金額": "3,607,819",
"111年12月31日%": "1"
},
{
"代碼": "1150",
"項目": "應收票據及帳款淨額(附註四及九)",
"112年12月31日金額": "5,801,135",
"112年12月31日%": "2",
"111年12月31日金額": "5,319,368",
"111年12月31日%": "1"
},
{
"代碼": "1180",
"項目": "應收票據及帳款-關係人(附註四及二七)",
"112年12月31日金額": "572,118",
"112年12月31日%": "-",
"111年12月31日金額": "681,793",
"111年12月31日%": "-"
},
{
"代碼": "130X",
"項目": "存貨(附註四及十)",
"112年12月31日金額": "1,782,735",
"112年12月31日%": "1",
"111年12月31日金額": "2,321,850",
"111年12月31日%": "1"
},
{
"代碼": "1470",
"項目": "其他流動資產(附註二一及二七)",
"112年12月31日金額": "411,540",
"112年12月31日%": "-",
"111年12月31日金額": "248,683",
"111年12月31日%": "-"
},
{
"代碼": "11XX",
"項目": "流動資產總計",
"112年12月31日金額": "14,758,811",
"112年12月31日%": "4",
"111年12月31日金額": "16,682,727",
"111年12月31日%": "4"
}
]
},
{
"非流動資產": [
{
"代碼": "1517",
"項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
"112年12月31日金額": "9,638,255",
"112年12月31日%": "3",
"111年12月31日金額": "7,633,603",
"111年12月31日%": "2"
},
{
"代碼": "1550",
"項目": "採用權益法之投資(附註四、五及十一)",
"112年12月31日金額": "312,351,291",
"112年12月31日%": "82",
"111年12月31日金額": "307,101,709",
"111年12月31日%": "82"
},
{
"代碼": "1600",
"項目": "不動產、廠房及設備(附註四、五、十二、十三及二八)",
"112年12月31日金額": "28,052,603",
"112年12月31日%": "7",
"111年12月31日金額": "35,583,596",
"111年12月31日%": "10"
},
{
"代碼": "1755",
"項目": "使用權資產(附註四、十五、二十、二七)",
"112年12月31日金額": "1,797,820",
"112年12月31日%": "1",
"111年12月31日金額": "1,788,972",
"111年12月31日%": "1"
},
{
"代碼": "1760",
"項目": "投資性不動產(附註四、十四及二十)",
"112年12月31日金額": "13,042,677",
"112年12月31日%": "3",
"111年12月31日金額": "2,436,675",
"111年12月31日%": "-"
},
{
"代碼": "1821",
"項目": "無形資產(附註四及二十)",
"112年12月31日金額": "58,840",
"112年12月31日%": "-",
"111年12月31日金額": "64,956",
"111年12月31日%": "-"
},
{
"代碼": "1915",
"項目": "預付設備款",
"112年12月31日金額": "600,042",
"112年12月31日%": "-",
"111年12月31日金額": "682,765",
"111年12月31日%": "-"
},
{
"代碼": "1975",
"項目": "淨確定福利資產(附註四及十八)",
"112年12月31日金額": "1,507,153",
"112年12月31日%": "-",
"111年12月31日金額": "1,526,546",
"111年12月31日%": "-"
},
{
"代碼": "1990",
"項目": "其他非流動資產(附註四、六、二一及二八)",
"112年12月31日金額": "827,628",
"112年12月31日%": "-",
"111年12月31日金額": "840,688",
"111年12月31日%": "1"
},
{
"代碼": "15XX",
"項目": "非流動資產總計",
"112年12月31日金額": "367,876,309",
"112年12月31日%": "96",
"111年12月31日金額": "357,659,510",
"111年12月31日%": "96"
}]
},
{
"代碼": "1XXX",
"項目": "資產總計",
"112年12月31日金額": "382,635,120",
"112年12月31日%": "100",
"111年12月31日金額": "374,342,237",
"111年12月31日%": "100"
}
]
},
{
"負債": [
...
...
...
]
},
...
...
...
]
}
</json>
"""


def extract_table_info(image_path):
    """Ask Claude to extract the table shown in a PNG image.

    The image is sent base64-encoded together with the extraction prompt in
    a single user message. The number of generated tokens is printed.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment variable.
    The previous version referenced a name, ``my_api_key``, that was never
    defined in this file and therefore raised ``NameError`` on every call.

    Args:
        image_path: Path of the PNG image to analyse.

    Returns:
        The response object returned by ``client.messages.create``.

    Raises:
        RuntimeError: If ``ANTHROPIC_API_KEY`` is unset or empty.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError(
            "ANTHROPIC_API_KEY is not set; cannot call the Anthropic API."
        )
    client = Anthropic(api_key=api_key)
    model_name = "claude-3-5-sonnet-20240620"

    message_list = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": get_base64_encorded_image(image_path),
                    },
                },
                {
                    "type": "text",
                    "text": _TABLE_EXTRACTION_PROMPT,
                },
            ],
        }
    ]

    response = client.messages.create(
        model=model_name,
        max_tokens=8192,  # upper bound on the length of the reply
        messages=message_list,
        temperature=0.7,
        # NOTE(review): presumably this beta flag is what permits the
        # 8192-token output limit on this model - confirm before removing.
        extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"},
    )

    tokens = response.usage.output_tokens
    print(f"Generated Tokens: {tokens}")
    return response
## Check Response | |
def check_response(response):
    """Print diagnostics about a model response; always returns ``None``.

    Prints the type and value of ``response.content``. When the content is a
    non-empty list, the first element's ``text`` attribute is read (but not
    used further); otherwise a warning is printed.

    Args:
        response: Object exposing a ``content`` attribute.

    Returns:
        None.
    """
    blocks = response.content
    print(type(blocks))
    print(blocks)

    # Guard clause: anything other than a non-empty list is unexpected.
    if not (isinstance(blocks, list) and blocks):
        print("Unexpected response format. Unable to extract text.")
        return None

    # The text is assumed to live in the first element of the list.
    first_block_text = blocks[0].text
    return None
## Extract Json data | |
def extract_json(response):
    """Parse the JSON payload wrapped in ``<json>...</json>`` tags.

    The payload is taken from the first ``<json>`` to the last ``</json>``
    found in ``response.content[0].text``.

    Fixes over the previous version:
      * a missing ``<json>`` tag is now detected (``find`` returns -1, and
        adding the tag length before the check hid that);
      * the malformed payload is reported in full (the old slice dropped its
        first character);
      * a malformed payload yields a JSON-serialisable dict instead of a
        ``set`` containing a string.

    Args:
        response: Object whose ``content[0].text`` holds the model reply.

    Returns:
        The decoded JSON value on success; ``None`` when the tags are missing
        or enclose nothing; or ``{"error": ..., "raw_text": ...}`` when the
        enclosed text is not valid JSON.
    """
    response_text = response.content[0].text
    open_tag = "<json>"
    close_tag = "</json>"

    tag_start = response_text.find(open_tag)
    json_end = response_text.rfind(close_tag)
    json_start = tag_start + len(open_tag)

    # Both tags must be present, in order, with something between them.
    if tag_start < 0 or json_end <= json_start:
        print("Could not find valid JSON object in response.")
        return None

    payload = response_text[json_start:json_end]
    try:
        return json.loads(payload)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"Problematic JSON string: {payload}")
        return {"error": f"Error decoding JSON: {e}", "raw_text": payload}
## Convert json to Dataframe | |
## Convert to csv | |
## Process PDF | |
def pipeline(pdf_path):
    """Run the full PDF-to-JSON flow for the first page of a PDF.

    Steps: render the PDF to ``page_<i>.png`` files, de-stamp
    ``page_0.png``, send the cleaned image to the model, print response
    diagnostics, and parse the JSON in the reply.

    Args:
        pdf_path: Path of the uploaded PDF.

    Returns:
        A 3-tuple ``(number_of_pages, destamped_image_path, parsed_json)``.
    """
    page_images = convert_pdf_to_image(pdf_path)
    print(f"pages: {page_images}")

    # Only the first rendered page is processed.
    cleaned_image_path = destamp_image("page_0.png")

    model_response = extract_table_info(cleaned_image_path)
    check_response(model_response)
    table_json = extract_json(model_response)

    return len(page_images), cleaned_image_path, table_json
## Gradio Interface | |
# Text shown at the top of the demo page.
title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""

# Sample inputs offered by the interface.
examples = ['text_pdf.pdf', 'image_pdf.pdf']

# Input component: the uploaded PDF is passed to ``pipeline`` as a file path.
pdf_file = gr.File(label="Upload PDF", type="filepath")

# Declared but not wired into the interface below.
pages = gr.File(label="Pages", type="filepath")

# Output components, in the order ``pipeline`` returns its values.
num_pages = gr.Number(label="Number of Pages")
destamp_img = gr.Image(type="numpy", label="De-stamped Image")
json_data = gr.JSON(label="JSON Data")

app = gr.Interface(
    fn=pipeline,
    inputs=pdf_file,
    outputs=[num_pages, destamp_img, json_data],
    title=title,
    description=description,
    examples=examples,
)

# Enable request queuing, then start the server.
app.queue()
app.launch(debug=True, share=True)