Spaces:

crystalchen
/

demo-fin-pdf-extraction

Runtime error

App Files Files Community

crystalchen committed on Aug 13, 2024

Commit

97c964c

verified ·

1 Parent(s): 538955d

Update app.py

Browse files

Files changed (1) hide show

app.py +429 -0

app.py CHANGED Viewed

	@@ -0,0 +1,429 @@

+from pdf2image import convert_from_path
+import cv2
+import numpy as np
+from google.colab.patches import cv2_imshow
+import numpy as np
+from PIL import Image
+import json
+from anthropic import Anthropic, Client
+import gradio as gr
+## Process pdf
+def convert_pdf_to_image(pdf_path):
+  # Convert PDF to images
+  pages = convert_from_path(pdf_path, dpi=400)
+  # Save images as PNG files
+  for i, page in enumerate(pages):
+      page.save(f'page_{i}.png', 'PNG')
+  print(f"Converted {len(pages)} pages to images.")
+  return pages
+## Image process Subprocess - De-stamp
+def destamp_image(img_path):
+  bgr_img = cv2.imread(img_path)
+  hsv_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2HSV)
+  # Convert the BGR image to grayscale
+  gray_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2GRAY)
+  # HSV ragne: (0-180, 0-255, 0-120)
+  # for character black color:
+  # H: 0-180,
+  # S: 0-255 ，
+  # V: 0-120 ，
+  lower_black = np.array([0,0,0])
+  upper_black = np.array([180,255,120])
+  mask = cv2.inRange(hsv_img, lower_black, upper_black)
+  deRed_img = ~mask # Single channel image
+  # imshow mask
+  #print(f"deRed_img shape: {deRed_img.shape}")
+  #show_image(deRed_img)
+  # thresholding -2
+  ret, threshold_img_2 = cv2.threshold(deRed_img, 120, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
+  #print(f' threshold-2 shape: {threshold_img_2.shape}')
+  #show_image(threshold_img_2)
+  # Desired shape: (x, y, 1)
+  new_shape = (threshold_img_2.shape[0], threshold_img_2.shape[1], 1)
+  # Resize using numpy.resize()
+  result_img = np.resize(threshold_img_2, new_shape)
+  print(f"result_img.shape: {result_img.shape}")
+  show_image(result_img)
+  #save result_img
+  result_filepath="result_img_0.png"
+  cv2.imwrite(result_filepath, result_img)
+  return result_filepath
+def extract_image_table(image_path):
+    # extract table information
+    response = {}
+    response = extract_table_info(image_path)
+    # Get text element from response
+    check_response(response)
+    # Extract response.content[0].text
+    json_data = extract_json(response)  #type(json_data) = "dict"
+    print(f"json_data: {json_data}")
+    return json_data
+## Extract Table Information
+def extract_table_info(image_path):
+  # Claude
+  client = Anthropic(api_key=my_api_key)  # Pass the API key here
+  MODEL_NAME = "claude-3-5-sonnet-20240620"
+  #Do ascending sort with index of value of "代碼" for  all the rows in each section. If there is "X" or "x" in "代碼", treat it as "9".
+  message_list = [
+      {
+        "role": "user",
+        "content": [
+          {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
+          #{"type": "text", "text": "Please extract the table information of the image, keep the context in Traditional Chinese without translation, all the alphanumeric chararacter exressed in string, give me a json dictionary of the information extracted"},
+          {
+            "type": "text",
+            "text": """
+                 Please extract the table information of the image, keep the context in Traditional Chinese without translation.
+                 if you can not recognize the value precisely, please infer it and try to make a best guess.
+                 If you can not make the best guess, please return “UNK”.
+                 Create a structured set of data in json format providing key information about a table.
+                 Keep the section titles in the table as a parts of json.
+                 Be sure to extract the information of "代碼", and save them as part of json.
+                 All the value extracted are string, including the "代碼".
+                 Do not do any sort operation with all the rows.
+                 Extract the text information of each cell precisely. Do not make inference between  "代碼" and "項目" if you can not extract it precisely.
+                 JSON fields must be labelled as:
+                 Example json structure is:
+                 <json>
+                 {
+                    "table meta": [
+                      {"企業名稱":  },
+                      {"表頭名稱":  },
+                      {"報表日期":  },
+                      {"幣別":      },
+                      ...
+                      ...
+                      ...
+                    ],
+                    "table detail": [
+                        {
+                          ...
+                          ...
+                          ...
+                        },
+                        {
+                          ...
+                          ...
+                          ...
+                        },
+                        ...
+                        ...
+                        ...
+                      ]
+                 }
+                 </json>
+                 Output the json structure as a string starting with <json> and ending with </json> XML tags.
+                 Do not return any narrative language. Look at the images in detail.
+                 Do not insert and control code, like line feed, tab indent: "\n"
+                 IF YOU COULD NOT FIND THE RIGHT INFORMATION JUST RETURN THIS VALUE “UNK”.
+                 Example:
+                 <json>
+                 {
+                    "table meta": [
+                      {"企業名稱": "台灣水泥股份有限公司"},
+                      {"表頭名稱": "個體資產負債表"},
+                      {"報表日期": "民國 112 年及 111 年 12 月 31 日"},
+                      {"幣別": "新台幣仟元"},
+                      ...
+                      ...
+                      ...
+                    ],
+                    "table detail": [
+                      {
+                        "資產": [
+                          { "流動資產":
+                            [
+                              {
+                                "代碼": "1100",
+                                "項目": "現金及約當現金（附註四及六）",
+                                "112年12月31日金額": "1,516,633",
+                                "112年12月31日%": "-",
+                                "111年12月31日金額": "4,243,295",
+                                "111年12月31日%": "1"
+                              },
+                              {
+                                "代碼": "1110",
+                                "項目": "透過損益按公允價值衡量之金融資產（附註四、七及二六）",
+                                "112年12月31日金額": "341,056",
+                                "112年12月31日%": "-",
+                                "111年12月31日金額": "259,919",
+                                "111年12月31日%": "-"
+                              },
+                              {
+                                "代碼": "1120",
+                                "項目": "透過其他綜合損益按公允價值衡量之金融資產（附註四、八及二六）",
+                                "112年12月31日金額": "4,333,594",
+                                "112年12月31日%": "1",
+                                "111年12月31日金額": "3,607,819",
+                                "111年12月31日%": "1"
+                              },
+                              {
+                                "代碼": "1150",
+                                "項目": "應收票據及帳款淨額（附註四及九）",
+                                "112年12月31日金額": "5,801,135",
+                                "112年12月31日%": "2",
+                                "111年12月31日金額": "5,319,368",
+                                "111年12月31日%": "1"
+                             },
+                              {
+                                "代碼": "1180",
+                                "項目": "應收票據及帳款－關係人（附註四及二七）",
+                                "112年12月31日金額": "572,118",
+                                "112年12月31日%": "-",
+                                "111年12月31日金額": "681,793",
+                                "111年12月31日%": "-"
+                              },
+                              {
+                                "代碼": "130X",
+                                "項目": "存貨（附註四及十）",
+                                "112年12月31日金額": "1,782,735",
+                                "112年12月31日%": "1",
+                                "111年12月31日金額": "2,321,850",
+                                "111年12月31日%": "1"
+                              },
+                              {
+                                "代碼": "1470",
+                                "項目": "其他流動資產（附註二一及二七）",
+                                "112年12月31日金額": "411,540",
+                                "112年12月31日%": "-",
+                                "111年12月31日金額": "248,683",
+                                "111年12月31日%": "-"
+                              },
+                              {
+                                "代碼": "11XX",
+                                "項目": "流動資產總計",
+                                "112年12月31日金額": "14,758,811",
+                                "112年12月31日%": "4",
+                                "111年12月31日金額": "16,682,727",
+                                "111年12月31日%": "4"
+                              }
+                            ]
+                        },
+                        {
+                          "非流動資產": [
+                          {
+                            "代碼": "1517",
+                            "項目": "透過其他綜合損益按公允價值衡量之金融資產（附註四、八及二六）",
+                            "112年12月31日金額": "9,638,255",
+                            "112年12月31日%": "3",
+                            "111年12月31日金額": "7,633,603",
+                            "111年12月31日%": "2"
+                          },
+                          {
+                            "代碼": "1550",
+                            "項目": "採用權益法之投資（附註四、五及十一）",
+                            "112年12月31日金額": "312,351,291",
+                            "112年12月31日%": "82",
+                            "111年12月31日金額": "307,101,709",
+                            "111年12月31日%": "82"
+                          },
+                          {
+                            "代碼": "1600",
+                            "項目": "不動產、廠房及設備（附註四、五、十二、十三及二八）",
+                            "112年12月31日金額": "28,052,603",
+                            "112年12月31日%": "7",
+                            "111年12月31日金額": "35,583,596",
+                            "111年12月31日%": "10"
+                          },
+                          {
+                            "代碼": "1755",
+                            "項目": "使用權資產（附註四、十五、二十、二七）",
+                            "112年12月31日金額": "1,797,820",
+                            "112年12月31日%": "1",
+                            "111年12月31日金額": "1,788,972",
+                            "111年12月31日%": "1"
+                          },
+                          {
+                            "代碼": "1760",
+                            "項目": "投資性不動產（附註四、十四及二十）",
+                            "112年12月31日金額": "13,042,677",
+                            "112年12月31日%": "3",
+                            "111年12月31日金額": "2,436,675",
+                            "111年12月31日%": "-"
+                          },
+                          {
+                            "代碼": "1821",
+                            "項目": "無形資產（附註四及二十）",
+                            "112年12月31日金額": "58,840",
+                            "112年12月31日%": "-",
+                            "111年12月31日金額": "64,956",
+                            "111年12月31日%": "-"
+                          },
+                          {
+                            "代碼": "1915",
+                            "項目": "預付設備款",
+                            "112年12月31日金額": "600,042",
+                            "112年12月31日%": "-",
+                            "111年12月31日金額": "682,765",
+                            "111年12月31日%": "-"
+                          },
+                          {
+                            "代碼": "1975",
+                            "項目": "淨確定福利資產（附註四及十八）",
+                            "112年12月31日金額": "1,507,153",
+                            "112年12月31日%": "-",
+                            "111年12月31日金額": "1,526,546",
+                            "111年12月31日%": "-"
+                          },
+                          {
+                            "代碼": "1990",
+                            "項目": "其他非流動資產（附註四、六、二一及二八）",
+                            "112年12月31日金額": "827,628",
+                            "112年12月31日%": "-",
+                            "111年12月31日金額": "840,688",
+                            "111年12月31日%": "1"
+                          },
+                          {
+                            "代碼": "15XX",
+                            "項目": "非流動資產總計",
+                            "112年12月31日金額": "367,876,309",
+                            "112年12月31日%": "96",
+                            "111年12月31日金額": "357,659,510",
+                            "111年12月31日%": "96"
+                          }]
+                        },
+                        {
+                          "代碼": "1XXX",
+                          "項目": "資產總計",
+                          "112年12月31日金額": "382,635,120",
+                          "112年12月31日%": "100",
+                          "111年12月31日金額": "374,342,237",
+                          "111年12月31日%": "100"
+                        }
+                      ]
+                      },
+                      {
+                        "負債": [
+                          ...
+                          ...
+                          ...
+                        ]
+                      },
+                      ...
+                      ...
+                      ...
+                    ]
+                  }
+                 </json>
+              """
+          }
+        ]
+      }
+  ]
+  # Update how the API is called
+  response = client.messages.create(
+      model=MODEL_NAME,
+      max_tokens=8192, # limit the amount of response information
+      messages=message_list,
+      temperature=0.7,
+      extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}  # Changed to a dictionary
+  )
+  tokens = response.usage.output_tokens
+  print(f"Generated Tokens: {tokens}")
+  #print(f"Response: {response}")
+  return response
+## Check Response
+def check_response(response):
+  # Check the type and content of the response
+  print(type(response.content))
+  print(response.content)
+  # Assuming the text content is in the first element of the list
+  if isinstance(response.content, list) and response.content:
+      content_text = response.content[0].text
+      #print(json.dumps(content_text, sort_keys=True, indent=4))
+  else:
+      print("Unexpected response format. Unable to extract text.")
+  return None
+## Extract Json data
+def extract_json(response):
+    response_text = response.content[0].text  # Access the 'text' attribute of the TextBlock object
+    # Try to find the start and end of the JSON object more robustly
+    # skip <json>
+    json_start = response_text.find("<json>")+6  # Skip the <json> tag
+    json_end = response_text.rfind("</json>")  # Include the closing brace
+    # Check if valid start and end indices were found
+    if json_start >= 0 and json_end > json_start:
+        try:
+            return json.loads(response_text[json_start:json_end])
+        except json.JSONDecodeError as e:
+            print(f"Error decoding JSON: {e}")
+            print(f"Problematic JSON string: {response_text[json_start+1:json_end]}")
+            return {response_text[json_start+1:json_end]}
+    else:
+        print("Could not find valid JSON object in response.")
+    return
+## Convert json to Dataframe
+## Convert to csv
+## Process PDF
+def pipeline(pdf_path):
+  pages = convert_pdf_to_image(pdf_path)
+  print(f"pages: {pages}")
+  destamp_img = destamp_image("page_0.png")
+  response = {}
+  response = extract_table_info(destamp_img)
+  check_response(response)
+  json_data = extract_json(response)
+  return len(pages), destamp_img, json_data
+## Gradio Interface
+# Title and description passed to the Gradio interface below.
+title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
+description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
+# Example inputs offered in the UI.
+# NOTE(review): assumes both PDF files sit next to this script - confirm.
+examples = [['text_pdf.pdf'], ['image_pdf.pdf']]
+# Input component: the PDF upload that is passed to pipeline.
+pdf_file = gr.File(label="Upload PDF", type="filepath")
+# NOTE(review): this component is not referenced by the Interface below.
+pages = gr.File(label="Pages", type="filepath")
+# Output components, listed in the order pipeline returns its three values.
+num_pages = gr.Number(label="Number of Pages")
+destamp_img = gr.Image(type="numpy", label="De-stamped Image")
+json_data = gr.JSON(label="JSON Data")
+# Wire pipeline to the single input and the three outputs.
+app = gr.Interface(fn=pipeline,
+                     inputs=pdf_file,
+                     outputs=[num_pages, destamp_img, json_data],
+                     title=title,
+                     description=description,
+                     examples=examples)
+# Enable the request queue, then launch with debug mode and a share link requested.
+app.queue()
+app.launch(debug=True, share=True)
+#app.launch()