mkaramb committed on
Commit
4d57e5c
·
verified ·
1 Parent(s): 37bfbd1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -16
app.py CHANGED
@@ -1,29 +1,97 @@
 
 
 
 
 
 
1
  import zipfile
2
- import gradio as gr
3
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
def unzip_file(file):
    """Extract a ZIP archive and return the paths of the JPEG images it contains.

    Args:
        file: Path to (or file object for) the ZIP archive, as accepted by
            ``zipfile.ZipFile``.

    Returns:
        A list of paths to the extracted ``.jpg``/``.jpeg`` files, or a
        one-element list holding a "not found" message when there are none.

    Raises:
        zipfile.BadZipFile: If ``file`` is not a valid ZIP archive.
    """
    import shutil  # local import: only this function needs it

    extract_path = "extracted_files"
    # Start from an empty directory so images left over from a previous
    # upload are not returned again.
    shutil.rmtree(extract_path, ignore_errors=True)
    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(file, "r") as zip_ref:
        zip_ref.extractall(extract_path)

    jpg_files = []
    for root, dirs, filenames in os.walk(extract_path):
        # Prune macOS metadata folders in place so os.walk never enters them.
        dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
        for filename in sorted(filenames):
            if filename.lower().endswith((".jpg", ".jpeg")):
                jpg_files.append(os.path.join(root, filename))

    if not jpg_files:
        return ["No JPG files found in the zip."]
    return jpg_files
23
 
24
# Define the Gradio interface: a file upload in, a gallery of the extracted
# images out.
interface = gr.Interface(fn=unzip_file, inputs="file", outputs=gr.Gallery())

# Only start the server when run as a script, so importing this module has
# no side effect beyond building the interface.
if __name__ == "__main__":
    interface.launch()
27
 
28
 
29
  # def greet(name):
 
1
+ import pandas as pd
2
+ from google.api_core.client_options import ClientOptions
3
+ from google.cloud import documentai_v1 as documentai
4
+ from google.cloud.documentai_v1.types import RawDocument
5
+ from google.cloud import translate_v2 as translate
6
+ from google.colab import files
7
  import zipfile
 
8
  import os
9
+ import io
10
+ import gradio as gr
11
+
12
# Point the Google client libraries at the service-account key file, unless
# the environment already supplies its own credentials path.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "herbaria-ai-3c860bcb0f44.json")

# Module-level table holding the results of the most recent run.
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

# Google Cloud Document AI processor details.
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"
22
+
23
def translate_text(text, target_language="en"):
    """Translate ``text`` into ``target_language`` via the Cloud Translation v2 client.

    Args:
        text: The string to translate.
        target_language: Language code of the desired output; defaults to ``"en"``.

    Returns:
        The translated string, or ``""`` when ``text`` is ``None``, empty or
        whitespace-only.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""

    translate_client = translate.Client()
    # Ask for plain-text output. Without this the service treats the input as
    # HTML and returns entity-escaped text (e.g. "&#39;" for an apostrophe).
    result = translate_client.translate(
        text, target_language=target_language, format_="text"
    )
    return result["translatedText"]
27
+
28
def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
    """Send one file to the Document AI processor and translate the text it finds.

    Args:
        file_path: Path of the file to read and submit.
        file_mime_type: MIME type reported for the file (e.g. ``"image/jpeg"``).

    Returns:
        A ``(extracted_text, translated_text)`` tuple.
    """
    # The Document AI endpoint is regional and derived from ``location``.
    endpoint = f"{location}-documentai.googleapis.com"
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    with open(file_path, "rb") as source:
        payload = source.read()

    response = docai_client.process_document(
        request=documentai.ProcessRequest(
            name=docai_client.processor_path(project_id, location, processor_id),
            raw_document=RawDocument(content=payload, mime_type=file_mime_type),
        )
    )

    extracted_text = response.document.text
    return extracted_text, translate_text(extracted_text)
42
+
43
def unzip_and_find_jpgs(file_path):
    """Extract a ZIP archive and return the paths of the JPEG images inside it.

    Args:
        file_path: Path to the ZIP archive.

    Returns:
        A list of paths to the extracted ``.jpg``/``.jpeg`` files (matched
        case-insensitively), in a deterministic order. Empty when the archive
        holds no such files.

    Raises:
        zipfile.BadZipFile: If ``file_path`` is not a valid ZIP archive.
    """
    import shutil  # local import: only this function needs it

    extract_path = "extracted_files"
    # Start from an empty directory so images left over from a previous
    # upload are not picked up and processed again.
    shutil.rmtree(extract_path, ignore_errors=True)
    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)

    jpg_files = []
    for root, dirs, filenames in os.walk(extract_path):
        # Prune macOS metadata folders in place so os.walk never enters them.
        dirs[:] = sorted(d for d in dirs if d != "__MACOSX")
        for filename in sorted(filenames):
            if filename.lower().endswith((".jpg", ".jpeg")):
                jpg_files.append(os.path.join(root, filename))
    return jpg_files
57
+
58
def process_images(uploaded_file):
    """Run OCR and translation on every JPEG in an uploaded ZIP archive.

    The module-level ``results_df`` is replaced with the rows gathered during
    this call (including a partial set if processing fails midway).

    Args:
        uploaded_file: The upload handed over by Gradio: either a path string
            or an object exposing the path as ``.name``. May be ``None`` when
            nothing was uploaded.

    Returns:
        An HTML table of the results, or a plain message string when no file
        was uploaded, no JPGs were found, or an error occurred.
    """
    global results_df

    if uploaded_file is None:
        return "No file uploaded."

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    zip_path = getattr(uploaded_file, "name", uploaded_file)

    # Collect rows in a list and build the DataFrame once, rather than
    # re-copying the whole frame with pd.concat on every iteration.
    rows = []
    try:
        image_files = unzip_and_find_jpgs(zip_path)

        if not image_files:
            return "No JPG files found in the zip."

        for image_path in image_files:
            extracted_text, translated_text = batch_process_documents(image_path, "image/jpeg")
            rows.append({
                "Filename": os.path.basename(image_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return f"An error occurred: {str(e)}"
    finally:
        # Always publish what was gathered, replacing any previous run's data.
        results_df = pd.DataFrame(
            rows, columns=["Filename", "Extracted Text", "Translated Text"]
        )

    return results_df.to_html()
82
+
83
+
84
# Gradio front end: a ZIP upload goes in, an HTML table of results comes out.
interface = gr.Interface(
    title="Document AI Translation",
    description="Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image.",
    fn=process_images,
    inputs="file",
    outputs="html",
)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch(debug=True)
 
94
 
 
 
 
95
 
96
 
97
  # def greet(name):