Spaces:

spark-ds549
/

Chinese-Label-Transcription

Sleeping

App Files Community

mkaramb committed on Apr 18, 2024

Commit

4d57e5c

verified ·

1 Parent(s): 37bfbd1

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -16

app.py CHANGED Viewed

@@ -1,29 +1,97 @@
 import zipfile
-import gradio as gr
 import os
-def unzip_file(file):
-    extract_path = "extracted_files"  # Define a path to extract files
-    os.makedirs(extract_path, exist_ok=True)  # Create the directory if it doesn't exist
-    jpg_files = []  # List to store paths of JPG files
-    with zipfile.ZipFile(file, "r") as zip_ref:
-        zip_ref.extractall(extract_path)  # Extract files into the specified directory
-        # Walk through the directory structure and look for JPG files, ignoring __MACOSX directory
         for root, dirs, files in os.walk(extract_path):
-            if '__MACOSX' in root:  # Skip the __MACOSX directory
                 continue
             for file in files:
-                if file.lower().endswith('.jpg'):  # Check if the file is a JPG
                     full_path = os.path.join(root, file)
                     jpg_files.append(full_path)
-    if not jpg_files:
-        return ["No JPG files found in the zip."]  # Return a message if no JPGs are found
-    return jpg_files  # Return the list of JPG file paths
-# Define the Gradio interface, specifying image display for multiple images
-interface = gr.Interface(fn=unzip_file, inputs="file", outputs=gr.Gallery())
-interface.launch()
 # def greet(name):

+import pandas as pd
+from google.api_core.client_options import ClientOptions
+from google.cloud import documentai_v1 as documentai
+from google.cloud.documentai_v1.types import RawDocument
+from google.cloud import translate_v2 as translate
+from google.colab import files
 import zipfile
 import os
+import io
+import gradio as gr
+# Upload credential json file from default compute service account
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
+# Global DataFrame declaration
+results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
+# Set your Google Cloud Document AI processor details here
+project_id = "herbaria-ai"
+location = "us"
+processor_id = "4307b078717a399a"
+def translate_text(text, target_language="en"):
+    translate_client = translate.Client()
+    result = translate_client.translate(text, target_language=target_language)
+    return result["translatedText"]
+def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
+    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
+    client = documentai.DocumentProcessorServiceClient(client_options=opts)
+    with open(file_path, "rb") as file_stream:
+        raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
+    name = client.processor_path(project_id, location, processor_id)
+    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
+    result = client.process_document(request=request)
+    extracted_text = result.document.text
+    translated_text = translate_text(extracted_text)
+    return extracted_text, translated_text
+def unzip_and_find_jpgs(file_path):
+    extract_path = "extracted_files"
+    os.makedirs(extract_path, exist_ok=True)
+    jpg_files = []
+    with zipfile.ZipFile(file_path, 'r') as zip_ref:
+        zip_ref.extractall(extract_path)
         for root, dirs, files in os.walk(extract_path):
+            if '__MACOSX' in root:
                 continue
             for file in files:
+                if file.lower().endswith('.jpg'):
                     full_path = os.path.join(root, file)
                     jpg_files.append(full_path)
+    return jpg_files
+def process_images(uploaded_file):
+    global results_df
+    results_df = results_df.iloc[0:0]  # Clear the DataFrame if re-running this cell
+    file_path = uploaded_file.name  # Gradio provides the file path through the .name attribute
+    try:
+        image_files = unzip_and_find_jpgs(file_path)
+        if not image_files:
+            return "No JPG files found in the zip."
+        for file_path in image_files:
+            extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
+            new_row = pd.DataFrame([{
+                "Filename": os.path.basename(file_path),
+                "Extracted Text": extracted_text,
+                "Translated Text": translated_text
+            }])
+            results_df = pd.concat([results_df, new_row], ignore_index=True)
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+    return results_df.to_html()
+interface = gr.Interface(
+    fn=process_images,
+    inputs="file",
+    outputs="html",
+    title="Document AI Translation",
+    description="Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image."
+)
+if __name__ == "__main__":
+    interface.launch(debug=True)
 # def greet(name):