|
import os |
|
|
|
# Point the Google Cloud client libraries at the service-account key file.
# This has to run before any client is constructed (translate_text and
# batch_process_documents both build clients). The path is relative, so it is
# resolved against the current working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
|
|
|
import pandas as pd |
|
from google.api_core.client_options import ClientOptions |
|
from google.cloud import documentai_v1 as documentai |
|
from google.cloud.documentai_v1.types import RawDocument |
|
from google.cloud import translate_v2 as translate |
|
import zipfile |
|
import os |
|
import io |
|
import gradio as gr |
|
|
|
|
|
# Table of per-image results; process_images clears and refills it on each call.
results_df = pd.DataFrame(
    columns=["Filename", "Extracted Text", "Translated Text"],
)

# Coordinates of the Document AI processor. `location` is also used to build
# the regional API endpoint in batch_process_documents.
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"
|
|
|
def translate_text(text, target_language="en"):
    """Translate ``text`` into ``target_language`` with Cloud Translation (v2).

    Args:
        text: The string to translate.
        target_language: Language code of the desired output (default ``"en"``).

    Returns:
        The translated string. Empty or whitespace-only input is returned
        as-is (``None`` becomes ``""``) without contacting the API.
    """
    # OCR of a blank image yields no text. There is nothing to translate, and
    # the API is expected to reject an empty query, which would otherwise
    # abort the whole batch in the caller.
    if not text or not text.strip():
        return text or ""

    translate_client = translate.Client()
    # format_="text": the input is plain OCR output, not HTML. Without it the
    # result comes back with HTML entities (e.g. "&#39;"), which are then
    # escaped a second time when the results table is rendered.
    result = translate_client.translate(
        text,
        target_language=target_language,
        format_="text",
    )
    return result["translatedText"]
|
|
|
def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
    """Send one local file through Document AI and translate the text it finds.

    Args:
        file_path: Path of the file whose bytes are uploaded.
        file_mime_type: MIME type sent with the raw bytes, e.g. ``"image/jpeg"``.

    Returns:
        A ``(extracted_text, translated_text)`` tuple: the document text
        reported by the processor, and that text passed through
        ``translate_text``.
    """
    # The endpoint is regional, so it is derived from the module-level location.
    endpoint = f"{location}-documentai.googleapis.com"
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    with open(file_path, "rb") as source:
        payload = RawDocument(content=source.read(), mime_type=file_mime_type)

    processor_name = docai_client.processor_path(project_id, location, processor_id)
    response = docai_client.process_document(
        request=documentai.ProcessRequest(name=processor_name, raw_document=payload)
    )

    ocr_text = response.document.text
    return ocr_text, translate_text(ocr_text)
|
|
|
def unzip_and_find_jpgs(file_path):
    """Extract a ZIP archive and return the paths of the JPEG images inside it.

    The archive is unpacked into the ``extracted_files`` directory (relative
    to the current working directory). That directory is emptied first, so
    only files from this archive are returned.

    Args:
        file_path: Path to the ZIP archive.

    Returns:
        A sorted list of paths under ``extracted_files`` for every file whose
        name ends in ``.jpg`` or ``.jpeg`` (case-insensitive). Anything under
        a path containing ``__MACOSX`` is skipped.

    Raises:
        zipfile.BadZipFile: If ``file_path`` is not a valid ZIP archive.
    """
    import shutil  # local import so the file's import block stays untouched

    extract_path = "extracted_files"
    # Wipe leftovers from earlier uploads; otherwise their images would be
    # found by the walk below and processed again.
    shutil.rmtree(extract_path, ignore_errors=True)
    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)

    jpg_files = []
    for root, _dirs, files in os.walk(extract_path):
        # macOS archives carry resource-fork copies here; they are not images.
        if "__MACOSX" in root:
            continue
        jpg_files.extend(
            os.path.join(root, name)
            for name in files
            # The UI promises JPEG/JPG, so accept both extensions.
            if name.lower().endswith((".jpg", ".jpeg"))
        )

    # os.walk order is filesystem-dependent; sort for a stable result.
    return sorted(jpg_files)
|
|
|
def process_images(uploaded_file):
    """Gradio callback: OCR and translate every JPEG in an uploaded ZIP archive.

    Args:
        uploaded_file: The upload handed over by Gradio. This is either a path
            string or an object exposing the path as ``.name``, and ``None``
            when nothing was uploaded.

    Returns:
        An HTML table (``DataFrame.to_html``) with one row per image. A plain
        message string is returned instead when no file was uploaded, the
        archive holds no JPG files, or processing failed.

    Side effects:
        Rebinds the module-level ``results_df`` to the rows collected during
        this call (an empty table when nothing was processed).
    """
    global results_df
    import html  # local import so the file's import block stays untouched

    columns = ["Filename", "Extracted Text", "Translated Text"]
    # Every call starts from an empty table.
    results_df = pd.DataFrame(columns=columns)

    if uploaded_file is None:
        return "No file uploaded."

    # Depending on the Gradio version, the upload arrives as a path string or
    # as a file wrapper whose .name is the path.
    archive_path = getattr(uploaded_file, "name", uploaded_file)

    # Collect rows in a list and build the frame once, instead of calling
    # pd.concat per image (quadratic, and concatenating onto an empty frame is
    # deprecated in recent pandas).
    rows = []
    try:
        image_files = unzip_and_find_jpgs(archive_path)
        if not image_files:
            return "No JPG files found in the zip."

        for image_path in image_files:
            extracted_text, translated_text = batch_process_documents(
                image_path, "image/jpeg"
            )
            rows.append(
                {
                    "Filename": os.path.basename(image_path),
                    "Extracted Text": extracted_text,
                    "Translated Text": translated_text,
                }
            )
    except Exception as e:
        # The message is rendered by an HTML component and may contain text
        # taken from the uploaded archive, so escape it.
        return f"An error occurred: {html.escape(str(e))}"
    finally:
        # Publish whatever was gathered, including a partial result when a
        # later image failed.
        results_df = pd.DataFrame(rows, columns=columns)

    return results_df.to_html()
|
|
|
# Gradio front end: one file upload in, the HTML returned by process_images out.
interface = gr.Interface(
    fn=process_images,
    inputs="file",
    outputs="html",
    title="Document AI Translation",
    description=(
        "Upload a ZIP file containing JPEG/JPG images, and the system will "
        "extract and translate text from each image."
    ),
)


# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch(debug=True)