|
import html
import io
import json
import os
import random
import tempfile
import zipfile

# Service-account key file picked up by the Google Cloud client libraries.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

import google.generativeai as genai
import gradio as gr
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import translate_v2 as translate
from google.cloud.documentai_v1.types import RawDocument
|
|
|
|
|
# Coordinates of the Document AI processor that batch_process_documents calls.
project_id, location, processor_id = "herbaria-ai", "us", "4307b078717a399a"

# Module-level results table: one row per processed image.
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
|
|
|
def translate_text(text, target_language="en"):
    """Translate ``text`` with the Google Cloud Translation (v2) client.

    Args:
        text: Plain text to translate.
        target_language: Language code to translate into; defaults to ``"en"``.

    Returns:
        The ``translatedText`` field of the API result, or ``""`` when ``text``
        is empty or whitespace-only (in which case no request is made).
    """
    if not text or not text.strip():
        # Nothing to translate, so skip the network round trip entirely.
        return ""

    translate_client = translate.Client()
    # The input is plain OCR text. Without an explicit format the service
    # treats it as HTML and hands back entity-escaped output (e.g. "&#39;").
    result = translate_client.translate(
        text, target_language=target_language, format_="text"
    )
    return result["translatedText"]
|
|
|
def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
    """Send one file through Document AI and translate the recognised text.

    Args:
        file_path: Path of the file whose bytes are submitted for processing.
        file_mime_type: MIME type declared for the file's content.

    Returns:
        A ``(extracted_text, translated_text)`` tuple: the text of the
        processed document and the result of ``translate_text`` on that text.
    """
    endpoint = f"{location}-documentai.googleapis.com"
    docai_client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    with open(file_path, "rb") as source:
        payload = source.read()
    document_input = RawDocument(content=payload, mime_type=file_mime_type)

    processor_name = docai_client.processor_path(project_id, location, processor_id)
    response = docai_client.process_document(
        request=documentai.ProcessRequest(
            name=processor_name, raw_document=document_input
        )
    )

    ocr_text = response.document.text
    return ocr_text, translate_text(ocr_text)
|
|
|
def unzip_and_find_jpgs(file_path):
    """Extract a ZIP archive and return the paths of the JPEG images inside it.

    Every call extracts into its own fresh sub-directory of
    ``extracted_files`` so that images left behind by earlier uploads are never
    picked up again.

    Args:
        file_path: Path to the ZIP archive.

    Returns:
        Sorted list of paths to extracted files whose name ends in ``.jpg`` or
        ``.jpeg`` (case-insensitive). ``__MACOSX`` folders and ``._*``
        resource-fork files are skipped.

    Raises:
        zipfile.BadZipFile: If ``file_path`` is not a valid ZIP archive.
    """
    base_dir = "extracted_files"
    os.makedirs(base_dir, exist_ok=True)
    # A unique directory per call keeps uploads isolated from one another.
    extract_path = tempfile.mkdtemp(prefix="upload_", dir=base_dir)

    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)

    jpg_files = []
    for root, dirs, files in os.walk(extract_path):
        # Prune macOS metadata folders in place so os.walk never descends.
        dirs[:] = [name for name in dirs if name != "__MACOSX"]
        for file in files:
            if file.startswith("._"):
                continue
            if file.lower().endswith((".jpg", ".jpeg")):
                jpg_files.append(os.path.join(root, file))

    # os.walk order is filesystem-dependent; sort for a stable result.
    return sorted(jpg_files)
|
|
|
def get_random_pairs_list(shots, num_pairs=2):
    """Pick ``num_pairs`` distinct entries of ``shots`` at random.

    Returns a list of ``(key, value)`` tuples in the order the keys were
    sampled.
    """
    chosen = []
    for picked_key in random.sample(list(shots.keys()), num_pairs):
        chosen.append((picked_key, shots[picked_key]))
    return chosen
|
|
|
def construct_prompt(input_text, random_pairs):
    """Build the few-shot prompt asking the model to label ``input_text``.

    Args:
        input_text: The text to extract Collector/Location/Taxon/Date from.
        random_pairs: Sequence of ``(example_text, labels)`` pairs, where
            ``labels`` is a mapping with the keys ``Collector``, ``Location``,
            ``Taxon`` and ``Date``. Each pair becomes one numbered
            Input/Output example.

    Returns:
        The prompt string: instructions, the worked examples, then
        ``input_text`` as the final numbered input with an empty output slot.

    Raises:
        KeyError: If an example's ``labels`` lacks one of the four keys.
    """
    label_fields = ("Collector", "Location", "Taxon", "Date")

    def _labels_as_json(labels):
        # json.dumps escapes embedded quotes/backslashes, which hand-built
        # string formatting would leave as invalid JSON in the example.
        return json.dumps(
            {field: labels[field] for field in label_fields},
            separators=(",", ":"),
            ensure_ascii=False,
        )

    instructions = (
        "Follow the examples below. Your response should contain only JSON. If you\n"
        "encounter two dates in an input, prefer the earliest. If the answer is not\n"
        "exact, try your best, but do not use excess wording. If you are completely\n"
        "unsure or there is no answer, insert UNKNOWN."
    )

    sections = [instructions]
    for number, (example_text, labels) in enumerate(random_pairs, start=1):
        sections.append(f"Input {number}:\n{example_text}")
        sections.append(f"Output {number}:\n{_labels_as_json(labels)}")

    query_number = len(random_pairs) + 1
    sections.append(
        f"Input {query_number}:\n{input_text}\nOutput {query_number}:\n"
    )
    return "\n" + "\n\n".join(sections)
|
|
|
def _parse_label_json(text):
    """Parse ``text`` as JSON, tolerating code fences or prose around an object."""
    try:
        return json.loads(text)
    except ValueError:
        # Models often wrap the object in ```json fences or add a preamble;
        # retry on the outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])


def process_responses(responses):
    """Convert model responses into label dictionaries.

    Args:
        responses: Iterable of objects exposing a ``text`` attribute that
            should contain a JSON object.

    Returns:
        One dict per response, always containing the keys ``Collector``,
        ``Location``, ``Taxon`` and ``Date``. Keys missing from the parsed
        object are filled with ``"UNKNOWN"``; a response whose text cannot be
        read, is not valid JSON, or is not a JSON object yields an
        all-``"UNKNOWN"`` dict.
    """
    unknown = {
        "Collector": "UNKNOWN",
        "Location": "UNKNOWN",
        "Taxon": "UNKNOWN",
        "Date": "UNKNOWN",
    }

    structured_responses = []
    for response in responses:
        try:
            parsed_json = _parse_label_json(response.text)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError as well as a ``text``
            # accessor that raises because the response carries no usable
            # content; TypeError covers a non-string ``text``.
            parsed_json = None

        if isinstance(parsed_json, dict):
            structured_responses.append({**unknown, **parsed_json})
        else:
            structured_responses.append(dict(unknown))
    return structured_responses
|
|
|
# Few-shot examples for the label-extraction prompt: OCR/translated label text
# mapped to the expected structured answer. The arc-second marks inside the
# coordinates are double quotes and must stay escaped.
_FEW_SHOT_EXAMPLES = {
    "Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6663 NCIL 14 September 2017 N° 2581259 TIBET PE CHINESE NATIONAL HERBARIUM (PE) 02334125 #PE6663 COMPOSITAE Aster albescens (DC.) Hand.-Mazz. A: it (Guo-Jin ZHANG) 01 April 2018": {
        "Collector": "Guo-Jin, Zhang",
        "Location": "Xizang, Tibet, China, Lhoka City, Lhozhag County, Lhakang Town, near Kharchhu Gompa",
        "Taxon": "Aster albescens (DC.) Hand.-Mazz., Compositae (Asteraceae) family",
        "Date": "14 September 2017",
    },
    "PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang Spiral Leaf Green 17 May 2018": {
        "Collector": "UNKNOWN",
        "Location": "Xizang, Tibet, China, Lhoka City, Lhozhag County, Lhakang Town, near Kharchhu Gompa",
        "Taxon": "Spiral Leaf Green",
        "Date": "17 May 2018",
    },
    "Honey Plants Research Institute of the Chinese Academy of Agricultural Sciences Collection No.: 13687. May 7, 1993 Habitat Roadside Altitude: 1600 * Characters Shrub No. Herbarium of the Institute of Botany, Chinese Academy of Sciences Collector 3687 Scientific Name Height: m (cm) Diameter at breast height m (cm) Flower: White Fruit: Notes Blooming period: from January to July Honey: Scientific Name: Rosa Sericea Lindl. Appendix: Collector: cm 1 2 3 4 25 CHINESE NATIONAL HERBARUM ( 01833954 No 1479566 * Herbarium of the Institute of Botany, Chinese Academy of Sciences Sichuan SZECHUAN DET. Rosa sercea Lindl. var. Various Zhi 2009-02-16": {
        "Collector": "UNKNOWN",
        "Location": "Sichuan, China",
        "Taxon": "Rosa sericea Lindl., with possible variant identification as 'var. Various Zhi'",
        "Date": "7 May 1993",
    },
}


def process_images(uploaded_file):
    """Gradio handler: OCR, translate and label every JPEG in an uploaded ZIP.

    The module-level ``results_df`` is cleared at the start of each call and
    replaced with the new table once processing succeeds.

    Args:
        uploaded_file: The upload handed over by Gradio; either an object with
            a ``name`` attribute holding the file path, or the path itself.

    Returns:
        An HTML table with one row per image (file name, extracted text,
        translated text and the label fields returned by the model), or a
        short message when nothing was uploaded, the archive holds no JPEGs,
        or an error occurred.
    """
    global results_df
    results_df = results_df.iloc[0:0]

    if uploaded_file is None:
        return "No file uploaded."
    # Accept both file-like uploads (``.name``) and plain path strings.
    file_path = getattr(uploaded_file, "name", uploaded_file)

    try:
        image_files = unzip_and_find_jpgs(file_path)
        if not image_files:
            return "No JPG files found in the zip."

        # Collect rows first and build the frame once instead of
        # re-concatenating the whole table for every image.
        rows = []
        for image_path in image_files:
            extracted_text, translated_text = batch_process_documents(
                image_path, "image/jpeg"
            )
            rows.append({
                "Filename": os.path.basename(image_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })
        results_df = pd.DataFrame(
            rows, columns=["Filename", "Extracted Text", "Translated Text"]
        )

        # SECURITY: this key is committed in source control and should be
        # rotated; set GOOGLE_API_KEY in the environment to override it.
        genai.configure(
            api_key=os.environ.get(
                "GOOGLE_API_KEY", 'AIzaSyB9iHlqAgz5TEF36Kg_fJLJvoIDCJkqwJI'
            )
        )
        model = genai.GenerativeModel('gemini-pro')

        responses = []
        for input_text in results_df["Translated Text"]:
            random_pairs = get_random_pairs_list(_FEW_SHOT_EXAMPLES)
            prompt = construct_prompt(input_text, random_pairs)
            responses.append(model.generate_content(prompt))

        # Both frames carry a 0..n-1 index, so axis=1 lines rows up 1:1.
        label_columns = pd.DataFrame(process_responses(responses))
        results_df = pd.concat([results_df, label_columns], axis=1)

    except Exception as e:
        # The return value is rendered as HTML, and the message may echo
        # names taken from the uploaded archive, so escape it.
        return f"An error occurred: {html.escape(str(e))}"

    return results_df.to_html()
|
|
|
# Custom stylesheet for the Gradio page.
css = """
body { font-family: Arial, sans-serif; }
.input-container { width: 95%; margin: auto; }
.output-container { width: 95%; margin: auto; }
"""

# Single-function UI: a file upload in, rendered HTML out.
interface = gr.Interface(
    fn=process_images,
    inputs="file",
    outputs="html",
    title="Document AI Translation",
    description="Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image.",
    # The stylesheet above was defined but never handed to the interface.
    css=css,
)

if __name__ == "__main__":
    interface.launch()