import os

# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file before any Google clients are created
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

import io
import json
import random
import zipfile

import pandas as pd
import gradio as gr
import google.generativeai as genai
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
from google.cloud import translate_v2 as translate

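# Overall flow: unzip the uploaded images -> Document AI OCR -> Cloud Translation to English
# -> Gemini few-shot prompt to extract Collector / Location / Taxon / Date -> HTML results table.
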
# Global DataFrame declaration
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

# Set your Google Cloud Document AI processor details here
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"

def translate_text(text, target_language="en"):
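    """Translate `text` into `target_language` using the Cloud Translation API."""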
    translate_client = translate.Client()
    result = translate_client.translate(text, target_language=target_language)
    return result["translatedText"]

def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
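    """Send one file to the Document AI processor (a single synchronous request, despite the name)
    and return an (extracted_text, translated_text) tuple."""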
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    with open(file_path, "rb") as file_stream:
        raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)

    name = client.processor_path(project_id, location, processor_id)
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    result = client.process_document(request=request)

    extracted_text = result.document.text
    translated_text = translate_text(extracted_text)
    return extracted_text, translated_text

def unzip_and_find_jpgs(file_path):
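    """Extract the uploaded ZIP and return paths to every .jpg inside it, skipping macOS metadata folders."""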
    extract_path = "extracted_files"
    os.makedirs(extract_path, exist_ok=True)
    jpg_files = []
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
        for root, dirs, files in os.walk(extract_path):
            if '__MACOSX' in root:
                continue
            for file in files:
                if file.lower().endswith('.jpg'):
                    full_path = os.path.join(root, file)
                    jpg_files.append(full_path)
    return jpg_files

def get_random_pairs_list(shots, num_pairs=2):
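    """Pick `num_pairs` random (input, expected output) pairs from `shots` to use as few-shot examples."""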
    keys = random.sample(list(shots.keys()), num_pairs)
    return [(key, shots[key]) for key in keys]

def construct_prompt(input_text, random_pairs):
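    """Assemble the few-shot prompt: two worked examples followed by the new input text."""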
    # Note the f-string: the example pairs and the new input are interpolated into the template below
    prompt = f"""
    Follow the examples below. Your response should contain only JSON. If you
    encounter two dates in an input, prefer the earliest. If the answer is not
    exact, try your best, but do not use excess wording. If you are completely
    unsure or there is no answer, insert UNKNOWN.

    Input 1:
    {random_pairs[0][0]}

    Output 1:
    {{"Collector":"{random_pairs[0][1]['Collector']}","Location":"{random_pairs[0][1]['Location']}","Taxon":"{random_pairs[0][1]['Taxon']}","Date":"{random_pairs[0][1]['Date']}"}}

    Input 2:
    {random_pairs[1][0]}

    Output 2:
    {{"Collector":"{random_pairs[1][1]['Collector']}","Location":"{random_pairs[1][1]['Location']}","Taxon":"{random_pairs[1][1]['Taxon']}","Date":"{random_pairs[1][1]['Date']}"}}

    Input 3:
    {input_text}
    Output 3:
    """
    return prompt

def process_responses(responses):
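    """Parse each model response as JSON; fall back to all-UNKNOWN fields if parsing fails."""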
    structured_responses = []
    for response in responses:
        try:
            # Assuming response is a string of JSON data
            parsed_json = json.loads(response.text)
            structured_responses.append(parsed_json)
        except json.JSONDecodeError:
            structured_responses.append({
                "Collector": "UNKNOWN",
                "Location": "UNKNOWN",
                "Taxon": "UNKNOWN",
                "Date": "UNKNOWN"
            })
    return structured_responses

def process_images(uploaded_file):
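    """Gradio handler: unzip the upload, OCR and translate every JPG, then prompt Gemini to extract
    structured fields, and return the combined results as an HTML table (or an error message)."""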
    global results_df
    results_df = results_df.iloc[0:0]  # Clear any results left over from a previous run

    file_path = uploaded_file.name  # Gradio provides the file path through the .name attribute

    try:
        image_files = unzip_and_find_jpgs(file_path)

        if not image_files:
            return "No JPG files found in the zip."

        for file_path in image_files:
            extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
            new_row = pd.DataFrame([{
                "Filename": os.path.basename(file_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text
            }])
            results_df = pd.concat([results_df, new_row], ignore_index=True)

        # Configure the generative AI model; assumes the Gemini key is provided via the
        # GOOGLE_API_KEY environment variable rather than hard-coded in source
        genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
        model = genai.GenerativeModel('gemini-pro')

        # Prepare data for few-shot learning
        shots = {
            "Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6663 NCIL 14 September 2017 N° 2581259 TIBET PE CHINESE NATIONAL HERBARIUM (PE) 02334125 #PE6663 COMPOSITAE Aster albescens (DC.) Hand.-Mazz. A: it (Guo-Jin ZHANG) 01 April 2018"
            :{"Collector":"Guo-Jin, Zhang",
              "Location":"Xizang, Tibet, China, Lhoka City, Lhozhag County, Lhakang Town, near Kharchhu Gompa",
              "Taxon":"Aster albescens (DC.) Hand.-Mazz., Compositae (Asteraceae) family",
              "Date":"14 September 2017"
            },

            "PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15"N, 91°7'24.74"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15"N, 91°7'24.74"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang Spiral Leaf Green 17 May 2018"
            :{"Collector":"UNKNOWN",
              "Location":"Xizang, Tibet, China, Lhoka City, Lhozhag County, Lhakang Town, near Kharchhu Gompa",
              "Taxon":"Spiral Leaf Green",
              "Date":"17 May 2018"
            },

            "Honey Plants Research Institute of the Chinese Academy of Agricultural Sciences Collection No.: 13687. May 7, 1993 Habitat Roadside Altitude: 1600 * Characters Shrub No. Herbarium of the Institute of Botany, Chinese Academy of Sciences Collector 3687 Scientific Name Height: m (cm) Diameter at breast height m (cm) Flower: White Fruit: Notes Blooming period: from January to July Honey: Scientific Name: Rosa Sericea Lindl. Appendix: Collector: cm 1 2 3 4 25 CHINESE NATIONAL HERBARUM ( 01833954 No 1479566 * Herbarium of the Institute of Botany, Chinese Academy of Sciences Sichuan SZECHUAN DET. Rosa sercea Lindl. var. Various Zhi 2009-02-16"
            :{"Collector":"UNKNOWN",
              "Location":"Sichuan, China",
              "Taxon":"Rosa sericea Lindl., with possible variant identification as 'var. Various Zhi'",
              "Date":"7 May 1993",
            },
        }

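        # For each translated label, build a few-shot prompt from two random example pairs
        # and ask the model to return the structured fields as JSON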
        responses = []
        for input_text in results_df["Translated Text"]:
            random_pairs = get_random_pairs_list(shots)
            prompt = construct_prompt(input_text, random_pairs)
            response = model.generate_content(prompt)
            responses.append(response)

        # Processing responses
        json_responses = process_responses(responses)
        results_df = pd.concat([results_df, pd.DataFrame(json_responses)], axis=1)

    except Exception as e:
        return f"An error occurred: {str(e)}"

    return results_df.to_html()

css = """
    body { font-family: Arial, sans-serif; }
    .input-container { width: 95%; margin: auto; }
    .output-container { width: 95%; margin: auto; }
"""

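# Minimal Gradio front end: a ZIP upload in, the results table rendered as HTML out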
interface = gr.Interface(
    fn=process_images,
    inputs="file",
    outputs="html",
    title="Document AI Translation",
    description="Upload a ZIP file containing JPEG/JPG images; the system extracts and translates the text from each image and parses collector, location, taxon, and date fields.",
    css=css
)

if __name__ == "__main__":
    interface.launch(share=True)