phyloforfun committed
Commit ae215ea · 1 parent: 37a138a
Major update. Support for 15 LLMs, World Flora Online taxonomy validation, geolocation, 2 OCR methods, significant UI changes, stability improvements, consistent JSON parsing
Browse files
- app.py +82 -29
- run_VoucherVision.py +2 -2
- vouchervision/API_validation.py +5 -8
- vouchervision/LLM_GoogleGemini.py +23 -23
- vouchervision/LLM_GooglePalm2.py +10 -11
- vouchervision/LLM_MistralAI.py +10 -11
- vouchervision/LLM_OpenAI.py +14 -10
- vouchervision/LLM_local_MistralAI.py +10 -11
- vouchervision/LLM_local_cpu_MistralAI.py +10 -11
- vouchervision/OCR_Gemini.py +3 -3
- vouchervision/OCR_google_cloud_vision.py +48 -276
- vouchervision/OCR_llava.py +9 -9
- vouchervision/VoucherVision_Config_Builder.py +28 -7
- vouchervision/model_maps.py +1 -1
- vouchervision/tool_geolocate_HERE.py +321 -0
- vouchervision/tool_taxonomy_WFO.py +324 -0
- vouchervision/tool_wikipedia.py +51 -41
- vouchervision/utils_LLM.py +64 -0
- vouchervision/utils_VoucherVision.py +38 -25
app.py
CHANGED
@@ -7,6 +7,7 @@ import pandas as pd
 from io import BytesIO
 from streamlit_extras.let_it_rain import rain
 from annotated_text import annotated_text
+from transformers import AutoConfig
 
 from vouchervision.LeafMachine2_Config_Builder import write_config_file
 from vouchervision.VoucherVision_Config_Builder import build_VV_config, TestOptionsGPT, TestOptionsPalm, check_if_usable
@@ -999,7 +1000,8 @@ def create_private_file():
     st.write("API keys are stored in `../VoucherVision/PRIVATE_DATA.yaml`.")
     st.write("Deleting this file will allow you to reset API keys. Alternatively, you can edit the keys in the user interface or by manually editing the `.yaml` file in a text editor.")
     st.write("Leave keys blank if you do not intend to use that service.")
-
+    st.info("Note: You can manually edit these API keys later by opening the /PRIVATE_DATA.yaml file in a plain text editor.")
+
     st.write("---")
     st.subheader("Google Vision (*Required*) / Google PaLM 2 / Google Gemini")
     st.markdown("VoucherVision currently uses [Google Vision API](https://cloud.google.com/vision/docs/ocr) for OCR. Generating an API key for this is more involved than the others. [Please carefully follow the instructions outlined here to create and setup your account.](https://cloud.google.com/vision/docs/setup) ")
@@ -1008,46 +1010,46 @@ def create_private_file():
     with st.expander("**View Google API Instructions**"):
 
         blog_text_and_image(text="Select your project, then in the search bar, search for `vertex ai` and select the option in the photo below.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_00.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_00.PNG'))
 
         blog_text_and_image(text="On the main overview page, click `Enable All Recommended APIs`. Sometimes this button may be hidden. In that case, enable all of the suggested APIs listed on this page.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_0.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_0.PNG'))
 
         blog_text_and_image(text="Sometimes this button may be hidden. In that case, enable all of the suggested APIs listed on this page.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_2.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_2.PNG'))
 
         blog_text_and_image(text="Make sure that all APIs are enabled.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_1.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_1.PNG'))
 
         blog_text_and_image(text="Find the `Vision AI API` service and go to its page.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_3.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_3.PNG'))
 
         blog_text_and_image(text="Find the `Vision AI API` service and go to its page. This is the API service required to use OCR in VoucherVision and must be enabled.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_6.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_6.PNG'))
 
         blog_text_and_image(text="You can also search for the Vertex AI Vision service.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_4.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_4.PNG'))
 
         blog_text_and_image(text=None,
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_5.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_5.PNG'))
 
         st.subheader("Getting a Google JSON authentication key")
         st.write("Google uses a JSON file to store additional authentication information. Save this file in a safe, private location and assign the `GOOGLE_APPLICATION_CREDENTIALS` value to the file path. For Hugging Face, copy the contents of the JSON file including the `\{\}` and paste it as the secret value.")
         st.write("To download your JSON key...")
         blog_text_and_image(text="Open the navigation menu. Click on the hamburger menu (three horizontal lines) in the top left corner. Go to IAM & Admin. ",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_7.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_7.PNG'),width=300)
 
         blog_text_and_image(text="In the navigation pane, hover over `IAM & Admin` and then click on `Service accounts`.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_8.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_8.PNG'))
 
         blog_text_and_image(text="Find the default Compute Engine service account, select it.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_9.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_9.PNG'))
 
         blog_text_and_image(text="Click `Add Key`.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_10.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_10.PNG'))
 
         blog_text_and_image(text="Select `JSON` and click create. This will download your key. Store this in a safe location. The file path to this safe location is the value that you enter into the `GOOGLE_APPLICATION_CREDENTIALS` value.",
-            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_11.
+            fullpath=os.path.join(st.session_state.dir_home, 'demo','google','google_api_11.PNG'))
 
         blog_text(text_bold="Store Safely", text=": This file contains sensitive data that can be used to authenticate and bill your Google Cloud account. Never commit it to public repositories or expose it in any way. Always keep it safe and secure.")
 
@@ -1135,21 +1137,24 @@ def create_private_file():
     st.write("---")
     st.subheader("HERE Geocoding")
     st.markdown('Follow these [instructions](https://platform.here.com/sign-up?step=verify-identity) to generate an API key for HERE.')
-
+    here_APP_ID = st.text_input("HERE Geocoding App ID", cfg_private['here'].get('APP_ID', ''),
                                 help='e.g. a 32-character string',
                                 placeholder='e.g. SATgthsykuE64FgrrrrEervr3S4455t_geyDeGq',
                                 type='password')
-
+    here_API_KEY = st.text_input("HERE Geocoding API Key", cfg_private['here'].get('API_KEY', ''),
                                 help='e.g. a 32-character string',
                                 placeholder='e.g. SATgthsykuE64FgrrrrEervr3S4455t_geyDeGq',
                                 type='password')
 
 
 
-    st.button("Set API Keys",type='primary', on_click=save_changes_to_API_keys,
-
-
-
+    st.button("Set API Keys",type='primary', on_click=save_changes_to_API_keys,
+              args=[cfg_private,
+                    openai_api_key,
+                    azure_openai_api_version, azure_openai_api_key, azure_openai_api_base, azure_openai_organization, azure_openai_api_type,
+                    google_application_credentials, google_project_location, google_project_id,
+                    mistral_API_KEY,
+                    here_APP_ID, here_API_KEY])
     if st.button('Proceed to VoucherVision'):
         st.session_state.private_file = does_private_file_exist()
         st.session_state.proceed_to_private = False
@@ -1157,10 +1162,12 @@ def create_private_file():
         st.rerun()
 
 
-def save_changes_to_API_keys(cfg_private,
-
-
-
+def save_changes_to_API_keys(cfg_private,
+                             openai_api_key,
+                             azure_openai_api_version, azure_openai_api_key, azure_openai_api_base, azure_openai_organization, azure_openai_api_type,
+                             google_application_credentials, google_project_location, google_project_id,
+                             mistral_API_KEY,
+                             here_APP_ID, here_API_KEY):
 
     # Update the configuration dictionary with the new values
     cfg_private['openai']['OPENAI_API_KEY'] = openai_api_key
@@ -1172,15 +1179,16 @@ def save_changes_to_API_keys(cfg_private,openai_api_key,azure_openai_api_version
     cfg_private['openai_azure']['OPENAI_API_TYPE'] = azure_openai_api_type
 
     cfg_private['google']['GOOGLE_APPLICATION_CREDENTIALS'] = google_application_credentials
-    cfg_private['google']['GOOGLE_PROJECT_ID'] =
-    cfg_private['google']['GOOGLE_LOCATION'] =
+    cfg_private['google']['GOOGLE_PROJECT_ID'] = google_project_id
+    cfg_private['google']['GOOGLE_LOCATION'] = google_project_location
 
     cfg_private['mistral']['MISTRAL_API_KEY'] = mistral_API_KEY
 
-    cfg_private['here']['APP_ID'] =
-    cfg_private['here']['API_KEY'] =
+    cfg_private['here']['APP_ID'] = here_APP_ID
+    cfg_private['here']['API_KEY'] = here_API_KEY
     # Call the function to write the updated configuration to the YAML file
     write_config_file(cfg_private, st.session_state.dir_home, filename="PRIVATE_DATA.yaml")
+    st.success(f"API Keys saved to {os.path.join(st.session_state.dir_home, 'PRIVATE_DATA.yaml')}")
     # st.session_state.private_file = does_private_file_exist()
 
 # Function to load a YAML file and update session_state
@@ -1568,6 +1576,25 @@ def content_project_settings(col):
     st.session_state.config['leafmachine']['project']['dir_output'] = st.text_input("Output directory", st.session_state.config['leafmachine']['project'].get('dir_output', ''))
 
 
+def content_tools():
+    st.write("---")
+    st.header('Validation Tools')
+
+    tool_WFO = st.session_state.config['leafmachine']['project']['tool_WFO']
+    st.session_state.config['leafmachine']['project']['tool_WFO'] = st.checkbox(label="Enable World Flora Online taxonomy verification",
+                                                                                help="",
+                                                                                value=tool_WFO)
+
+    tool_GEO = st.session_state.config['leafmachine']['project']['tool_GEO']
+    st.session_state.config['leafmachine']['project']['tool_GEO'] = st.checkbox(label="Enable HERE geolocation hints",
+                                                                                help="",
+                                                                                value=tool_GEO)
+
+    tool_wikipedia = st.session_state.config['leafmachine']['project']['tool_wikipedia']
+    st.session_state.config['leafmachine']['project']['tool_wikipedia'] = st.checkbox(label="Enable Wikipedia verification",
+                                                                                      help="",
+                                                                                      value=tool_wikipedia)
+
 def content_llm_cost():
     st.write("---")
     st.header('LLM Cost Calculator')
@@ -1855,6 +1882,17 @@ def content_ocr_method():
     do_use_trOCR = st.checkbox("Enable trOCR", value=st.session_state.config['leafmachine']['project']['do_use_trOCR'],key="Enable trOCR2")#,disabled=st.session_state['lacks_GPU'])
     st.session_state.config['leafmachine']['project']['do_use_trOCR'] = do_use_trOCR
 
+    if do_use_trOCR:
+        # st.session_state.config['leafmachine']['project']['trOCR_model_path'] = "microsoft/trocr-large-handwritten"
+        default_trOCR_model_path = st.session_state.config['leafmachine']['project']['trOCR_model_path']
+        user_input_trOCR_model_path = st.text_input("trOCR Hugging Face model path. MUST be a fine-tuned version of 'microsoft/trocr-base-handwritten' or 'microsoft/trocr-large-handwritten', or a microsoft trOCR model.", value=default_trOCR_model_path)
+        if st.session_state.config['leafmachine']['project']['trOCR_model_path'] != user_input_trOCR_model_path:
+            is_valid_mp = is_valid_huggingface_model_path(user_input_trOCR_model_path)
+            if not is_valid_mp:
+                st.error(f"The Hugging Face model path {user_input_trOCR_model_path} is not valid. Please revise.")
+            else:
+                st.session_state.config['leafmachine']['project']['trOCR_model_path'] = user_input_trOCR_model_path
+
     if 'LLaVA' in selected_OCR_options:
         OCR_option_llava = st.radio(
             "Select the LLaVA version",
@@ -1888,6 +1926,15 @@ def content_ocr_method():
     # elif (OCR_option == 'hand') and do_use_trOCR:
    #     st.text_area(label='Handwritten/Printed + trOCR',placeholder=demo_text_trh,disabled=True, label_visibility='visible', height=150)
 
+def is_valid_huggingface_model_path(model_path):
+    try:
+        # Attempt to load the model configuration from Hugging Face Model Hub
+        config = AutoConfig.from_pretrained(model_path)
+        return True # If the configuration loads successfully, the model path is valid
+    except Exception as e:
+        # If loading the model configuration fails, the model path is not valid
+        return False
+
 @st.cache_data
 def show_collage():
     # Load the image only if it's not already in the session state
@@ -1920,7 +1967,12 @@ def content_collage_overlay():
     st.info("NOTE: We strongly recommend enabling LeafMachine2 cropping if your images are full sized herbarium sheet. Often, the OCR algorithm struggles with full sheets, but works well with the collage images. We have disabled the collage by default for this Hugging Face Space because the Space lacks a GPU and the collage creation takes a bit longer.")
     default_crops = st.session_state.config['leafmachine']['cropped_components']['save_cropped_annotations']
     st.markdown("Prior to transcription, use LeafMachine2 to crop all labels from input images to create label collages for each specimen image. Showing just the text labels to the OCR algorithms significantly improves performance. This runs slowly on the free Hugging Face Space, but runs quickly with a fast CPU or any GPU.")
-    st.
+    st.markdown("Images that are mostly text (like a scanned notecard, or already cropped images) do not require LM2 collage.")
+
+    if st.session_state.is_hf:
+        st.session_state.config['leafmachine']['use_RGB_label_images'] = st.checkbox(":rainbow[Use LeafMachine2 label collage for transcriptions]", st.session_state.config['leafmachine'].get('use_RGB_label_images', False), key='do make collage hf')
+    else:
+        st.session_state.config['leafmachine']['use_RGB_label_images'] = st.checkbox(":rainbow[Use LeafMachine2 label collage for transcriptions]", st.session_state.config['leafmachine'].get('use_RGB_label_images', True), key='do make collage local')
 
 
     option_selected_crops = st.multiselect(label="Components to crop",
@@ -2247,6 +2299,7 @@ def main():
         content_ocr_method()
 
         content_collage_overlay()
+        content_tools()
         content_llm_cost()
         content_processing_options()
         content_less_used()
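The new is_valid_huggingface_model_path check leans on transformers.AutoConfig, which fetches the model's config.json from the Hugging Face Hub and raises for repo ids it cannot resolve. Note that this is a network call, so a dropped connection can make a valid path look invalid. A minimal standalone sketch of the same idea (the second repo id below is deliberately fake, for illustration only):

from transformers import AutoConfig

def is_valid_huggingface_model_path(model_path):
    # AutoConfig.from_pretrained raises (e.g. OSError) for unknown repo ids.
    try:
        AutoConfig.from_pretrained(model_path)
        return True
    except Exception:
        return False

print(is_valid_huggingface_model_path('microsoft/trocr-base-handwritten'))  # True
print(is_valid_huggingface_model_path('definitely/not-a-real-repo'))        # False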
run_VoucherVision.py
CHANGED
@@ -31,7 +31,7 @@ def resolve_path(path):
 if __name__ == "__main__":
     dir_home = os.path.dirname(__file__)
 
-    start_port =
+    start_port = 8528
     try:
         free_port = find_available_port(start_port)
         sys.argv = [
@@ -41,7 +41,7 @@ if __name__ == "__main__":
             # resolve_path(os.path.join(dir_home,"vouchervision", "VoucherVision_GUI.py")),
             "--global.developmentMode=false",
             # "--server.port=8545",
-            "--server.port=
+            f"--server.port={free_port}",
             # Toggle below for HF vs Local
             # "--is_hf=1",
             # "--is_hf=0",
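find_available_port is called above but not defined in the hunks shown here. A plausible sketch of such a helper, assuming it simply probes ports upward from start_port (this is an illustration, not the repository's actual implementation):

import socket

def find_available_port(start_port, max_tries=100):
    # Return the first port >= start_port that can be bound on localhost.
    for port in range(start_port, start_port + max_tries):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(('127.0.0.1', port))
                return port
            except OSError:
                continue  # port already in use; try the next one
    raise RuntimeError(f'No free port found starting at {start_port}')

With this pattern, hard-coding start_port = 8528 only sets the preferred port; if it is taken, Streamlit is launched on the next free one via f"--server.port={free_port}".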
vouchervision/API_validation.py
CHANGED
@@ -36,10 +36,11 @@ class APIvalidation:
 
 
     def has_API_key(self, val):
-        if val:
-            return True
-        else:
-            return False
+        return isinstance(val, str) and bool(val.strip())
+        # if val:
+        #     return True
+        # else:
+        #     return False
 
     def check_openai_api_key(self):
         if self.is_hf:
@@ -192,10 +193,6 @@ class APIvalidation:
             print(f"palm2 fail2")
 
         try:
-            # https://python.langchain.com/docs/integrations/llms/google_vertex_ai_palm
-            # os.environ['GOOGLE_API_KEY'] = "AIzaSyAHOH1w1qV7C3jS4W7QFyoaTGUwZIgS5ig"
-            # genai.configure(api_key='AIzaSyC8xvu6t9fb5dTah3hpgg_rwwR5G5kianI')
-            # model = ChatGoogleGenerativeAI(model="text-bison@001")
             model = VertexAI(model="text-bison@001", max_output_tokens=10)
             response = model.predict("Hello")
             test_response_palm2 = response
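The rewritten has_API_key is stricter than the truthiness test it replaces (left commented out above): whitespace-only strings and non-string values no longer count as keys. A quick comparison, using a standalone version of the same check:

def has_API_key(val):
    return isinstance(val, str) and bool(val.strip())

has_API_key('sk-abc123')  # True
has_API_key('   ')        # False -- the old `if val:` check accepted whitespace-only strings
has_API_key(1234)         # False -- the old check accepted any truthy value
has_API_key(None)         # False under both versions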
vouchervision/LLM_GoogleGemini.py
CHANGED
@@ -6,14 +6,11 @@ from langchain.output_parsers import RetryWithErrorOutputParser
 # from langchain.schema import HumanMessage
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
-
+from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_google_vertexai import VertexAI
 
-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
+from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
-from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
-from vouchervision.utils_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import WikipediaLinks
 
 class GoogleGeminiHandler:
 
@@ -23,7 +20,12 @@ class GoogleGeminiHandler:
     VENDOR = 'google'
     STARTING_TEMP = 0.5
 
-    def __init__(self, logger, model_name, JSON_dict_structure):
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
+        self.cfg = cfg
+        self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
+        self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
+        self.tool_wikipedia = self.cfg['leafmachine']['project']['tool_wikipedia']
+
         self.logger = logger
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
@@ -76,13 +78,13 @@ class GoogleGeminiHandler:
 
     def _build_model_chain_parser(self):
         # Instantiate the LLM class for Google Gemini
-
-
-
-        self.llm_model = VertexAI(model='gemini-pro',
-
-
-
+        self.llm_model = ChatGoogleGenerativeAI(model=self.model_name)#,
+        #                                       max_output_tokens=self.config.get('max_output_tokens'),
+        #                                       top_p=self.config.get('top_p'))
+        # self.llm_model = VertexAI(model='gemini-1.0-pro',
+        #                           max_output_tokens=self.config.get('max_output_tokens'),
+        #                           top_p=self.config.get('top_p'))
+
         # Set up the retry parser with the runnable
         self.retry_parser = RetryWithErrorOutputParser.from_llm(parser=self.parser, llm=self.llm_model, max_retries=self.MAX_RETRIES)
         # Prepare the chain
@@ -90,10 +92,10 @@ class GoogleGeminiHandler:
 
     # Define a function to format the input for Google Gemini call
     def call_google_gemini(self, prompt_text):
-        model = GenerativeModel(self.model_name)
-
-
-
+        model = GenerativeModel(self.model_name)#,
+        #                       generation_config=self.config,
+        #                       safety_settings=self.safety_settings)
+        response = model.generate_content(prompt_text.text)
         return response.text
 
     def call_llm_api_GoogleGemini(self, prompt_template, json_report, paths):
@@ -130,13 +132,9 @@ class GoogleGeminiHandler:
         self.monitor.stop_inference_timer() # Starts tool timer too
 
         json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
-        output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False)
-        output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable
-
-        Wiki = WikipediaLinks(json_file_path_wiki)
-        Wiki.gather_wikipedia_results(output)
+        output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
 
-        save_individual_prompt(Wiki.sanitize(prompt_template), txt_file_path_ind_prompt)
+        save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
 
         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
 
@@ -156,6 +154,8 @@ class GoogleGeminiHandler:
 
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
         self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
+
+        self.monitor.stop_inference_timer() # Starts tool timer too
 
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
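Across all of the LLM handlers in this commit, the repeated inline WFO / HERE / Wikipedia blocks are collapsed into a single run_tools call gated by the new per-project flags. vouchervision/utils_LLM.py (+64 lines) is not expanded in this view, so the following is only a plausible reconstruction, inferred from the call signature, the return tuple, and the commented-out per-tool calls left behind in LLM_OpenAI.py; the import paths assume the tool_* modules added by this commit and are not confirmed by the diff:

# Hypothetical sketch of run_tools from vouchervision/utils_LLM.py (not shown in this diff).
from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO
from vouchervision.tool_geolocate_HERE import validate_coordinates_here
from vouchervision.tool_wikipedia import WikipediaLinks

def run_tools(output, tool_WFO, tool_GEO, tool_wikipedia, json_file_path_wiki):
    # Each validator receives its enable flag; a disabled tool is expected to no-op
    # and hand back the record unchanged.
    output_WFO, WFO_record = validate_taxonomy_WFO(tool_WFO, output, replace_if_success_wfo=False)
    output_GEO, GEO_record = validate_coordinates_here(tool_GEO, output, replace_if_success_geo=False)
    if tool_wikipedia:
        WikipediaLinks(json_file_path_wiki).gather_wikipedia_results(output)
    return output_WFO, WFO_record, output_GEO, GEO_record

Centralizing the tool calls this way means the six handler classes stay in sync: enabling or disabling a validation tool is now a config change rather than an edit in every LLM_*.py file.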
vouchervision/LLM_GooglePalm2.py
CHANGED
@@ -11,11 +11,8 @@ from langchain_core.output_parsers import JsonOutputParser
 # from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_google_vertexai import VertexAI
 
-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
+from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
-from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
-from vouchervision.utils_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import WikipediaLinks
 
 #https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk
 #pip install --upgrade google-cloud-aiplatform
@@ -34,7 +31,12 @@ class GooglePalm2Handler:
     VENDOR = 'google'
     STARTING_TEMP = 0.5
 
-    def __init__(self, logger, model_name, JSON_dict_structure):
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
+        self.cfg = cfg
+        self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
+        self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
+        self.tool_wikipedia = self.cfg['leafmachine']['project']['tool_wikipedia']
+
         self.logger = logger
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
@@ -144,13 +146,9 @@ class GooglePalm2Handler:
         self.monitor.stop_inference_timer() # Starts tool timer too
 
         json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
-        output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False)
-        output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable
-
-        Wiki = WikipediaLinks(json_file_path_wiki)
-        Wiki.gather_wikipedia_results(output)
+        output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
 
-        save_individual_prompt(
+        save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
 
         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
 
@@ -171,6 +169,7 @@ class GooglePalm2Handler:
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
         self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
 
+        self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
 
vouchervision/LLM_MistralAI.py
CHANGED
@@ -4,11 +4,8 @@ from langchain.output_parsers import RetryWithErrorOutputParser
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 
-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
+from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
-from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
-from vouchervision.utils_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import WikipediaLinks
 
 
 class MistralHandler:
@@ -19,7 +16,12 @@ class MistralHandler:
     VENDOR = 'mistral'
     RANDOM_SEED = 2023
 
-    def __init__(self, logger, model_name, JSON_dict_structure):
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
+        self.cfg = cfg
+        self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
+        self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
+        self.tool_wikipedia = self.cfg['leafmachine']['project']['tool_wikipedia']
+
         self.logger = logger
         self.monitor = SystemLoadMonitor(logger)
         self.has_GPU = torch.cuda.is_available()
@@ -115,13 +117,9 @@ class MistralHandler:
         self.monitor.stop_inference_timer() # Starts tool timer too
 
         json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
-        output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False)
-        output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable
-
-        Wiki = WikipediaLinks(json_file_path_wiki)
-        Wiki.gather_wikipedia_results(output)
+        output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
 
-        save_individual_prompt(
+        save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
 
         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
 
@@ -142,6 +140,7 @@ class MistralHandler:
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
         self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
 
+        self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
         json_report.set_text(text_main=f'LLM call failed')
vouchervision/LLM_OpenAI.py
CHANGED
@@ -5,11 +5,8 @@ from langchain.schema import HumanMessage
 from langchain_core.output_parsers import JsonOutputParser
 from langchain.output_parsers import RetryWithErrorOutputParser
 
-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
+from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
-from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
-from vouchervision.utils_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import WikipediaLinks
 
 class OpenAIHandler:
     RETRY_DELAY = 10 # Wait 10 seconds before retrying
@@ -18,7 +15,12 @@ class OpenAIHandler:
     TOKENIZER_NAME = 'gpt-4'
     VENDOR = 'openai'
 
-    def __init__(self, logger, model_name, JSON_dict_structure, is_azure, llm_object):
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure, is_azure, llm_object):
+        self.cfg = cfg
+        self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
+        self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
+        self.tool_wikipedia = self.cfg['leafmachine']['project']['tool_wikipedia']
+
         self.logger = logger
         self.model_name = model_name
         self.JSON_dict_structure = JSON_dict_structure
@@ -135,13 +137,14 @@ class OpenAIHandler:
         self.monitor.stop_inference_timer() # Starts tool timer too
 
         json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
-        output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False)
-        output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False)
+
+        output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
 
-        Wiki = WikipediaLinks(json_file_path_wiki)
-        Wiki.gather_wikipedia_results(output)
+        # output1, WFO_record = validate_taxonomy_WFO(self.tool_WFO, output, replace_if_success_wfo=False)
+        # output2, GEO_record = validate_coordinates_here(self.tool_GEO, output, replace_if_success_geo=False)
+        # validate_wikipedia(self.tool_wikipedia, json_file_path_wiki, output)
 
-        save_individual_prompt(
+        save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
 
         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
 
@@ -162,6 +165,7 @@ class OpenAIHandler:
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
         self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
 
+        self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
 
vouchervision/LLM_local_MistralAI.py
CHANGED
@@ -6,11 +6,8 @@ from langchain_core.output_parsers import JsonOutputParser
 from huggingface_hub import hf_hub_download
 from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
 
-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
+from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
-from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
-from vouchervision.utils_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import WikipediaLinks
 
 '''
 Local Pipielines:
@@ -25,7 +22,12 @@ class LocalMistralHandler:
     VENDOR = 'mistral'
     MAX_GPU_MONITORING_INTERVAL = 2 # seconds
 
-    def __init__(self, logger, model_name, JSON_dict_structure):
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
+        self.cfg = cfg
+        self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
+        self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
+        self.tool_wikipedia = self.cfg['leafmachine']['project']['tool_wikipedia']
+
         self.logger = logger
         self.has_GPU = torch.cuda.is_available()
         self.monitor = SystemLoadMonitor(logger)
@@ -188,13 +190,9 @@ class LocalMistralHandler:
         self.monitor.stop_inference_timer() # Starts tool timer too
 
         json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
-        output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False)
-        output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable
-
-        Wiki = WikipediaLinks(json_file_path_wiki)
-        Wiki.gather_wikipedia_results(output)
+        output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
 
-        save_individual_prompt(
+        save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
 
         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
 
@@ -214,6 +212,7 @@ class LocalMistralHandler:
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
         self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
 
+        self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         json_report.set_text(text_main=f'LLM call failed')
 
vouchervision/LLM_local_cpu_MistralAI.py
CHANGED
@@ -18,11 +18,8 @@ from langchain.callbacks.base import BaseCallbackHandler
 from huggingface_hub import hf_hub_download
 
 
-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
+from vouchervision.utils_LLM import SystemLoadMonitor, run_tools, count_tokens, save_individual_prompt, sanitize_prompt
 from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
-from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
-from vouchervision.utils_geolocate_HERE import validate_coordinates_here
-from vouchervision.tool_wikipedia import WikipediaLinks
 
 class LocalCPUMistralHandler:
     RETRY_DELAY = 2 # Wait 2 seconds before retrying
@@ -33,7 +30,12 @@ class LocalCPUMistralHandler:
     SEED = 2023
 
 
-    def __init__(self, logger, model_name, JSON_dict_structure):
+    def __init__(self, cfg, logger, model_name, JSON_dict_structure):
+        self.cfg = cfg
+        self.tool_WFO = self.cfg['leafmachine']['project']['tool_WFO']
+        self.tool_GEO = self.cfg['leafmachine']['project']['tool_GEO']
+        self.tool_wikipedia = self.cfg['leafmachine']['project']['tool_wikipedia']
+
         self.logger = logger
         self.monitor = SystemLoadMonitor(logger)
         self.has_GPU = torch.cuda.is_available()
@@ -179,13 +181,9 @@ class LocalCPUMistralHandler:
         self.monitor.stop_inference_timer() # Starts tool timer too
 
         json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
-        output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False)
-        output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable
-
-        Wiki = WikipediaLinks(json_file_path_wiki)
-        Wiki.gather_wikipedia_results(output)
+        output_WFO, WFO_record, output_GEO, GEO_record = run_tools(output, self.tool_WFO, self.tool_GEO, self.tool_wikipedia, json_file_path_wiki)
 
-        save_individual_prompt(
+        save_individual_prompt(sanitize_prompt(prompt_template), txt_file_path_ind_prompt)
 
         self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
 
@@ -204,6 +202,7 @@ class LocalCPUMistralHandler:
         self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
         self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')
 
+        self.monitor.stop_inference_timer() # Starts tool timer too
         usage_report = self.monitor.stop_monitoring_report_usage()
         self._reset_config()
 
vouchervision/OCR_Gemini.py
CHANGED
@@ -145,16 +145,16 @@ maximumElevationInMeters
 }
 """
 def _get_google_credentials():
-    with open('
+    with open('', 'r') as file:
         data = json.load(file)
     creds_json_str = json.dumps(data)
     credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
     os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = creds_json_str
-    os.environ['GOOGLE_API_KEY'] = '
+    os.environ['GOOGLE_API_KEY'] = ''
     return credentials
 
 if __name__ == '__main__':
-    vertexai.init(project='
+    vertexai.init(project='', location='', credentials=_get_google_credentials())
 
     logger = logging.getLogger('LLaVA')
     logger.setLevel(logging.DEBUG)
vouchervision/OCR_google_cloud_vision.py
CHANGED
@@ -10,14 +10,6 @@ from google.oauth2 import service_account
 
 ### LLaVA should only be installed if the user will actually use it.
 ### It requires the most recent pytorch/Python and can mess with older systems
-try:
-    from craft_text_detector import read_image, load_craftnet_model, load_refinenet_model, get_prediction, export_detected_regions, export_extra_results, empty_cuda_cache
-except:
-    pass
-try:
-    from OCR_llava import OCRllava
-except:
-    pass
 
 
 '''
@@ -92,9 +84,7 @@ class OCREngine:
 
         self.multimodal_prompt = """I need you to transcribe all of the text in this image.
         Place the transcribed text into a JSON dictionary with this form {"Transcription_Printed_Text": "text","Transcription_Handwritten_Text": "text"}"""
-
-        if 'LLaVA' in self.OCR_option:
-            self.init_llava()
+        self.init_llava()
 
 
     def set_client(self):
@@ -113,6 +103,8 @@ class OCREngine:
 
     def init_craft(self):
        if 'CRAFT' in self.OCR_option:
+            from craft_text_detector import load_craftnet_model, load_refinenet_model
+
            try:
                self.refine_net = load_refinenet_model(cuda=True)
                self.use_cuda = True
@@ -126,21 +118,23 @@ class OCREngine:
             self.craft_net = load_craftnet_model(weight_path=os.path.join(self.dir_home,'vouchervision','craft','craft_mlt_25k.pth'), cuda=False)
 
     def init_llava(self):
+        if 'LLaVA' in self.OCR_option:
+            from vouchervision.OCR_llava import OCRllava
+
+            self.model_path = "liuhaotian/" + self.cfg['leafmachine']['project']['OCR_option_llava']
+            self.model_quant = self.cfg['leafmachine']['project']['OCR_option_llava_bit']
+
+            self.json_report.set_text(text_main=f'Loading LLaVA model: {self.model_path} Quantization: {self.model_quant}')
+
+            if self.model_quant == '4bit':
+                use_4bit = True
+            elif self.model_quant == 'full':
+                use_4bit = False
+            else:
+                self.logger.info(f"Provided model quantization invlid. Using 4bit.")
+                use_4bit = True
+
+            self.Llava = OCRllava(self.logger, model_path=self.model_path, load_in_4bit=use_4bit, load_in_8bit=False)
 
     def init_gemini_vision(self):
         pass
@@ -150,6 +144,8 @@ class OCREngine:
 
 
     def detect_text_craft(self):
+        from craft_text_detector import read_image, get_prediction
+
         # Perform prediction using CRAFT
         image = read_image(self.path)
 
@@ -250,13 +246,13 @@ class OCREngine:
         if not do_use_trOCR:
             if 'normal' in self.OCR_option:
                 self.OCR_JSON_to_file['OCR_printed'] = self.normal_organized_text
-                logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}")
+                # logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}")
                 # ocr_parts = ocr_parts + f"Google_OCR_Standard:\n{self.normal_organized_text}"
                 ocr_parts = self.normal_organized_text
 
             if 'hand' in self.OCR_option:
                 self.OCR_JSON_to_file['OCR_handwritten'] = self.hand_organized_text
-                logger.info(f"Google_OCR_Handwriting:\n{self.hand_organized_text}")
+                # logger.info(f"Google_OCR_Handwriting:\n{self.hand_organized_text}")
                 # ocr_parts = ocr_parts + f"Google_OCR_Handwriting:\n{self.hand_organized_text}"
                 ocr_parts = self.hand_organized_text
 
@@ -340,13 +336,13 @@ class OCREngine:
         if 'normal' in self.OCR_option:
             self.OCR_JSON_to_file['OCR_printed'] = self.normal_organized_text
             self.OCR_JSON_to_file['OCR_trOCR'] = self.trOCR_texts
-            logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}\n\ntrOCR:\n{self.trOCR_texts}")
+            # logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}\n\ntrOCR:\n{self.trOCR_texts}")
             # ocr_parts = ocr_parts + f"\nGoogle_OCR_Standard:\n{self.normal_organized_text}\n\ntrOCR:\n{self.trOCR_texts}"
             ocr_parts = self.trOCR_texts
         if 'hand' in self.OCR_option:
             self.OCR_JSON_to_file['OCR_handwritten'] = self.hand_organized_text
             self.OCR_JSON_to_file['OCR_trOCR'] = self.trOCR_texts
-            logger.info(f"Google_OCR_Handwriting:\n{self.hand_organized_text}\n\ntrOCR:\n{self.trOCR_texts}")
+            # logger.info(f"Google_OCR_Handwriting:\n{self.hand_organized_text}\n\ntrOCR:\n{self.trOCR_texts}")
             # ocr_parts = ocr_parts + f"\nGoogle_OCR_Handwriting:\n{self.hand_organized_text}\n\ntrOCR:\n{self.trOCR_texts}"
             ocr_parts = self.trOCR_texts
         # if self.OCR_option in ['both',]:
@@ -358,7 +354,7 @@ class OCREngine:
         if 'CRAFT' in self.OCR_option:
             # self.OCR_JSON_to_file['OCR_printed'] = self.normal_organized_text
             self.OCR_JSON_to_file['OCR_CRAFT_trOCR'] = self.trOCR_texts
-            logger.info(f"CRAFT_trOCR:\n{self.trOCR_texts}")
+            # logger.info(f"CRAFT_trOCR:\n{self.trOCR_texts}")
             # ocr_parts = ocr_parts + f"\nCRAFT_trOCR:\n{self.trOCR_texts}"
             ocr_parts = self.trOCR_texts
         return ocr_parts
@@ -383,7 +379,10 @@ class OCREngine:
 
         for bound, confidence, char_height, character in zip(bounds_flat, confidences, heights, characters):
             font_size = int(char_height)
-            font = ImageFont.load_default().font_variant(size=font_size)
+            try:
+                font = ImageFont.truetype("arial.ttf", font_size)
+            except:
+                font = ImageFont.load_default().font_variant(size=font_size)
             if option == 'trOCR':
                 color = (0, 170, 255)
             else:
@@ -686,7 +685,7 @@ class OCREngine:
                 self.OCR = self.OCR + part_OCR + part_OCR
             else:
                 self.OCR = self.OCR + "\CRAFT trOCR:\n" + self.detect_text_with_trOCR_using_google_bboxes(self.do_use_trOCR, logger)
-            logger.info(f"CRAFT trOCR:\n{self.OCR}")
+            # logger.info(f"CRAFT trOCR:\n{self.OCR}")
 
         if 'LLaVA' in self.OCR_option: # This option does not produce an OCR helper image
             self.json_report.set_text(text_main=f'Working on LLaVA {self.Llava.model_path} transcription :construction:')
@@ -704,25 +703,34 @@ class OCREngine:
                 self.OCR = self.OCR + f"\nLLaVA OCR:\n{str_output}" + f"\nLLaVA OCR:\n{str_output}"
             else:
                 self.OCR = self.OCR + f"\nLLaVA OCR:\n{str_output}"
-            logger.info(f"LLaVA OCR:\n{self.OCR}")
+            # logger.info(f"LLaVA OCR:\n{self.OCR}")
 
         if 'normal' in self.OCR_option or 'hand' in self.OCR_option:
             if 'normal' in self.OCR_option:
-                self.OCR = self.OCR + "\nGoogle Printed OCR:\n" + self.detect_text()
+                if self.double_OCR:
+                    part_OCR = self.OCR + "\nGoogle Printed OCR:\n" + self.detect_text()
+                    self.OCR = self.OCR + part_OCR + part_OCR
+                else:
+                    self.OCR = self.OCR + "\nGoogle Printed OCR:\n" + self.detect_text()
             if 'hand' in self.OCR_option:
-                self.OCR = self.OCR + "\nGoogle Handwritten OCR:\n" + self.detect_handwritten_ocr()
+                if self.double_OCR:
+                    part_OCR = self.OCR + "\nGoogle Handwritten OCR:\n" + self.detect_handwritten_ocr()
+                    self.OCR = self.OCR + part_OCR + part_OCR
+                else:
+                    self.OCR = self.OCR + "\nGoogle Handwritten OCR:\n" + self.detect_handwritten_ocr()
             # if self.OCR_option not in ['normal', 'hand', 'both']:
             #     self.OCR_option = 'both'
             #     self.detect_text()
             #     self.detect_handwritten_ocr()
 
         ### Optionally add trOCR to the self.OCR for additional context
-        if self.do_use_trOCR:
+        if self.do_use_trOCR:
+            if self.double_OCR:
+                part_OCR = "\ntrOCR:\n" + self.detect_text_with_trOCR_using_google_bboxes(self.do_use_trOCR, logger)
+                self.OCR = self.OCR + part_OCR + part_OCR
+            else:
+                self.OCR = self.OCR + "\ntrOCR:\n" + self.detect_text_with_trOCR_using_google_bboxes(self.do_use_trOCR, logger)
+        # logger.info(f"OCR:\n{self.OCR}")
 
         if do_create_OCR_helper_image and ('LLaVA' not in self.OCR_option):
             self.image = Image.open(self.path)
@@ -744,8 +752,6 @@ class OCREngine:
                 image_with_boxes_normal = self.draw_boxes('normal')
                 self.merged_image_normal = self.merge_images(image_with_boxes_normal, text_image_trOCR)
 
         ### Merge final overlay image
         ### [original, normal bboxes, normal text]
         if 'CRAFT' in self.OCR_option or 'normal' in self.OCR_option:
@@ -773,241 +779,7 @@ class OCREngine:
             self.overlay_image = Image.open(self.path)
 
         try:
+            from craft_text_detector import empty_cuda_cache
             empty_cuda_cache()
         except:
             pass
-
-
-
-'''
-BBOX_COLOR = "black" # green cyan
-
-def render_text_on_black_image(image_path, handwritten_char_bounds_flat, handwritten_char_confidences, handwritten_char_heights, characters):
-    # Load the original image to get its dimensions
-    original_image = Image.open(image_path)
-    width, height = original_image.size
-
-    # Create a black image of the same size
-    black_image = Image.new("RGB", (width, height), "black")
-    draw = ImageDraw.Draw(black_image)
-
-    # Loop through each character
-    for bound, confidence, char_height, character in zip(handwritten_char_bounds_flat, handwritten_char_confidences, handwritten_char_heights, characters):
-        # Determine the font size based on the height of the character
-        font_size = int(char_height)
-        font = ImageFont.load_default().font_variant(size=font_size)
-
-        # Color of the character
-        color = confidence_to_color(confidence)
-
-        # Position of the text (using the bottom-left corner of the bounding box)
-        position = (bound["vertices"][0]["x"], bound["vertices"][0]["y"] - char_height)
-
-        # Draw the character
-        draw.text(position, character, fill=color, font=font)
-
-    return black_image
-
-def merge_images(image1, image2):
-    # Assuming both images are of the same size
-    width, height = image1.size
-    merged_image = Image.new("RGB", (width * 2, height))
-    merged_image.paste(image1, (0, 0))
-    merged_image.paste(image2, (width, 0))
-    return merged_image
-
-def draw_boxes(image, bounds, color):
-    if bounds:
-        draw = ImageDraw.Draw(image)
-        width, height = image.size
-        line_width = int((width + height) / 2 * 0.001) # This sets the line width as 0.5% of the average dimension
-
-        for bound in bounds:
-            draw.polygon(
-                [
-                    bound["vertices"][0]["x"], bound["vertices"][0]["y"],
-                    bound["vertices"][1]["x"], bound["vertices"][1]["y"],
-                    bound["vertices"][2]["x"], bound["vertices"][2]["y"],
-                    bound["vertices"][3]["x"], bound["vertices"][3]["y"],
-                ],
-                outline=color,
-                width=line_width
-            )
-    return image
-
-def detect_text(path):
-    client = vision.ImageAnnotatorClient()
-    with io.open(path, 'rb') as image_file:
-        content = image_file.read()
-    image = vision.Image(content=content)
-    response = client.document_text_detection(image=image)
-    texts = response.text_annotations
-
-    if response.error.message:
-        raise Exception(
-            '{}\nFor more info on error messages, check: '
-            'https://cloud.google.com/apis/design/errors'.format(
-                response.error.message))
-
-    # Extract bounding boxes
-    bounds = []
-    text_to_box_mapping = {}
-    for text in texts[1:]: # Skip the first entry, as it represents the entire detected text
-        # Convert BoundingPoly to dictionary
-        bound_dict = {
-            "vertices": [
-                {"x": vertex.x, "y": vertex.y} for vertex in text.bounding_poly.vertices
-            ]
-        }
-        bounds.append(bound_dict)
-        text_to_box_mapping[str(bound_dict)] = text.description
-
-    if texts:
-        # cleaned_text = texts[0].description.replace("\n", " ").replace("\t", " ").replace("|", " ")
-        cleaned_text = texts[0].description
-        return cleaned_text, bounds, text_to_box_mapping
-    else:
-        return '', None, None
-
-def confidence_to_color(confidence):
-    """Convert confidence level to a color ranging from red (low confidence) to green (high confidence)."""
-    # Using HSL color space, where Hue varies from red to green
-    hue = (confidence - 0.5) * 120 / 0.5 # Scale confidence to range 0-120 (red to green in HSL)
-    r, g, b = colorsys.hls_to_rgb(hue/360, 0.5, 1) # Convert to RGB
-    return (int(r*255), int(g*255), int(b*255))
-
-def overlay_boxes_on_image(path, typed_bounds, handwritten_char_bounds, handwritten_char_confidences, do_create_OCR_helper_image):
-    if do_create_OCR_helper_image:
-        image = Image.open(path)
-        draw = ImageDraw.Draw(image)
-        width, height = image.size
-        line_width = int((width + height) / 2 * 0.005) # Adjust line width for character level
-
-        # Draw boxes for typed text
-        for bound in typed_bounds:
-            draw.polygon(
-                [
-                    bound["vertices"][0]["x"], bound["vertices"][0]["y"],
-                    bound["vertices"][1]["x"], bound["vertices"][1]["y"],
-                    bound["vertices"][2]["x"], bound["vertices"][2]["y"],
-                    bound["vertices"][3]["x"], bound["vertices"][3]["y"],
-                ],
-                outline=BBOX_COLOR,
-                width=1
-            )
-
-        # Draw a line segment at the bottom of each handwritten character
-        for bound, confidence in zip(handwritten_char_bounds, handwritten_char_confidences):
-            color = confidence_to_color(confidence)
-            # Use the bottom two vertices of the bounding box for the line
-            bottom_left = (bound["vertices"][3]["x"], bound["vertices"][3]["y"] + line_width)
-            bottom_right = (bound["vertices"][2]["x"], bound["vertices"][2]["y"] + line_width)
-            draw.line([bottom_left, bottom_right], fill=color, width=line_width)
-
-        text_image = render_text_on_black_image(path, handwritten_char_bounds, handwritten_char_confidences)
-        merged_image = merge_images(image, text_image) # Assuming 'overlayed_image' is the image with lines
-
-        return merged_image
-    else:
-        return Image.open(path)
-
-def detect_handwritten_ocr(path):
-    """Detects handwritten characters in a local image and returns their bounding boxes and confidence levels.
-
-    Args:
-        path: The path to the local file.
-
-    Returns:
-        A tuple of (text, bounding_boxes, confidences)
-    """
-    client = vision_beta.ImageAnnotatorClient()
-
-    with open(path, "rb") as image_file:
-        content = image_file.read()
-
-    image = vision_beta.Image(content=content)
-    image_context = vision_beta.ImageContext(language_hints=["en-t-i0-handwrit"])
-    response = client.document_text_detection(image=image, image_context=image_context)
-
-    if response.error.message:
-        raise Exception(
-            "{}\nFor more info on error messages, check: "
-            "https://cloud.google.com/apis/design/errors".format(response.error.message)
-        )
-
-    bounds = []
-    bounds_flat = []
-    height_flat = []
-    confidences = []
-    character = []
-    for page in response.full_text_annotation.pages:
-        for block in page.blocks:
-            for paragraph in block.paragraphs:
-                for word in paragraph.words:
-                    # Get the bottom Y-location (max Y) for the whole word
-                    Y = max(vertex.y for vertex in word.bounding_box.vertices)
-
-                    # Get the height of the word's bounding box
-                    H = Y - min(vertex.y for vertex in word.bounding_box.vertices)
-
-                    for symbol in word.symbols:
-                        # Collecting bounding box for each symbol
-                        bound_dict = {
-                            "vertices": [
-                                {"x": vertex.x, "y": vertex.y} for vertex in symbol.bounding_box.vertices
-                            ]
-                        }
-                        bounds.append(bound_dict)
-
-                        # Bounds with same bottom y height
-                        bounds_flat_dict = {
-                            "vertices": [
-                                {"x": vertex.x, "y": Y} for vertex in symbol.bounding_box.vertices
-                            ]
-                        }
-                        bounds_flat.append(bounds_flat_dict)
-
-                        # Add the word's height
-                        height_flat.append(H)
-
-                        # Collecting confidence for each symbol
-                        symbol_confidence = round(symbol.confidence, 4)
-                        confidences.append(symbol_confidence)
-                        character.append(symbol.text)
-
-    cleaned_text = response.full_text_annotation.text
-
-    return cleaned_text, bounds, bounds_flat, height_flat, confidences, character
-
-
-
-def process_image(path, do_create_OCR_helper_image):
-    typed_text, typed_bounds, _ = detect_text(path)
-    handwritten_text, handwritten_bounds, _ = detect_handwritten_ocr(path)
-
-    overlayed_image = overlay_boxes_on_image(path, typed_bounds, handwritten_bounds, do_create_OCR_helper_image)
-    return typed_text, handwritten_text, overlayed_image
-
-'''
-
-# ''' Google Vision'''
-# def detect_text(path):
-#     """Detects text in the file located in the local filesystem."""
-#     client = vision.ImageAnnotatorClient()
-
-#     with io.open(path, 'rb') as image_file:
-#         content = image_file.read()
-
-#     image = vision.Image(content=content)
-
-#     response = client.document_text_detection(image=image)
-#     texts = response.text_annotations
-
-#     if response.error.message:
-#         raise Exception(
-#             '{}\nFor more info on error messages, check: '
-#             'https://cloud.google.com/apis/design/errors'.format(
-#                 response.error.message))
-
-#     return texts[0].description if texts else ''
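The theme of this file's changes is deferring optional, heavyweight dependencies: `craft_text_detector` and `OCRllava` are no longer imported at module scope, so the module loads on systems where those packages (and their recent PyTorch requirements) are absent, and the noisy `logger.info` dumps of full OCR transcripts are commented out. A minimal sketch of the same deferred-import guard (the `LazyOCR` class and its error message are illustrative, not part of the commit):

```python
import importlib.util

class LazyOCR:
    """Illustrative sketch of the deferred-import pattern used by OCREngine."""

    def __init__(self, ocr_option):
        self.ocr_option = ocr_option
        self.craft_net = None

    def init_craft(self):
        # Import the optional dependency only when the CRAFT option is selected,
        # so importing this module never requires craft_text_detector itself.
        if 'CRAFT' in self.ocr_option:
            if importlib.util.find_spec('craft_text_detector') is None:
                raise ImportError('The CRAFT OCR option requires craft_text_detector')
            from craft_text_detector import load_craftnet_model
            self.craft_net = load_craftnet_model(cuda=False)
```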
vouchervision/OCR_llava.py
CHANGED
@@ -3,20 +3,20 @@ import requests
 from PIL import Image
 from io import BytesIO
 import torch
-from transformers import AutoTokenizer, BitsAndBytesConfig, TextStreamer
+# from transformers import AutoTokenizer, BitsAndBytesConfig, TextStreamer
 
-from langchain.prompts import PromptTemplate
+# from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.pydantic_v1 import BaseModel, Field
 
-from LLaVA.llava.model import LlavaLlamaForCausalLM
-from LLaVA.llava.model.builder import load_pretrained_model
-from LLaVA.llava.conversation import conv_templates
-from LLaVA.llava.utils import disable_torch_init
-from LLaVA.llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IMAGE_PLACEHOLDER
-from LLaVA.llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+# from vouchervision.LLaVA.llava.model import LlavaLlamaForCausalLM
+from vouchervision.LLaVA.llava.model.builder import load_pretrained_model
+from vouchervision.LLaVA.llava.conversation import conv_templates #, SeparatorStyle
+from vouchervision.LLaVA.llava.utils import disable_torch_init
+from vouchervision.LLaVA.llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IMAGE_PLACEHOLDER
+from vouchervision.LLaVA.llava.mm_utils import tokenizer_image_token, get_model_name_from_path, process_images #KeywordsStoppingCriteria
 
-from utils_LLM import SystemLoadMonitor
+from vouchervision.utils_LLM import SystemLoadMonitor
 
 '''
 Performance expectations system:
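The import edits here have a single purpose: every LLaVA module is resolved through the `vouchervision` package (`vouchervision.LLaVA.llava...`) instead of as a top-level `LLaVA` package, and unused imports are commented out. A quick way to see the difference, assuming the repository root is on `sys.path` (the `resolve` helper is illustrative, not part of the commit):

```python
import importlib

def resolve(name):
    # Return the module if importable, otherwise the ImportError itself.
    try:
        return importlib.import_module(name)
    except ImportError as err:
        return err

# Fails unless the LLaVA checkout itself happens to be on sys.path:
print(resolve('LLaVA.llava.utils'))
# Resolves from any working directory, as long as vouchervision is importable:
print(resolve('vouchervision.LLaVA.llava.utils'))
```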
vouchervision/VoucherVision_Config_Builder.py
CHANGED
@@ -36,16 +36,22 @@ def build_VV_config(loaded_cfg=None):
     save_cropped_annotations = ['label','barcode']
 
     do_use_trOCR = False
+    trOCR_model_path = "microsoft/trocr-large-handwritten"
     OCR_option = 'hand'
     OCR_option_llava = 'llava-v1.6-mistral-7b' # "llava-v1.6-mistral-7b", "llava-v1.6-34b", "llava-v1.6-vicuna-13b", "llava-v1.6-vicuna-7b",
     OCR_option_llava_bit = 'full' # full or 4bit
     double_OCR = False
 
+
+    tool_GEO = True
+    tool_WFO = True
+    tool_wikipedia = True
+
     check_for_illegal_filenames = False
 
     LLM_version_user = 'Azure GPT 3.5 Instruct' #'Azure GPT 4 Turbo 1106-preview'
-    prompt_version = '
-    use_LeafMachine2_collage_images =
+    prompt_version = 'SLTPvA_long.yaml' # from ["Version 1", "Version 1 No Domain Knowledge", "Version 2"]
+    use_LeafMachine2_collage_images = True # Use LeafMachine2 collage images
     do_create_OCR_helper_image = True
 
     batch_size = 500
@@ -54,8 +60,8 @@ def build_VV_config(loaded_cfg=None):
     skip_vertical = False
     pdf_conversion_dpi = 100
 
-    path_domain_knowledge = os.path.join(dir_home,'domain_knowledge','SLTP_UM_AllAsiaMinimalInRegion.xlsx')
-    embeddings_database_name = os.path.splitext(os.path.basename(path_domain_knowledge))[0]
+    path_domain_knowledge = '' #os.path.join(dir_home,'domain_knowledge','SLTP_UM_AllAsiaMinimalInRegion.xlsx')
+    embeddings_database_name = '' #os.path.splitext(os.path.basename(path_domain_knowledge))[0]
 
     #############################################
     #############################################
@@ -65,7 +71,9 @@ def build_VV_config(loaded_cfg=None):
     return assemble_config(dir_home, run_name, dir_images_local,dir_output,
                            prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
                            path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
-                           prompt_version, do_create_OCR_helper_image, do_use_trOCR, OCR_option, OCR_option_llava,
+                           prompt_version, do_create_OCR_helper_image, do_use_trOCR, trOCR_model_path, OCR_option, OCR_option_llava,
+                           OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+                           tool_GEO, tool_WFO, tool_wikipedia,
                            check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
 else:
     dir_home = os.path.dirname(os.path.dirname(__file__))
@@ -80,11 +88,16 @@ def build_VV_config(loaded_cfg=None):
     catalog_numerical_only = loaded_cfg['leafmachine']['project']['catalog_numerical_only']
 
     do_use_trOCR = loaded_cfg['leafmachine']['project']['do_use_trOCR']
+    trOCR_model_path = loaded_cfg['leafmachine']['project']['trOCR_model_path']
     OCR_option = loaded_cfg['leafmachine']['project']['OCR_option']
     OCR_option_llava = loaded_cfg['leafmachine']['project']['OCR_option_llava']
     OCR_option_llava_bit = loaded_cfg['leafmachine']['project']['OCR_option_llava_bit']
     double_OCR = loaded_cfg['leafmachine']['project']['double_OCR']
 
+    tool_GEO = loaded_cfg['leafmachine']['project']['tool_GEO']
+    tool_WFO = loaded_cfg['leafmachine']['project']['tool_WFO']
+    tool_wikipedia = loaded_cfg['leafmachine']['project']['tool_wikipedia']
+
     pdf_conversion_dpi = loaded_cfg['leafmachine']['project']['pdf_conversion_dpi']
 
     LLM_version_user = loaded_cfg['leafmachine']['LLM_version']
@@ -105,14 +118,18 @@ def build_VV_config(loaded_cfg=None):
     return assemble_config(dir_home, run_name, dir_images_local,dir_output,
                            prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
                            path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
-                           prompt_version, do_create_OCR_helper_image, do_use_trOCR, OCR_option, OCR_option_llava,
+                           prompt_version, do_create_OCR_helper_image, do_use_trOCR, trOCR_model_path, OCR_option, OCR_option_llava,
+                           OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+                           tool_GEO, tool_WFO, tool_wikipedia,
                            check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False)
 
 
 def assemble_config(dir_home, run_name, dir_images_local,dir_output,
                     prefix_removal,suffix_removal,catalog_numerical_only,LLM_version_user,batch_size,num_workers,
                     path_domain_knowledge,embeddings_database_name,use_LeafMachine2_collage_images,
-                    prompt_version, do_create_OCR_helper_image_user, do_use_trOCR, OCR_option, OCR_option_llava,
+                    prompt_version, do_create_OCR_helper_image_user, do_use_trOCR, trOCR_model_path, OCR_option, OCR_option_llava,
+                    OCR_option_llava_bit, double_OCR, save_cropped_annotations,
+                    tool_GEO, tool_WFO, tool_wikipedia,
                    check_for_illegal_filenames, skip_vertical, pdf_conversion_dpi, use_domain_knowledge=False):
 
 
@@ -157,11 +174,15 @@ def assemble_config(dir_home, run_name, dir_images_local,dir_output,
         'delete_all_temps': False,
         'delete_temps_keep_VVE': False,
         'do_use_trOCR': do_use_trOCR,
+        'trOCR_model_path': trOCR_model_path,
         'OCR_option': OCR_option,
         'OCR_option_llava': OCR_option_llava,
         'OCR_option_llava_bit': OCR_option_llava_bit,
         'double_OCR': double_OCR,
         'pdf_conversion_dpi': pdf_conversion_dpi,
+        'tool_GEO': tool_GEO,
+        'tool_WFO': tool_WFO,
+        'tool_wikipedia': tool_wikipedia,
     }
 
     modules_section = {
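Taken together, these changes thread three new tool toggles (`tool_GEO`, `tool_WFO`, `tool_wikipedia`) and a configurable trOCR checkpoint through `build_VV_config`, `assemble_config`, and the saved `project` section. A representative slice of the dictionary that `assemble_config` now writes, using the defaults set above (a standalone sketch, not the full section):

```python
project_section_excerpt = {
    'do_use_trOCR': False,
    'trOCR_model_path': 'microsoft/trocr-large-handwritten',  # new: selectable checkpoint
    'OCR_option': 'hand',
    'OCR_option_llava': 'llava-v1.6-mistral-7b',
    'OCR_option_llava_bit': 'full',
    'double_OCR': False,
    'pdf_conversion_dpi': 100,
    'tool_GEO': True,        # new: HERE geocoding of collection localities
    'tool_WFO': True,        # new: World Flora Online taxonomy validation
    'tool_wikipedia': True,  # new: Wikipedia lookup
}
```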
vouchervision/model_maps.py
CHANGED
@@ -206,7 +206,7 @@ class ModelMaps:
             return "text-unicorn@001"
 
         elif key == 'GEMINI_PRO':
-            return "gemini-pro"
+            return "gemini-1.0-pro"
 
         ### Mistral
         elif key == 'MISTRAL_TINY':
vouchervision/tool_geolocate_HERE.py
ADDED
@@ -0,0 +1,321 @@
+import os, requests
+import pycountry_convert as pc
+import unicodedata
+import pycountry_convert as pc
+import warnings
+
+
+def normalize_country_name(name):
+    return unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
+
+def get_continent(country_name):
+    warnings.filterwarnings("ignore", category=UserWarning, module='pycountry')
+
+    continent_code_to_name = {
+        "AF": "Africa",
+        "NA": "North America",
+        "OC": "Oceania",
+        "AN": "Antarctica",
+        "AS": "Asia",
+        "EU": "Europe",
+        "SA": "South America"
+    }
+
+    try:
+        normalized_country_name = normalize_country_name(country_name)
+        # Get country alpha2 code
+        country_code = pc.country_name_to_country_alpha2(normalized_country_name)
+        # Get continent code from country alpha2 code
+        continent_code = pc.country_alpha2_to_continent_code(country_code)
+        # Map the continent code to continent name
+        return continent_code_to_name.get(continent_code, '')
+    except Exception as e:
+        print(str(e))
+        return ''
+
+def validate_coordinates_here(tool_GEO, record, replace_if_success_geo=False):
+    forward_url = 'https://geocode.search.hereapi.com/v1/geocode'
+    reverse_url = 'https://revgeocode.search.hereapi.com/v1/revgeocode'
+
+    pinpoint = ['GEO_city','GEO_county','GEO_state','GEO_country',]
+    GEO_dict_null = {
+        'GEO_override_OCR': False,
+        'GEO_method': '',
+        'GEO_formatted_full_string': '',
+        'GEO_decimal_lat': '',
+        'GEO_decimal_long': '',
+        'GEO_city': '',
+        'GEO_county': '',
+        'GEO_state': '',
+        'GEO_state_code': '',
+        'GEO_country': '',
+        'GEO_country_code': '',
+        'GEO_continent': '',
+    }
+    GEO_dict = {
+        'GEO_override_OCR': False,
+        'GEO_method': '',
+        'GEO_formatted_full_string': '',
+        'GEO_decimal_lat': '',
+        'GEO_decimal_long': '',
+        'GEO_city': '',
+        'GEO_county': '',
+        'GEO_state': '',
+        'GEO_state_code': '',
+        'GEO_country': '',
+        'GEO_country_code': '',
+        'GEO_continent': '',
+    }
+    GEO_dict_rev = {
+        'GEO_override_OCR': False,
+        'GEO_method': '',
+        'GEO_formatted_full_string': '',
+        'GEO_decimal_lat': '',
+        'GEO_decimal_long': '',
+        'GEO_city': '',
+        'GEO_county': '',
+        'GEO_state': '',
+        'GEO_state_code': '',
+        'GEO_country': '',
+        'GEO_country_code': '',
+        'GEO_continent': '',
+    }
+    GEO_dict_rev_verbatim = {
+        'GEO_override_OCR': False,
+        'GEO_method': '',
+        'GEO_formatted_full_string': '',
+        'GEO_decimal_lat': '',
+        'GEO_decimal_long': '',
+        'GEO_city': '',
+        'GEO_county': '',
+        'GEO_state': '',
+        'GEO_state_code': '',
+        'GEO_country': '',
+        'GEO_country_code': '',
+        'GEO_continent': '',
+    }
+    GEO_dict_forward = {
+        'GEO_override_OCR': False,
+        'GEO_method': '',
+        'GEO_formatted_full_string': '',
+        'GEO_decimal_lat': '',
+        'GEO_decimal_long': '',
+        'GEO_city': '',
+        'GEO_county': '',
+        'GEO_state': '',
+        'GEO_state_code': '',
+        'GEO_country': '',
+        'GEO_country_code': '',
+        'GEO_continent': '',
+    }
+    GEO_dict_forward_locality = {
+        'GEO_override_OCR': False,
+        'GEO_method': '',
+        'GEO_formatted_full_string': '',
+        'GEO_decimal_lat': '',
+        'GEO_decimal_long': '',
+        'GEO_city': '',
+        'GEO_county': '',
+        'GEO_state': '',
+        'GEO_state_code': '',
+        'GEO_country': '',
+        'GEO_country_code': '',
+        'GEO_continent': '',
+    }
+
+    if not tool_GEO:
+        return record, GEO_dict_null
+    else:
+        # For production
+        query_forward = ', '.join(filter(None, [record.get('municipality', '').strip(),
+                                                record.get('county', '').strip(),
+                                                record.get('stateProvince', '').strip(),
+                                                record.get('country', '').strip()])).strip()
+        query_forward_locality = ', '.join(filter(None, [record.get('locality', '').strip(),
+                                                record.get('municipality', '').strip(),
+                                                record.get('county', '').strip(),
+                                                record.get('stateProvince', '').strip(),
+                                                record.get('country', '').strip()])).strip()
+        query_reverse = ','.join(filter(None, [record.get('decimalLatitude', '').strip(),
+                                                record.get('decimalLongitude', '').strip()])).strip()
+        query_reverse_verbatim = record.get('verbatimCoordinates', '').strip()
+
+
+        '''
+        #For testing
+        # query_forward = 'Ann bor, michign'
+        query_forward = 'michigan'
+        query_forward_locality = 'Ann bor, michign'
+        # query_gps = "42 N,-83 W" # cannot have any spaces
+        # query_reverse_verbatim = "42.278366,-83.744718" # cannot have any spaces
+        query_reverse_verbatim = "42,-83" # cannot have any spaces
+        query_reverse = "42,-83" # cannot have any spaces
+        # params = {
+        #     'q': query_loc,
+        #     'apiKey': os.environ['HERE_API_KEY'],
+        # }'''
+
+
+        params_rev = {
+            'at': query_reverse,
+            'apiKey': os.environ['HERE_API_KEY'],
+            'lang': 'en',
+        }
+        params_reverse_verbatim = {
+            'at': query_reverse_verbatim,
+            'apiKey': os.environ['HERE_API_KEY'],
+            'lang': 'en',
+        }
+        params_forward = {
+            'q': query_forward,
+            'apiKey': os.environ['HERE_API_KEY'],
+            'lang': 'en',
+        }
+        params_forward_locality = {
+            'q': query_forward_locality,
+            'apiKey': os.environ['HERE_API_KEY'],
+            'lang': 'en',
+        }
+
+        ### REVERSE
+        # If there are two string in the coordinates, try a reverse first based on the literal coordinates
+        response = requests.get(reverse_url, params=params_rev)
+        if response.status_code == 200:
+            data = response.json()
+            if data.get('items'):
+                first_result = data['items'][0]
+                GEO_dict_rev['GEO_method'] = 'HERE_Geocode_reverse'
+                GEO_dict_rev['GEO_formatted_full_string'] = first_result.get('title', '')
+                GEO_dict_rev['GEO_decimal_lat'] = first_result['position']['lat']
+                GEO_dict_rev['GEO_decimal_long'] = first_result['position']['lng']
+
+                address = first_result.get('address', {})
+                GEO_dict_rev['GEO_city'] = address.get('city', '')
+                GEO_dict_rev['GEO_county'] = address.get('county', '')
+                GEO_dict_rev['GEO_state'] = address.get('state', '')
+                GEO_dict_rev['GEO_state_code'] = address.get('stateCode', '')
+                GEO_dict_rev['GEO_country'] = address.get('countryName', '')
+                GEO_dict_rev['GEO_country_code'] = address.get('countryCode', '')
+                GEO_dict_rev['GEO_continent'] = get_continent(address.get('countryName', ''))
+
+        ### REVERSE Verbatim
+        # If there are two string in the coordinates, try a reverse first based on the literal coordinates
+        if GEO_dict_rev['GEO_city']: # If the reverse was successful, pass
+            GEO_dict = GEO_dict_rev
+        else:
+            response = requests.get(reverse_url, params=params_reverse_verbatim)
+            if response.status_code == 200:
+                data = response.json()
+                if data.get('items'):
+                    first_result = data['items'][0]
+                    GEO_dict_rev_verbatim['GEO_method'] = 'HERE_Geocode_reverse_verbatimCoordinates'
+                    GEO_dict_rev_verbatim['GEO_formatted_full_string'] = first_result.get('title', '')
+                    GEO_dict_rev_verbatim['GEO_decimal_lat'] = first_result['position']['lat']
+                    GEO_dict_rev_verbatim['GEO_decimal_long'] = first_result['position']['lng']
+
+                    address = first_result.get('address', {})
+                    GEO_dict_rev_verbatim['GEO_city'] = address.get('city', '')
+                    GEO_dict_rev_verbatim['GEO_county'] = address.get('county', '')
+                    GEO_dict_rev_verbatim['GEO_state'] = address.get('state', '')
+                    GEO_dict_rev_verbatim['GEO_state_code'] = address.get('stateCode', '')
+                    GEO_dict_rev_verbatim['GEO_country'] = address.get('countryName', '')
+                    GEO_dict_rev_verbatim['GEO_country_code'] = address.get('countryCode', '')
+                    GEO_dict_rev_verbatim['GEO_continent'] = get_continent(address.get('countryName', ''))
+
+        ### FORWARD
+        ### Try forward, if failes, try reverse using deci, then verbatim
+        if GEO_dict_rev['GEO_city']: # If the reverse was successful, pass
+            GEO_dict = GEO_dict_rev
+        elif GEO_dict_rev_verbatim['GEO_city']:
+            GEO_dict = GEO_dict_rev_verbatim
+        else:
+            response = requests.get(forward_url, params=params_forward)
+            if response.status_code == 200:
+                data = response.json()
+                if data.get('items'):
+                    first_result = data['items'][0]
+                    GEO_dict_forward['GEO_method'] = 'HERE_Geocode_forward'
+                    GEO_dict_forward['GEO_formatted_full_string'] = first_result.get('title', '')
+                    GEO_dict_forward['GEO_decimal_lat'] = first_result['position']['lat']
+                    GEO_dict_forward['GEO_decimal_long'] = first_result['position']['lng']
+
+                    address = first_result.get('address', {})
+                    GEO_dict_forward['GEO_city'] = address.get('city', '')
+                    GEO_dict_forward['GEO_county'] = address.get('county', '')
+                    GEO_dict_forward['GEO_state'] = address.get('state', '')
+                    GEO_dict_forward['GEO_state_code'] = address.get('stateCode', '')
+                    GEO_dict_forward['GEO_country'] = address.get('countryName', '')
+                    GEO_dict_forward['GEO_country_code'] = address.get('countryCode', '')
+                    GEO_dict_forward['GEO_continent'] = get_continent(address.get('countryName', ''))
+
+        ### FORWARD locality
+        ### Try forward, if failes, try reverse using deci, then verbatim
+        if GEO_dict_rev['GEO_city']: # If the reverse was successful, pass
+            GEO_dict = GEO_dict_rev
+        elif GEO_dict_rev_verbatim['GEO_city']:
+            GEO_dict = GEO_dict_rev_verbatim
+        elif GEO_dict_forward['GEO_city']:
+            GEO_dict = GEO_dict_forward
+        else:
+            response = requests.get(forward_url, params=params_forward_locality)
+            if response.status_code == 200:
+                data = response.json()
+                if data.get('items'):
+                    first_result = data['items'][0]
+                    GEO_dict_forward_locality['GEO_method'] = 'HERE_Geocode_forward_locality'
+                    GEO_dict_forward_locality['GEO_formatted_full_string'] = first_result.get('title', '')
+                    GEO_dict_forward_locality['GEO_decimal_lat'] = first_result['position']['lat']
+                    GEO_dict_forward_locality['GEO_decimal_long'] = first_result['position']['lng']
+
+                    address = first_result.get('address', {})
+                    GEO_dict_forward_locality['GEO_city'] = address.get('city', '')
+                    GEO_dict_forward_locality['GEO_county'] = address.get('county', '')
+                    GEO_dict_forward_locality['GEO_state'] = address.get('state', '')
+                    GEO_dict_forward_locality['GEO_state_code'] = address.get('stateCode', '')
+                    GEO_dict_forward_locality['GEO_country'] = address.get('countryName', '')
+                    GEO_dict_forward_locality['GEO_country_code'] = address.get('countryCode', '')
+                    GEO_dict_forward_locality['GEO_continent'] = get_continent(address.get('countryName', ''))
+
+
+        # print(json.dumps(GEO_dict,indent=4))
+
+
+        # Pick the most detailed version
+        # if GEO_dict_rev['GEO_formatted_full_string'] and GEO_dict_forward['GEO_formatted_full_string']:
+        for loc in pinpoint:
+            rev = GEO_dict_rev.get(loc,'')
+            forward = GEO_dict_forward.get(loc,'')
+            forward_locality = GEO_dict_forward_locality.get(loc,'')
+            rev_verbatim = GEO_dict_rev_verbatim.get(loc,'')
+
+            if not rev and not forward and not forward_locality and not rev_verbatim:
+                pass
+            elif rev:
+                GEO_dict = GEO_dict_rev
+                break
+            elif forward:
+                GEO_dict = GEO_dict_forward
+                break
+            elif forward_locality:
+                GEO_dict = GEO_dict_forward_locality
+                break
+            elif rev_verbatim:
+                GEO_dict = GEO_dict_rev_verbatim
+                break
+            else:
+                GEO_dict = GEO_dict_null
+
+
+        if GEO_dict['GEO_formatted_full_string'] and replace_if_success_geo:
+            GEO_dict['GEO_override_OCR'] = True
+            record['country'] = GEO_dict.get('GEO_country')
+            record['stateProvince'] = GEO_dict.get('GEO_state')
+            record['county'] = GEO_dict.get('GEO_county')
+            record['municipality'] = GEO_dict.get('GEO_city')
+
+        # print(json.dumps(GEO_dict,indent=4))
+        return record, GEO_dict
+
+
+if __name__ == "__main__":
+    validate_coordinates_here(None)
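`validate_coordinates_here` reads `HERE_API_KEY` from the environment and works through a fallback chain: reverse geocoding on the decimal coordinates, then on `verbatimCoordinates`, then forward geocoding on municipality/county/state/country, and finally forward geocoding with `locality` prepended; the `pinpoint` loop then keeps the result with the most detailed administrative level. A minimal call with an illustrative Darwin Core-style record (the field values below are made up):

```python
import os
from vouchervision.tool_geolocate_HERE import validate_coordinates_here

os.environ.setdefault('HERE_API_KEY', 'YOUR-HERE-KEY')  # required by all four params dicts above

record = {
    'municipality': 'Ann Arbor',
    'county': 'Washtenaw',
    'stateProvince': 'Michigan',
    'country': 'USA',
    'locality': 'Nichols Arboretum',
    'decimalLatitude': '42.2808',
    'decimalLongitude': '-83.7430',
    'verbatimCoordinates': '42.2808,-83.7430',
}

record, geo = validate_coordinates_here(True, record, replace_if_success_geo=False)
print(geo['GEO_method'], geo['GEO_formatted_full_string'])
```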
vouchervision/tool_taxonomy_WFO.py
ADDED
@@ -0,0 +1,324 @@
+import requests
+from urllib.parse import urlencode
+from Levenshtein import ratio
+from fuzzywuzzy import fuzz
+
+class WFONameMatcher:
+    def __init__(self, tool_WFO):
+        self.base_url = "https://list.worldfloraonline.org/matching_rest.php?"
+        self.N_BEST_CANDIDATES = 10
+        self.NULL_DICT = {
+            "WFO_exact_match": False,
+            "WFO_exact_match_name": "",
+            "WFO_candidate_names": "",
+            "WFO_best_match": "",
+            "WFO_placement": "",
+            "WFO_override_OCR": False,
+        }
+        self.SEP = '|'
+        self.is_enabled = tool_WFO
+
+    def extract_input_string(self, record):
+        primary_input = f"{record.get('scientificName', '').strip()} {record.get('scientificNameAuthorship', '').strip()}".strip()
+        secondary_input = ' '.join(filter(None, [record.get('genus', '').strip(),
+                                                 record.get('subgenus', '').strip(),
+                                                 record.get('specificEpithet', '').strip(),
+                                                 record.get('infraspecificEpithet', '').strip()])).strip()
+
+        return primary_input, secondary_input
+
+    def query_wfo_name_matching(self, input_string, check_homonyms=True, check_rank=True, accept_single_candidate=True):
+        params = {
+            "input_string": input_string,
+            "check_homonyms": check_homonyms,
+            "check_rank": check_rank,
+            "method": "full",
+            "accept_single_candidate": accept_single_candidate,
+        }
+
+        full_url = self.base_url + urlencode(params)
+
+        response = requests.get(full_url)
+        if response.status_code == 200:
+            return response.json()
+        else:
+            return {"error": True, "message": "Failed to fetch data from WFO API"}
+
+    def query_and_process(self, record):
+        primary_input, secondary_input = self.extract_input_string(record)
+
+        # Query with primary input
+        primary_result = self.query_wfo_name_matching(primary_input)
+        primary_processed, primary_ranked_candidates = self.process_wfo_response(primary_result, primary_input)
+
+        if primary_processed.get('WFO_exact_match'):
+            print("Selected Primary --- Exact Primary & Unchecked Secondary")
+            return primary_processed
+        else:
+            # Query with secondary input
+            secondary_result = self.query_wfo_name_matching(secondary_input)
+            secondary_processed, secondary_ranked_candidates = self.process_wfo_response(secondary_result, secondary_input)
+
+            if secondary_processed.get('WFO_exact_match'):
+                print("Selected Secondary --- Unchecked Primary & Exact Secondary")
+                return secondary_processed
+
+            else:
+                # Both failed, just return the first failure
+                if (primary_processed.get("WFO_candidate_names") == '') and (secondary_processed.get("WFO_candidate_names") == ''):
+                    print("Selected Primary --- Failed Primary & Failed Secondary")
+                    return primary_processed
+
+                # 1st failed, just return the second
+                elif (primary_processed.get("WFO_candidate_names") == '') and (len(secondary_processed.get("WFO_candidate_names")) > 0):
+                    print("Selected Secondary --- Failed Primary & Partial Secondary")
+                    return secondary_processed
+
+                # 2nd failed, just return the first
+                elif (len(primary_processed.get("WFO_candidate_names")) > 0) and (secondary_processed.get("WFO_candidate_names") == ''):
+                    print("Selected Primary --- Partial Primary & Failed Secondary")
+                    return primary_processed
+
+                # Both have partial matches, compare and rerank
+                elif (len(primary_processed.get("WFO_candidate_names")) > 0) and (len(secondary_processed.get("WFO_candidate_names")) > 0):
+                    # Combine and sort results, ensuring no duplicates
+                    combined_candidates = list(set(primary_ranked_candidates + secondary_ranked_candidates))
+                    combined_candidates.sort(key=lambda x: (x[1], x[0]), reverse=True) # Sort by similarity score, then name
+
+                    # Replace candidates with combined_candidates and combined best match
+                    best_score_primary = primary_processed["WFO_candidate_names"][0][1]
+                    best_score_secondary = secondary_processed["WFO_candidate_names"][0][1]
+
+                    # Extracting only the candidate names from the top candidates
+                    top_candidates = combined_candidates[:self.N_BEST_CANDIDATES]
+                    cleaned_candidates = [cand[0] for cand in top_candidates]
+
+                    if best_score_primary >= best_score_secondary:
+
+                        primary_processed["WFO_candidate_names"] = cleaned_candidates
+                        primary_processed["WFO_best_match"] = cleaned_candidates[0]
+
+                        response_placement = self.query_wfo_name_matching(primary_processed["WFO_best_match"])
+                        placement_exact_match = response_placement.get("match")
+                        primary_processed["WFO_placement"] = placement_exact_match.get("placement", '')
+
+                        print("Selected Primary --- Partial Primary & Partial Secondary")
+                        return primary_processed
+                    else:
+                        secondary_processed["WFO_candidate_names"] = cleaned_candidates
+                        secondary_processed["WFO_best_match"] = cleaned_candidates[0]
+
+                        response_placement = self.query_wfo_name_matching(secondary_processed["WFO_best_match"])
+                        placement_exact_match = response_placement.get("match")
+                        secondary_processed["WFO_placement"] = placement_exact_match.get("placement", '')
+
+                        print("Selected Secondary --- Partial Primary & Partial Secondary")
+                        return secondary_processed
+                else:
+                    return self.NULL_DICT
+
+    def process_wfo_response(self, response, query):
+        simplified_response = {}
+        ranked_candidates = None
+
+        exact_match = response.get("match")
+        simplified_response["WFO_exact_match"] = bool(exact_match)
+
+        candidates = response.get("candidates", [])
+        candidate_names = [candidate["full_name_plain"] for candidate in candidates] if candidates else []
+
+        if not exact_match and candidate_names:
+            cleaned_candidates, ranked_candidates = self._rank_candidates_by_similarity(query, candidate_names)
+            simplified_response["WFO_candidate_names"] = cleaned_candidates
+            simplified_response["WFO_best_match"] = cleaned_candidates[0] if cleaned_candidates else ''
+        elif exact_match:
+            simplified_response["WFO_candidate_names"] = exact_match.get("full_name_plain")
+            simplified_response["WFO_best_match"] = exact_match.get("full_name_plain")
+        else:
+            simplified_response["WFO_candidate_names"] = ''
+            simplified_response["WFO_best_match"] = ''
+
+        # Call WFO again to update placement using WFO_best_match
+        try:
+            response_placement = self.query_wfo_name_matching(simplified_response["WFO_best_match"])
+            placement_exact_match = response_placement.get("match")
+            simplified_response["WFO_placement"] = placement_exact_match.get("placement", '')
+        except:
+            simplified_response["WFO_placement"] = ''
+
+        return simplified_response, ranked_candidates
+
+    def _rank_candidates_by_similarity(self, query, candidates):
+        string_similarities = []
+        fuzzy_similarities = {candidate: fuzz.ratio(query, candidate) for candidate in candidates}
+        query_words = query.split()
+
+        for candidate in candidates:
+            candidate_words = candidate.split()
+            # Calculate word similarities and sum them up
+            word_similarities = [ratio(query_word, candidate_word) for query_word, candidate_word in zip(query_words, candidate_words)]
+            total_word_similarity = sum(word_similarities)
+
+            # Calculate combined similarity score (average of word and fuzzy similarities)
+            fuzzy_similarity = fuzzy_similarities[candidate]
+            combined_similarity = (total_word_similarity + fuzzy_similarity) / 2
+            string_similarities.append((candidate, combined_similarity))
+
+        # Sort the candidates based on combined similarity, higher scores first
+        ranked_candidates = sorted(string_similarities, key=lambda x: x[1], reverse=True)
+
+        # Extracting only the candidate names from the top candidates
+        top_candidates = ranked_candidates[:self.N_BEST_CANDIDATES]
+        cleaned_candidates = [cand[0] for cand in top_candidates]
+
+        return cleaned_candidates, ranked_candidates
+
+    def check_WFO(self, record, replace_if_success_wfo):
+        if not self.is_enabled:
+            return record, self.NULL_DICT
+
+        else:
+            self.replace_if_success_wfo = replace_if_success_wfo
+
+            # "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement"
+            simplified_response = self.query_and_process(record)
+            simplified_response['WFO_override_OCR'] = False
+
+            # best_match
+            if simplified_response.get('WFO_exact_match'):
+                simplified_response['WFO_exact_match_name'] = simplified_response.get('WFO_best_match')
+            else:
+                simplified_response['WFO_exact_match_name'] = ''
+
+            # placement
+            wfo_placement = simplified_response.get('WFO_placement', '')
+            if wfo_placement:
+                parts = wfo_placement.split('/')[1:]
+                simplified_response['WFO_placement'] = self.SEP.join(parts)
+            else:
+                simplified_response['WFO_placement'] = ''
+
+            if simplified_response.get('WFO_exact_match') and replace_if_success_wfo:
+                simplified_response['WFO_override_OCR'] = True
+                name_parts = simplified_response.get('WFO_placement').split('$')[0]
+                name_parts = name_parts.split(self.SEP)
+                record['order'] = name_parts[3]
+                record['family'] = name_parts[4]
+                record['genus'] = name_parts[5]
+                record['specificEpithet'] = name_parts[6]
+                record['scientificName'] = simplified_response.get('WFO_exact_match_name')
+
+            return record, simplified_response
+
+def validate_taxonomy_WFO(tool_WFO, record_dict, replace_if_success_wfo=False):
+    Matcher = WFONameMatcher(tool_WFO)
+    try:
+        record_dict, WFO_dict = Matcher.check_WFO(record_dict, replace_if_success_wfo)
+        return record_dict, WFO_dict
+    except:
+        return record_dict, Matcher.NULL_DICT
+
+'''
+if __name__ == "__main__":
+    Matcher = WFONameMatcher()
+    # input_string = "Rhopalocarpus alterfolius"
+    record_exact_match ={
+        "order": "Malpighiales",
+        "family": "Hypericaceae",
+        "scientificName": "Hypericum prolificum",
+        "scientificNameAuthorship": "",
+
+        "genus": "Hypericum",
+        "subgenus": "",
+        "specificEpithet": "prolificum",
+        "infraspecificEpithet": "",
+    }
+    record_partialPrimary_exactSecondary ={
+        "order": "Malpighiales",
+        "family": "Hypericaceae",
+        "scientificName": "Hyperic prolificum",
+        "scientificNameAuthorship": "",
+
+        "genus": "Hypericum",
+        "subgenus": "",
+        "specificEpithet": "prolificum",
+        "infraspecificEpithet": "",
+    }
+    record_exactPrimary_partialSecondary ={
+        "order": "Malpighiales",
+        "family": "Hypericaceae",
+        "scientificName": "Hypericum prolificum",
+        "scientificNameAuthorship": "",
+
+        "genus": "Hyperic",
+        "subgenus": "",
+        "specificEpithet": "prolificum",
+        "infraspecificEpithet": "",
+    }
+    record_partialPrimary_partialSecondary ={
+        "order": "Malpighiales",
"order": "Malpighiales",
|
260 |
+
"family": "Hypericaceae",
|
261 |
+
"scientificName": "Hyperic prolificum",
|
262 |
+
"scientificNameAuthorship": "",
|
263 |
+
|
264 |
+
"genus": "Hypericum",
|
265 |
+
"subgenus": "",
|
266 |
+
"specificEpithet": "prolific",
|
267 |
+
"infraspecificEpithet": "",
|
268 |
+
}
|
269 |
+
record_partialPrimary_partialSecondary_swap ={
|
270 |
+
"order": "Malpighiales",
|
271 |
+
"family": "Hypericaceae",
|
272 |
+
"scientificName": "Hypericum prolific",
|
273 |
+
"scientificNameAuthorship": "",
|
274 |
+
|
275 |
+
"genus": "Hyperic",
|
276 |
+
"subgenus": "",
|
277 |
+
"specificEpithet": "prolificum",
|
278 |
+
"infraspecificEpithet": "",
|
279 |
+
}
|
280 |
+
record_errorPrimary_partialSecondary ={
|
281 |
+
"order": "Malpighiales",
|
282 |
+
"family": "Hypericaceae",
|
283 |
+
"scientificName": "ricum proli",
|
284 |
+
"scientificNameAuthorship": "",
|
285 |
+
|
286 |
+
"genus": "Hyperic",
|
287 |
+
"subgenus": "",
|
288 |
+
"specificEpithet": "prolificum",
|
289 |
+
"infraspecificEpithet": "",
|
290 |
+
}
|
291 |
+
record_partialPrimary_errorSecondary ={
|
292 |
+
"order": "Malpighiales",
|
293 |
+
"family": "Hypericaceae",
|
294 |
+
"scientificName": "Hyperic prolificum",
|
295 |
+
"scientificNameAuthorship": "",
|
296 |
+
|
297 |
+
"genus": "ricum",
|
298 |
+
"subgenus": "",
|
299 |
+
"specificEpithet": "proli",
|
300 |
+
"infraspecificEpithet": "",
|
301 |
+
}
|
302 |
+
record_errorPrimary_errorSecondary ={
|
303 |
+
"order": "Malpighiales",
|
304 |
+
"family": "Hypericaceae",
|
305 |
+
"scientificName": "ricum proli",
|
306 |
+
"scientificNameAuthorship": "",
|
307 |
+
|
308 |
+
"genus": "ricum",
|
309 |
+
"subgenus": "",
|
310 |
+
"specificEpithet": "proli",
|
311 |
+
"infraspecificEpithet": "",
|
312 |
+
}
|
313 |
+
options = [record_exact_match,
|
314 |
+
record_partialPrimary_exactSecondary,
|
315 |
+
record_exactPrimary_partialSecondary,
|
316 |
+
record_partialPrimary_partialSecondary,
|
317 |
+
record_partialPrimary_partialSecondary_swap,
|
318 |
+
record_errorPrimary_partialSecondary,
|
319 |
+
record_partialPrimary_errorSecondary,
|
320 |
+
record_errorPrimary_errorSecondary]
|
321 |
+
for opt in options:
|
322 |
+
simplified_response = Matcher.check_WFO(opt)
|
323 |
+
print(json.dumps(simplified_response, indent=4))
|
324 |
+
'''
|
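The module's public entry point is `validate_taxonomy_WFO`, which wraps `WFONameMatcher.check_WFO` in a try/except so a failed WFO query degrades to `NULL_DICT` instead of aborting the batch. A minimal usage sketch, assuming the module is importable as `vouchervision.tool_taxonomy_WFO` and reusing the Darwin Core record keys from the commented-out demo block above:

from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO

# Record keys mirror the demo block above; the values are illustrative.
record = {
    "order": "Malpighiales",
    "family": "Hypericaceae",
    "scientificName": "Hypericum prolificum",
    "scientificNameAuthorship": "",
    "genus": "Hypericum",
    "subgenus": "",
    "specificEpithet": "prolificum",
    "infraspecificEpithet": "",
}

# tool_WFO=True enables the online lookup; replace_if_success_wfo=False keeps
# the OCR-derived fields and only reports the WFO_* columns alongside them.
record, WFO_dict = validate_taxonomy_WFO(True, record, replace_if_success_wfo=False)
print(WFO_dict["WFO_exact_match"], WFO_dict["WFO_best_match"], WFO_dict["WFO_placement"])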
vouchervision/tool_wikipedia.py
CHANGED
@@ -8,7 +8,8 @@ import pstats
 class WikipediaLinks():
 
 
-    def __init__(self, json_file_path_wiki) -> None:
+    def __init__(self, tool_wikipedia, json_file_path_wiki) -> None:
+        self.is_enabled = tool_wikipedia
         self.json_file_path_wiki = json_file_path_wiki
         self.wiki_wiki = wikipediaapi.Wikipedia(
             user_agent='VoucherVision ([email protected])',
@@ -466,54 +467,56 @@
         self.info_packet['WIKI_GEO'] = {}
         self.info_packet['WIKI_LOCALITY'] = {}
 
-        municipality = output.get('municipality','')
-        county = output.get('county','')
-        stateProvince = output.get('stateProvince','')
-        country = output.get('country','')
-
-        locality = output.get('locality','')
-
-        order = output.get('order','')
-        family = output.get('family','')
-        scientificName = output.get('scientificName','')
-        genus = output.get('genus','')
-        specificEpithet = output.get('specificEpithet','')
-
-        query_geo = ' '.join([municipality, county, stateProvince, country]).strip()
-        query_locality = locality.strip()
-        query_taxa_primary = scientificName.strip()
-        query_taxa_secondary = ' '.join([genus, specificEpithet]).strip()
-        query_taxa_tertiary = ' '.join([order, family, genus, specificEpithet]).strip()
-
-        # query_taxa = "Tracaulon sagittatum Tracaulon sagittatum"
-        # query_geo = "Indiana Porter Co."
-        # query_locality = "Mical Springs edge"
-
-        try:
-            self.gather_geo(query_geo)
-        except:
-            pass
-
-        try:
-            self.gather_geo(query_locality,'locality')
-        except:
-            pass
-
-        queries_taxa = [query_taxa_primary, query_taxa_secondary, query_taxa_tertiary]
-        for q in queries_taxa:
-            if q:
-                try:
-                    self.gather_taxonomy(q)
-                    break
-                except:
-                    pass
+        if self.is_enabled:
+
+            municipality = output.get('municipality','')
+            county = output.get('county','')
+            stateProvince = output.get('stateProvince','')
+            country = output.get('country','')
+
+            locality = output.get('locality','')
+
+            order = output.get('order','')
+            family = output.get('family','')
+            scientificName = output.get('scientificName','')
+            genus = output.get('genus','')
+            specificEpithet = output.get('specificEpithet','')
+
+
+            query_geo = ' '.join([municipality, county, stateProvince, country]).strip()
+            query_locality = locality.strip()
+            query_taxa_primary = scientificName.strip()
+            query_taxa_secondary = ' '.join([genus, specificEpithet]).strip()
+            query_taxa_tertiary = ' '.join([order, family, genus, specificEpithet]).strip()
+
+            # query_taxa = "Tracaulon sagittatum Tracaulon sagittatum"
+            # query_geo = "Indiana Porter Co."
+            # query_locality = "Mical Springs edge"
+
+            if query_geo:
+                try:
+                    self.gather_geo(query_geo)
+                except:
+                    pass
+
+            if query_locality:
+                try:
+                    self.gather_geo(query_locality,'locality')
+                except:
+                    pass
+
+            queries_taxa = [query_taxa_primary, query_taxa_secondary, query_taxa_tertiary]
+            for q in queries_taxa:
+                if q:
+                    try:
+                        self.gather_taxonomy(q)
+                        break
+                    except:
+                        pass
+
+            # print(self.info_packet)
+            # return self.info_packet
+            # self.gather_geo(query_geo)
         try:
             with open(self.json_file_path_wiki, 'w', encoding='utf-8') as file:
                 json.dump(self.info_packet, file, indent=4)
@@ -547,6 +550,13 @@
         return clean_text
 
 
+
+def validate_wikipedia(tool_wikipedia, json_file_path_wiki, output):
+    Wiki = WikipediaLinks(tool_wikipedia, json_file_path_wiki)
+    Wiki.gather_wikipedia_results(output)
+
+
+
 if __name__ == '__main__':
     test_output = {
         "filename": "MICH_7375774_Polygonaceae_Persicaria_",
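`validate_wikipedia` is the matching module-level wrapper for the Wikipedia tool. A sketch of a direct call, under the assumption that `output` carries the geographic and taxonomic keys read by `gather_wikipedia_results` above; the place and taxon strings are the placeholders from the commented-out queries, and the output path is arbitrary:

from vouchervision.tool_wikipedia import validate_wikipedia

# Keys match those read in gather_wikipedia_results(); values are the
# placeholder strings from the commented-out queries above, not real data.
output = {
    "municipality": "",
    "county": "Porter Co.",
    "stateProvince": "Indiana",
    "country": "",
    "locality": "Mical Springs edge",
    "order": "",
    "family": "",
    "scientificName": "Tracaulon sagittatum",
    "genus": "Tracaulon",
    "specificEpithet": "sagittatum",
}

# With tool_wikipedia=False the lookups are skipped; with True the info
# packet is gathered and written to the given JSON path.
validate_wikipedia(True, "wiki_info.json", output)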
vouchervision/utils_LLM.py
CHANGED
@@ -8,6 +8,60 @@ import psutil
 import threading
 import torch
 from datetime import datetime
+from vouchervision.tool_taxonomy_WFO import validate_taxonomy_WFO, WFONameMatcher
+from vouchervision.tool_geolocate_HERE import validate_coordinates_here
+from vouchervision.tool_wikipedia import validate_wikipedia
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+def run_tools(output, tool_WFO, tool_GEO, tool_wikipedia, json_file_path_wiki):
+    # Define a function that will catch and return the results of your functions
+    def task(func, *args, **kwargs):
+        return func(*args, **kwargs)
+
+    # List of tasks to run in separate threads
+    tasks = [
+        (validate_taxonomy_WFO, (tool_WFO, output, False)),
+        (validate_coordinates_here, (tool_GEO, output, False)),
+        (validate_wikipedia, (tool_wikipedia, json_file_path_wiki, output)),
+    ]
+
+    # Results storage
+    results = {}
+
+    # Use ThreadPoolExecutor to execute each function in its own thread
+    with ThreadPoolExecutor() as executor:
+        future_to_func = {executor.submit(task, func, *args): func.__name__ for func, args in tasks}
+        for future in as_completed(future_to_func):
+            func_name = future_to_func[future]
+            try:
+                # Collecting results
+                results[func_name] = future.result()
+            except Exception as exc:
+                print(f'{func_name} generated an exception: {exc}')
+
+    # Here, all threads have completed
+    # Extracting results
+    Matcher = WFONameMatcher(tool_WFO)
+    GEO_dict_null = {
+        'GEO_override_OCR': False,
+        'GEO_method': '',
+        'GEO_formatted_full_string': '',
+        'GEO_decimal_lat': '',
+        'GEO_decimal_long': '',
+        'GEO_city': '',
+        'GEO_county': '',
+        'GEO_state': '',
+        'GEO_state_code': '',
+        'GEO_country': '',
+        'GEO_country_code': '',
+        'GEO_continent': '',
+    }
+    output_WFO, WFO_record = results.get('validate_taxonomy_WFO', (output, Matcher.NULL_DICT))
+    output_GEO, GEO_record = results.get('validate_coordinates_here', (output, GEO_dict_null))
+
+    return output_WFO, WFO_record, output_GEO, GEO_record
+
 
 def save_individual_prompt(prompt_template, txt_file_path_ind_prompt):
     with open(txt_file_path_ind_prompt, 'w',encoding='utf-8') as file:
@@ -19,6 +73,16 @@ def remove_colons_and_double_apostrophes(text):
     return text.replace(":", "").replace("\"", "")
 
 
+def sanitize_prompt(data):
+    if isinstance(data, dict):
+        return {sanitize_prompt(key): sanitize_prompt(value) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [sanitize_prompt(element) for element in data]
+    elif isinstance(data, str):
+        return data.encode('utf-8', 'ignore').decode('utf-8')
+    else:
+        return data
+
 
 def count_tokens(string, vendor, model_name):
     full_string = string + JSON_FORMAT_INSTRUCTIONS
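`run_tools` fans the three validators out across a thread pool and collects each return value under the function's `__name__`, so two tasks sharing the same function would collide on one key. Below is the fan-out/collect pattern in isolation, with hypothetical stand-in tasks so it can be run and verified on its own:

from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_square(x):
    return x * x

def slow_cube(x):
    return x ** 3

# (func, args) pairs, mirroring the tasks list in run_tools().
tasks = [(slow_square, (3,)), (slow_cube, (2,))]
results = {}

with ThreadPoolExecutor() as executor:
    # Key each future by the function name so results can be looked up later.
    future_to_name = {executor.submit(func, *args): func.__name__ for func, args in tasks}
    for future in as_completed(future_to_name):
        name = future_to_name[future]
        try:
            results[name] = future.result()
        except Exception as exc:
            print(f'{name} generated an exception: {exc}')

assert results == {'slow_square': 9, 'slow_cube': 8}

Note that `run_tools` falls back to `(output, NULL_DICT)`-style defaults via `results.get(...)`, so a validator that raised inside its thread simply yields the unmodified record. The new `sanitize_prompt` helper is independent of this pattern: it recursively walks dicts, lists, and strings and drops any characters that cannot round-trip through UTF-8.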
vouchervision/utils_VoucherVision.py
CHANGED
@@ -43,7 +43,7 @@ class VoucherVision():
         self.prompt_version = None
         self.is_hf = is_hf
 
-        self.trOCR_model_version = "microsoft/trocr-large-handwritten"
+        # self.trOCR_model_version = "microsoft/trocr-large-handwritten"
         # self.trOCR_model_version = "microsoft/trocr-base-handwritten"
         # self.trOCR_model_version = "dh-unibe/trocr-medieval-escriptmask" # NOPE
         # self.trOCR_model_version = "dh-unibe/trocr-kurrent" # NOPE
@@ -59,6 +59,8 @@
         self.logger.name = f'[Transcription]'
         self.logger.info(f'Setting up OCR and LLM')
 
+        self.trOCR_model_version = self.cfg['leafmachine']['project']['trOCR_model_path']
+
         self.db_name = self.cfg['leafmachine']['project']['embeddings_database_name']
         self.path_domain_knowledge = self.cfg['leafmachine']['project']['path_to_domain_knowledge_xlsx']
         self.build_new_db = self.cfg['leafmachine']['project']['build_new_embeddings_database']
@@ -83,7 +85,7 @@
         self.wfo_headers = ["WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement"]
         self.wfo_headers_no_lists = ["WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_placement"]
 
-        self.utility_headers = ["filename"] + self.wfo_headers + self.geo_headers + self.usage_headers + ["run_name", "prompt", "LLM", "tokens_in", "tokens_out", "path_to_crop","path_to_original","path_to_content","path_to_helper",]
+        self.utility_headers = ["filename"] + self.wfo_headers + self.geo_headers + self.usage_headers + ["run_name", "prompt", "LLM", "tokens_in", "tokens_out", "LM2_collage", "OCR_method", "OCR_double", "OCR_trOCR", "path_to_crop","path_to_original","path_to_content","path_to_helper",]
         # "WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement",
 
         # "GEO_override_OCR", "GEO_method", "GEO_formatted_full_string", "GEO_decimal_lat",
@@ -298,7 +300,8 @@
                     break
 
 
-    def add_data_to_excel_from_response(self, Dirs, path_transcription, response, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, path_to_content, path_to_helper, nt_in, nt_out):
+    def add_data_to_excel_from_response(self, Dirs, path_transcription, response, WFO_record, GEO_record, usage_report,
+                                        MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, path_to_content, path_to_helper, nt_in, nt_out):
 
 
        wb = openpyxl.load_workbook(path_transcription)
@@ -367,7 +370,17 @@
                     sheet.cell(row=next_row, column=i, value=os.path.basename(self.path_custom_prompts))
                 elif header.value == "run_name":
                     sheet.cell(row=next_row, column=i, value=Dirs.run_name)
-
+                elif header.value == "LM2_collage":
+                    sheet.cell(row=next_row, column=i, value=self.cfg['leafmachine']['use_RGB_label_images'])
+                elif header.value == "OCR_method":
+                    value_to_insert = self.cfg['leafmachine']['project']['OCR_option']
+                    if isinstance(value_to_insert, list):
+                        value_to_insert = '|'.join(map(str, value_to_insert))
+                    sheet.cell(row=next_row, column=i, value=value_to_insert)
+                elif header.value == "OCR_double":
+                    sheet.cell(row=next_row, column=i, value=self.cfg['leafmachine']['project']['double_OCR'])
+                elif header.value == "OCR_trOCR":
+                    sheet.cell(row=next_row, column=i, value=self.cfg['leafmachine']['project']['do_use_trOCR'])
                 # "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement"
                 elif header.value in self.wfo_headers_no_lists:
                     sheet.cell(row=next_row, column=i, value=WFO_record.get(header.value, ''))
@@ -404,10 +417,11 @@
 
 
     def has_API_key(self, val):
-        if val != '':
-            return True
-        else:
-            return False
+        return isinstance(val, str) and bool(val.strip())
+        # if val != '':
+        #     return True
+        # else:
+        #     return False
 
 
     def get_google_credentials(self): # Also used for google drive
@@ -460,6 +474,7 @@
 
         self.has_key_openai = self.has_API_key(k_openai)
         self.has_key_azure_openai = self.has_API_key(k_openai_azure)
+        self.llm = None
 
         self.has_key_google_project_id = self.has_API_key(k_google_project_id)
         self.has_key_google_location = self.has_API_key(k_google_location)
@@ -470,12 +485,15 @@
         self.has_key_open_cage_geocode = self.has_API_key(k_opencage)
 
 
+
         ### Google - OCR, Palm2, Gemini
         if self.has_key_google_application_credentials and self.has_key_google_project_id and self.has_key_google_location:
             if self.is_hf:
                 vertexai.init(project=os.getenv('GOOGLE_PROJECT_ID'), location=os.getenv('GOOGLE_LOCATION'), credentials=self.get_google_credentials())
             else:
                 vertexai.init(project=k_google_project_id, location=k_google_location, credentials=self.get_google_credentials())
+                os.environ['GOOGLE_API_KEY'] = self.cfg_private['google']['GOOGLE_PALM_API']
+
 
         ### OpenAI
         if self.has_key_openai:
@@ -497,7 +515,6 @@
                     azure_endpoint = os.getenv('AZURE_API_BASE'),
                     openai_organization = os.getenv('AZURE_ORGANIZATION'),
                 )
-                self.has_key_azure_openai = True
 
             else:
                 # Initialize the Azure OpenAI client
@@ -508,7 +525,6 @@
                     azure_endpoint = self.cfg_private['openai_azure']['OPENAI_API_BASE'],
                     openai_organization = self.cfg_private['openai_azure']['OPENAI_ORGANIZATION'],
                 )
-                self.has_key_azure_openai = True
 
 
         ### Mistral
@@ -624,6 +640,7 @@
             ocr_google = OCREngine(self.logger, json_report, self.dir_home, self.is_hf, self.path_to_crop, self.cfg, self.trOCR_model_version, self.trOCR_model, self.trOCR_processor, self.device)
             ocr_google.process_image(self.do_create_OCR_helper_image, self.logger)
             self.OCR = ocr_google.OCR
+            self.logger.info(f"Complete OCR text for LLM prompt:\n\n{self.OCR}\n\n")
 
             self.write_json_to_file(txt_file_path_OCR, ocr_google.OCR_JSON_to_file)
 
@@ -671,7 +688,7 @@
 
         json_report.set_text(text_main=f'Loading {MODEL_NAME_FORMATTED}')
         json_report.set_JSON({}, {}, {})
-        llm_model = self.initialize_llm_model(self.logger, MODEL_NAME_FORMATTED, self.JSON_dict_structure, name_parts, is_azure, self.llm)
+        llm_model = self.initialize_llm_model(self.cfg, self.logger, MODEL_NAME_FORMATTED, self.JSON_dict_structure, name_parts, is_azure, self.llm)
 
         for i, path_to_crop in enumerate(self.img_paths):
             self.update_progress_report_batch(progress_report, i)
@@ -729,7 +746,7 @@
 
                 final_JSON_response, final_WFO_record, final_GEO_record = self.update_final_response(response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, paths, path_to_crop, nt_in, nt_out)
 
-                self.log_completion_info(final_JSON_response)
+                self.logger.info(f'Finished LLM call')
 
                 json_report.set_JSON(final_JSON_response, final_WFO_record, final_GEO_record)
 
@@ -741,22 +758,22 @@
     ##################################################################################################################################
     ################################################## LLM Helper Funcs ##############################################################
     ##################################################################################################################################
-    def initialize_llm_model(self, logger, model_name, JSON_dict_structure, name_parts, is_azure=None, llm_object=None):
+    def initialize_llm_model(self, cfg, logger, model_name, JSON_dict_structure, name_parts, is_azure=None, llm_object=None):
         if 'LOCAL'in name_parts:
             if ('MIXTRAL' in name_parts) or ('MISTRAL' in name_parts):
                 if 'CPU' in name_parts:
-                    return LocalCPUMistralHandler(logger, model_name, JSON_dict_structure)
+                    return LocalCPUMistralHandler(cfg, logger, model_name, JSON_dict_structure)
                 else:
-                    return LocalMistralHandler(logger, model_name, JSON_dict_structure)
+                    return LocalMistralHandler(cfg, logger, model_name, JSON_dict_structure)
         else:
             if 'PALM2' in name_parts:
-                return GooglePalm2Handler(logger, model_name, JSON_dict_structure)
+                return GooglePalm2Handler(cfg, logger, model_name, JSON_dict_structure)
             elif 'GEMINI' in name_parts:
-                return GoogleGeminiHandler(logger, model_name, JSON_dict_structure)
+                return GoogleGeminiHandler(cfg, logger, model_name, JSON_dict_structure)
            elif 'MISTRAL' in name_parts and ('LOCAL' not in name_parts):
-                return MistralHandler(logger, model_name, JSON_dict_structure)
+                return MistralHandler(cfg, logger, model_name, JSON_dict_structure)
            else:
-                return OpenAIHandler(logger, model_name, JSON_dict_structure, is_azure, llm_object)
+                return OpenAIHandler(cfg, logger, model_name, JSON_dict_structure, is_azure, llm_object)
 
     def setup_prompt(self):
         Catalog = PromptCatalog()
@@ -807,11 +824,6 @@
         return final_JSON_response_updated, WFO_record, GEO_record
 
 
-    def log_completion_info(self, final_JSON_response):
-        self.logger.info(f'Formatted JSON\n{final_JSON_response}')
-        self.logger.info(f'Finished API calls\n')
-
-
     def update_progress_report_final(self, progress_report):
         if progress_report is not None:
             progress_report.reset_batch("Batch Complete")
@@ -839,7 +851,8 @@
         return filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt
 
 
-    def save_json_and_xlsx(self, Dirs, response, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out):
+    def save_json_and_xlsx(self, Dirs, response, WFO_record, GEO_record, usage_report,
+                           MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out):
         if response is None:
             response = self.JSON_dict_structure
         # Insert 'filename' as the first key
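The rewritten `has_API_key` treats a key as present only when it is a string with non-whitespace content, so `None` and padded-empty values no longer pass. Its behavior, with the one-liner lifted from the diff into a free function for illustration:

# Behavior of the new has_API_key() check (logic copied from the diff above,
# shown as a free function for illustration).
def has_API_key(val):
    return isinstance(val, str) and bool(val.strip())

assert has_API_key("sk-abc123") is True
assert has_API_key("   ") is False   # whitespace-only keys no longer count
assert has_API_key("") is False
assert has_API_key(None) is False    # non-strings neither raise nor pass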
|