phyloforfun committed
Commit • 9d06861
Parent(s): 4d14f52
Major update. Support for 15 LLMs, World Flora Online taxonomy validation, geolocation, 2 OCR methods, significant UI changes, stability improvements, consistent JSON parsing
Files changed:
- app.py +6 -6
- custom_prompts/SLTPvA_long.yaml +22 -22
- custom_prompts/SLTPvA_medium.yaml +22 -22
- custom_prompts/SLTPvA_short.yaml +22 -22
- requirements.txt +0 -0
- vouchervision/LLM_GoogleGemini.py +18 -9
- vouchervision/LLM_GooglePalm2.py +18 -9
- vouchervision/LLM_MistralAI.py +20 -10
- vouchervision/LLM_OpenAI.py +19 -10
- vouchervision/LLM_local_MistralAI.py +21 -11
- vouchervision/LLM_local_cpu_MistralAI.py +20 -10
- vouchervision/directory_structure_VV.py +10 -0
- vouchervision/prompt_catalog.py +18 -1
- vouchervision/tool_wikipedia.py +581 -0
- vouchervision/utils_LLM.py +47 -3
- vouchervision/utils_LLM_JSON_validation.py +7 -5
- vouchervision/utils_VoucherVision.py +35 -19
- vouchervision/utils_hf.py +55 -37
- vouchervision/vouchervision_main.py +18 -10
app.py
CHANGED
@@ -27,7 +27,7 @@ st.set_page_config(layout="wide", page_icon='img/icon.ico', page_title='VoucherV

# Parse the 'is_hf' argument and set it in session state
if 'is_hf' not in st.session_state:
-    st.session_state['is_hf'] =
+    st.session_state['is_hf'] = False


########################################################################################################

@@ -141,7 +141,8 @@ def content_input_images(col_left, col_right):
        pass
    elif not st.session_state['view_local_gallery'] and not st.session_state['input_list_small'] and (st.session_state['dir_images_local_TEMP'] == st.session_state.config['leafmachine']['project']['dir_images_local']):
        pass
-    elif st.session_state['input_list_small'] and (st.session_state['dir_images_local_TEMP'] != st.session_state.config['leafmachine']['project']['dir_images_local']):
+    # elif st.session_state['input_list_small'] and (st.session_state['dir_images_local_TEMP'] != st.session_state.config['leafmachine']['project']['dir_images_local']):
+    elif (st.session_state['dir_images_local_TEMP'] != st.session_state.config['leafmachine']['project']['dir_images_local']):
        dir_images_local = st.session_state.config['leafmachine']['project']['dir_images_local']
        count_n_imgs = list_jpg_files(dir_images_local)
        st.session_state['processing_add_on'] = count_n_imgs

@@ -1012,8 +1013,6 @@ def save_prompt_yaml(filename, col):

    st.success(f"Prompt saved as '{filename}.yaml'.")

-    upload_to_drive(filepath, filename) # added
-
    with col: # added
        create_download_button_yaml(filepath, filename,key_val=2456237465) # added

@@ -1363,7 +1362,7 @@ def build_LLM_prompt_config():
    # This assumes that the column names are the keys in the dictionary under 'rules'
    all_column_names = list(st.session_state['rules'].keys())

-    categories = ['TAXONOMY', 'GEOGRAPHY', 'LOCALITY', 'COLLECTING', '
+    categories = ['TAXONOMY', 'GEOGRAPHY', 'LOCALITY', 'COLLECTING', 'MISC']
    if ('mapping' not in st.session_state) or (st.session_state['mapping'] == {}):
        st.session_state['mapping'] = {category: [] for category in categories}
        for category in categories:

@@ -1751,6 +1750,7 @@ def content_header():
                path_api_cost=os.path.join(st.session_state.dir_home,'api_cost','api_cost.yaml'),
                is_hf = st.session_state['is_hf'],
                is_real_run=True)
+
            st.balloons()
        except Exception as e:
            with col_run_4:

@@ -2020,7 +2020,7 @@ def content_collage_overlay():
    with col_collage:
        st.header('LeafMachine2 Label Collage')
        default_crops = st.session_state.config['leafmachine']['cropped_components']['save_cropped_annotations']
-        st.write("Prior to transcription, use LeafMachine2 to crop all labels from input images to create label collages for each specimen image.
+        st.write("Prior to transcription, use LeafMachine2 to crop all labels from input images to create label collages for each specimen image. Showing just the text labels to the OCR algorithms significantly improves performance. This runs slowly on the free Hugging Face Space, but runs quickly with a fast CPU or any GPU.")
        st.session_state.config['leafmachine']['use_RGB_label_images'] = st.checkbox("Use LeafMachine2 label collage for transcriptions", st.session_state.config['leafmachine'].get('use_RGB_label_images', False))
custom_prompts/SLTPvA_long.yaml
CHANGED
@@ -78,35 +78,35 @@ rules:
    are explicit then convert from feet ("ft" or "ft." or "feet") to meters ("m"
    or "m." or "meters"). Round to integer.
  mapping:
+    TAXONOMY:
+    - catalogNumber
+    - order
+    - family
+    - scientificName
+    - scientificNameAuthorship
+    - genus
+    - subgenus
+    - specificEpithet
+    - infraspecificEpithet
    GEOGRAPHY:
    - country
    - stateProvince
    - county
    - municipality
-    - minimumElevationInMeters
-    - maximumElevationInMeters
-    LOCALITY:
-    - locality
-    - habitat
    - decimalLatitude
    - decimalLongitude
    - verbatimCoordinates
+    LOCALITY:
+    - locality
+    - habitat
+    - minimumElevationInMeters
+    - maximumElevationInMeters
+    COLLECTING:
+    - identifiedBy
+    - recordedBy
+    - recordNumber
+    - verbatimEventDate
+    - eventDate
    - degreeOfEstablishment
    - occurrenceRemarks
-    - catalogNumber
-    - order
-    - family
-    - scientificName
-    - scientificNameAuthorship
-    - genus
-    - subgenus
-    - specificEpithet
-    - infraspecificEpithet
+    MISC:
custom_prompts/SLTPvA_medium.yaml
CHANGED
@@ -53,35 +53,35 @@ rules:
  minimumElevationInMeters: Minimum elevation or altitude in meters. Only if units are explicit then convert from feet ("ft" or "ft."" or "feet") to meters ("m" or "m." or "meters"). Round to integer.
  maximumElevationInMeters: Maximum elevation or altitude in meters. If only one elevation is present, then max_elevation should be set to the null_value. Only if units are explicit then convert from feet ("ft" or "ft." or "feet") to meters ("m" or "m." or "meters"). Round to integer.
  mapping:
+    TAXONOMY:
+    - catalogNumber
+    - order
+    - family
+    - scientificName
+    - scientificNameAuthorship
+    - genus
+    - subgenus
+    - specificEpithet
+    - infraspecificEpithet
    GEOGRAPHY:
    - country
    - stateProvince
    - county
    - municipality
-    - minimumElevationInMeters
-    - maximumElevationInMeters
-    LOCALITY:
-    - locality
-    - habitat
    - decimalLatitude
    - decimalLongitude
    - verbatimCoordinates
+    LOCALITY:
+    - locality
+    - habitat
+    - minimumElevationInMeters
+    - maximumElevationInMeters
+    COLLECTING:
+    - identifiedBy
+    - recordedBy
+    - recordNumber
+    - verbatimEventDate
+    - eventDate
    - degreeOfEstablishment
    - occurrenceRemarks
-    - catalogNumber
-    - order
-    - family
-    - scientificName
-    - scientificNameAuthorship
-    - genus
-    - subgenus
-    - specificEpithet
-    - infraspecificEpithet
+    MISC:
custom_prompts/SLTPvA_short.yaml
CHANGED
@@ -48,35 +48,35 @@ rules:
  minimumElevationInMeters: minimum elevation or altitude in meters.
  maximumElevationInMeters: maximum elevation or altitude in meters.
  mapping:
+    TAXONOMY:
+    - catalogNumber
+    - order
+    - family
+    - scientificName
+    - scientificNameAuthorship
+    - genus
+    - subgenus
+    - specificEpithet
+    - infraspecificEpithet
    GEOGRAPHY:
    - country
    - stateProvince
    - county
    - municipality
-    - minimumElevationInMeters
-    - maximumElevationInMeters
-    LOCALITY:
-    - locality
-    - habitat
    - decimalLatitude
    - decimalLongitude
    - verbatimCoordinates
+    LOCALITY:
+    - locality
+    - habitat
+    - minimumElevationInMeters
+    - maximumElevationInMeters
+    COLLECTING:
+    - identifiedBy
+    - recordedBy
+    - recordNumber
+    - verbatimEventDate
+    - eventDate
    - degreeOfEstablishment
    - occurrenceRemarks
+    MISC:
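The reordered mapping block above groups the same fields into TAXONOMY, GEOGRAPHY, LOCALITY, COLLECTING, and MISC in all three SLTPvA prompt files. The short sketch below is not part of the commit; it is a minimal, assumed consistency check that every field defined under rules lands in exactly one mapping category, using only PyYAML.

import yaml

def check_prompt_mapping(path_to_prompt_yaml):
    # Load the prompt file and compare the 'rules' keys with the 'mapping' block.
    with open(path_to_prompt_yaml, 'r') as f:
        cfg = yaml.safe_load(f)

    rule_keys = set(cfg.get('rules', {}).keys())
    seen = {}
    for category, fields in (cfg.get('mapping') or {}).items():
        for field in fields or []:
            if field in seen:
                print(f"{field} appears under both {seen[field]} and {category}")
            seen[field] = category
            if field not in rule_keys:
                print(f"{field} is mapped under {category} but has no rule")

    missing = rule_keys - set(seen)
    if missing:
        print(f"Fields with a rule but no mapping category: {sorted(missing)}")

# check_prompt_mapping('custom_prompts/SLTPvA_short.yaml')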
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
vouchervision/LLM_GoogleGemini.py
CHANGED
@@ -1,4 +1,4 @@
-import os, time
+import os, time, json
import vertexai
from vertexai.preview.generative_models import GenerativeModel
from vertexai.generative_models._generative_models import HarmCategory, HarmBlockThreshold

@@ -9,10 +9,11 @@ from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_vertexai import VertexAI

-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens
+from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
from vouchervision.utils_geolocate_HERE import validate_coordinates_here
+from vouchervision.tool_wikipedia import WikipediaLinks

class GoogleGeminiHandler:

@@ -95,7 +96,8 @@ class GoogleGeminiHandler:
            safety_settings=self.safety_settings)
        return response.text

-    def call_llm_api_GoogleGemini(self, prompt_template, json_report):
+    def call_llm_api_GoogleGemini(self, prompt_template, json_report, paths):
+        _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
        self.json_report = json_report
        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
        self.monitor.start_monitoring_usage()

@@ -125,19 +127,26 @@ class GoogleGeminiHandler:
                    self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response}')
                    self._adjust_config()
                else:
+                    self.monitor.stop_inference_timer() # Starts tool timer too
+
+                    json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                    output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False) ###################################### make this configurable
                    output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable

+                    Wiki = WikipediaLinks(json_file_path_wiki)
+                    Wiki.gather_wikipedia_results(output)
+
+                    save_individual_prompt(Wiki.sanitize(prompt_template), txt_file_path_ind_prompt)
+
+                    self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")

-                    self.monitor.stop_monitoring_report_usage()
+                    usage_report = self.monitor.stop_monitoring_report_usage()

                    if self.adjust_temp != self.starting_temp:
                        self._reset_config()

                    json_report.set_text(text_main=f'LLM call successful')
-                    return output, nt_in, nt_out, WFO_record, GEO_record
+                    return output, nt_in, nt_out, WFO_record, GEO_record, usage_report

            except Exception as e:
                self.logger.error(f'{e}')

@@ -148,10 +157,10 @@ class GoogleGeminiHandler:
        self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')

-        self.monitor.stop_monitoring_report_usage()
+        usage_report = self.monitor.stop_monitoring_report_usage()
        self._reset_config()

        json_report.set_text(text_main=f'LLM call failed')
-        return None, nt_in, nt_out, None, None
+        return None, nt_in, nt_out, None, None, usage_report
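The same calling convention recurs in every handler touched by this commit: call_llm_api_* now receives a paths tuple (only its last two slots, the per-image Wikipedia JSON path and the per-image prompt text path, are unpacked) and returns usage_report as a sixth value. The wrapper below is an illustrative sketch, not code from the repository, showing how a caller might consume that signature for the Gemini handler.

def run_one_image(handler, prompt_template, json_report, paths):
    # paths is the 7-tuple the handlers unpack; only the last two entries
    # (the per-image Wikipedia JSON path and the per-image prompt text path)
    # are used inside call_llm_api_GoogleGemini.
    output, nt_in, nt_out, WFO_record, GEO_record, usage_report = \
        handler.call_llm_api_GoogleGemini(prompt_template, json_report, paths)
    if output is None:
        # On failure the handler still reports token counts and system usage.
        print(f"LLM call failed; tokens in/out: {nt_in}/{nt_out}")
    return output, WFO_record, GEO_record, usage_report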
vouchervision/LLM_GooglePalm2.py
CHANGED
@@ -1,4 +1,4 @@
-import os, time
+import os, time, json
import vertexai
from vertexai.language_models import TextGenerationModel
from vertexai.generative_models._generative_models import HarmCategory, HarmBlockThreshold

@@ -11,10 +11,11 @@ from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_vertexai import VertexAI

-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens
+from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
from vouchervision.utils_geolocate_HERE import validate_coordinates_here
+from vouchervision.tool_wikipedia import WikipediaLinks

#https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk
#pip install --upgrade google-cloud-aiplatform

@@ -109,7 +110,8 @@ class GooglePalm2Handler:
        return response.text


-    def call_llm_api_GooglePalm2(self, prompt_template, json_report):
+    def call_llm_api_GooglePalm2(self, prompt_template, json_report, paths):
+        _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
        self.json_report = json_report
        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
        self.monitor.start_monitoring_usage()

@@ -139,19 +141,26 @@ class GooglePalm2Handler:
                    self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response}')
                    self._adjust_config()
                else:
+                    self.monitor.stop_inference_timer() # Starts tool timer too
+
+                    json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                    output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False) ###################################### make this configurable
                    output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable

+                    Wiki = WikipediaLinks(json_file_path_wiki)
+                    Wiki.gather_wikipedia_results(output)
+
+                    save_individual_prompt(Wiki.sanitize(prompt_template), txt_file_path_ind_prompt)
+
+                    self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")

-                    self.monitor.stop_monitoring_report_usage()
+                    usage_report = self.monitor.stop_monitoring_report_usage()

                    if self.adjust_temp != self.starting_temp:
                        self._reset_config()

                    json_report.set_text(text_main=f'LLM call successful')
-                    return output, nt_in, nt_out, WFO_record, GEO_record
+                    return output, nt_in, nt_out, WFO_record, GEO_record, usage_report

            except Exception as e:
                self.logger.error(f'{e}')

@@ -162,8 +171,8 @@ class GooglePalm2Handler:
        self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')

-        self.monitor.stop_monitoring_report_usage()
+        usage_report = self.monitor.stop_monitoring_report_usage()
        self._reset_config()

        json_report.set_text(text_main=f'LLM call failed')
-        return None, nt_in, nt_out, None, None
+        return None, nt_in, nt_out, None, None, usage_report
vouchervision/LLM_MistralAI.py
CHANGED
@@ -1,13 +1,14 @@
-import os, time, random, torch
+import os, time, random, torch, json
from langchain_mistralai.chat_models import ChatMistralAI
from langchain.output_parsers import RetryWithErrorOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens
+from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
from vouchervision.utils_geolocate_HERE import validate_coordinates_here
+from vouchervision.tool_wikipedia import WikipediaLinks


class MistralHandler:

@@ -78,7 +79,9 @@ class MistralHandler:

        self.chain = self.prompt | self.llm_model

-    def call_llm_api_MistralAI(self, prompt_template, json_report):
+    def call_llm_api_MistralAI(self, prompt_template, json_report, paths):
+        _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
+
        self.json_report = json_report
        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
        self.monitor.start_monitoring_usage()

@@ -109,22 +112,29 @@ class MistralHandler:
                    self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response}')
                    self._adjust_config()
                else:
+                    self.monitor.stop_inference_timer() # Starts tool timer too
+
+                    json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                    output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False) ###################################### make this configurable
                    output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable

+                    Wiki = WikipediaLinks(json_file_path_wiki)
+                    Wiki.gather_wikipedia_results(output)
+
+                    save_individual_prompt(Wiki.sanitize(prompt_template), txt_file_path_ind_prompt)
+
+                    self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")

-                    self.monitor.stop_monitoring_report_usage()
+                    usage_report = self.monitor.stop_monitoring_report_usage()

                    if self.adjust_temp != self.starting_temp:
                        self._reset_config()

                    json_report.set_text(text_main=f'LLM call successful')
-                    return output, nt_in, nt_out, WFO_record, GEO_record
+                    return output, nt_in, nt_out, WFO_record, GEO_record, usage_report

            except Exception as e:
-                self.logger.error(f'{e}')
+                self.logger.error(f'JSON Parsing Error (LangChain): {e}')

                self._adjust_config()
                time.sleep(self.RETRY_DELAY)

@@ -132,8 +142,8 @@ class MistralHandler:
        self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')

-        self.monitor.stop_monitoring_report_usage()
+        usage_report = self.monitor.stop_monitoring_report_usage()
        self._reset_config()
        json_report.set_text(text_main=f'LLM call failed')

-        return None, nt_in, nt_out, None, None
+        return None, nt_in, nt_out, None, None, usage_report
vouchervision/LLM_OpenAI.py
CHANGED
@@ -1,14 +1,15 @@
-import time, torch
+import time, torch, json
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAI
from langchain.schema import HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain.output_parsers import RetryWithErrorOutputParser

-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens
+from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
from vouchervision.utils_geolocate_HERE import validate_coordinates_here
+from vouchervision.tool_wikipedia import WikipediaLinks

class OpenAIHandler:
    RETRY_DELAY = 10 # Wait 10 seconds before retrying

@@ -98,7 +99,8 @@ class OpenAIHandler:
        self.chain = self.prompt | (self.format_input_for_azure if self.is_azure else ChatOpenAI(model=self.model_name))


-    def call_llm_api_OpenAI(self, prompt_template, json_report):
+    def call_llm_api_OpenAI(self, prompt_template, json_report, paths):
+        _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
        self.json_report = json_report
        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
        self.monitor.start_monitoring_usage()

@@ -130,19 +132,26 @@ class OpenAIHandler:
                    self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{response_text}')
                    self._adjust_config()
                else:
+                    self.monitor.stop_inference_timer() # Starts tool timer too

+                    json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
                    output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False) ###################################### make this configurable
                    output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable

+                    Wiki = WikipediaLinks(json_file_path_wiki)
+                    Wiki.gather_wikipedia_results(output)
+
+                    save_individual_prompt(Wiki.sanitize(prompt_template), txt_file_path_ind_prompt)
+
+                    self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")
+
+                    usage_report = self.monitor.stop_monitoring_report_usage()

-                    self.monitor.stop_monitoring_report_usage()
-
                    if self.adjust_temp != self.starting_temp:
                        self._reset_config()
+
                    json_report.set_text(text_main=f'LLM call successful')
-                    return output, nt_in, nt_out, WFO_record, GEO_record
+                    return output, nt_in, nt_out, WFO_record, GEO_record, usage_report

            except Exception as e:
                self.logger.error(f'{e}')

@@ -153,8 +162,8 @@ class OpenAIHandler:
        self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')

-        self.monitor.stop_monitoring_report_usage()
+        usage_report = self.monitor.stop_monitoring_report_usage()
        self._reset_config()

        json_report.set_text(text_main=f'LLM call failed')
-        return None, nt_in, nt_out, None, None
+        return None, nt_in, nt_out, None, None, usage_report
vouchervision/LLM_local_MistralAI.py
CHANGED
@@ -6,10 +6,11 @@ from langchain_core.output_parsers import JsonOutputParser
from huggingface_hub import hf_hub_download
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens
+from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
from vouchervision.utils_geolocate_HERE import validate_coordinates_here
+from vouchervision.tool_wikipedia import WikipediaLinks

'''
Local Pipielines:

@@ -147,7 +148,8 @@ class LocalMistralHandler:
        self.chain = self.prompt | self.local_model # LCEL


-    def call_llm_local_MistralAI(self, prompt_template, json_report):
+    def call_llm_local_MistralAI(self, prompt_template, json_report, paths):
+        _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
        self.json_report = json_report
        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
        self.monitor.start_monitoring_usage()

@@ -183,20 +185,28 @@ class LocalMistralHandler:
                    self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{results}')
                    self._adjust_config()
                else:
+                    self.monitor.stop_inference_timer() # Starts tool timer too
+
+                    json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
+                    output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False) ###################################### make this configurable
+                    output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable
+
+                    Wiki = WikipediaLinks(json_file_path_wiki)
+                    Wiki.gather_wikipedia_results(output)
+
+                    save_individual_prompt(Wiki.sanitize(prompt_template), txt_file_path_ind_prompt)

                    self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")

-                    self.monitor.stop_monitoring_report_usage()
+                    usage_report = self.monitor.stop_monitoring_report_usage()

                    if self.adjust_temp != self.starting_temp:
                        self._reset_config()

                    json_report.set_text(text_main=f'LLM call successful')
                    del results
-                    return output, nt_in, nt_out, WFO_record, GEO_record
+                    return output, nt_in, nt_out, WFO_record, GEO_record, usage_report

            except Exception as e:
                self.logger.error(f'{e}')
                self._adjust_config()

@@ -204,9 +214,9 @@ class LocalMistralHandler:
        self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')

-        self.monitor.stop_monitoring_report_usage()
+        usage_report = self.monitor.stop_monitoring_report_usage()
        json_report.set_text(text_main=f'LLM call failed')

        self._reset_config()
-        return None, nt_in, nt_out, None, None
+        return None, nt_in, nt_out, None, None, usage_report
vouchervision/LLM_local_cpu_MistralAI.py
CHANGED
@@ -18,10 +18,11 @@ from langchain.callbacks.base import BaseCallbackHandler
from huggingface_hub import hf_hub_download


-from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens
+from vouchervision.utils_LLM import SystemLoadMonitor, count_tokens, save_individual_prompt
from vouchervision.utils_LLM_JSON_validation import validate_and_align_JSON_keys_with_template
from vouchervision.utils_taxonomy_WFO import validate_taxonomy_WFO
from vouchervision.utils_geolocate_HERE import validate_coordinates_here
+from vouchervision.tool_wikipedia import WikipediaLinks

class LocalCPUMistralHandler:
    RETRY_DELAY = 2 # Wait 2 seconds before retrying

@@ -136,7 +137,8 @@ class LocalCPUMistralHandler:
        self.chain = self.prompt | self.local_model


-    def call_llm_local_cpu_MistralAI(self, prompt_template, json_report):
+    def call_llm_local_cpu_MistralAI(self, prompt_template, json_report, paths):
+        _____, ____, _, __, ___, json_file_path_wiki, txt_file_path_ind_prompt = paths
        self.json_report = json_report
        self.json_report.set_text(text_main=f'Sending request to {self.model_name}')
        self.monitor.start_monitoring_usage()

@@ -176,18 +178,26 @@ class LocalCPUMistralHandler:
                    self.logger.error(f'[Attempt {ind}] Failed to extract JSON from:\n{results}')
                    self._adjust_config()
                else:
+                    self.monitor.stop_inference_timer() # Starts tool timer too
+
+                    json_report.set_text(text_main=f'Working on WFO, Geolocation, Links')
+                    output, WFO_record = validate_taxonomy_WFO(output, replace_if_success_wfo=False) ###################################### make this configurable
+                    output, GEO_record = validate_coordinates_here(output, replace_if_success_geo=False) ###################################### make this configurable
+
+                    Wiki = WikipediaLinks(json_file_path_wiki)
+                    Wiki.gather_wikipedia_results(output)
+
+                    save_individual_prompt(Wiki.sanitize(prompt_template), txt_file_path_ind_prompt)

                    self.logger.info(f"Formatted JSON:\n{json.dumps(output,indent=4)}")

-                    self.monitor.stop_monitoring_report_usage()
+                    usage_report = self.monitor.stop_monitoring_report_usage()

                    if self.adjust_temp != self.starting_temp:
                        self._reset_config()

                    json_report.set_text(text_main=f'LLM call successful')
-                    return output, nt_in, nt_out, WFO_record, GEO_record
+                    return output, nt_in, nt_out, WFO_record, GEO_record, usage_report

            except Exception as e:
                self.logger.error(f'{e}')

@@ -196,10 +206,10 @@ class LocalCPUMistralHandler:
        self.logger.info(f"Failed to extract valid JSON after [{ind}] attempts")
        self.json_report.set_text(text_main=f'Failed to extract valid JSON after [{ind}] attempts')

-        self.monitor.stop_monitoring_report_usage()
+        usage_report = self.monitor.stop_monitoring_report_usage()
        self._reset_config()

        json_report.set_text(text_main=f'LLM call failed')
-        return None, nt_in, nt_out, None, None
+        return None, nt_in, nt_out, None, None, usage_report
vouchervision/directory_structure_VV.py
CHANGED
@@ -92,6 +92,16 @@ class Dir_Structure():
        self.transcription_ind_OCR_helper = os.path.join(self.dir_project,'Transcription','Individual_OCR_Helper')
        validate_dir(self.transcription_ind_OCR_helper)

+        self.transcription_ind_wiki = os.path.join(self.dir_project,'Transcription','Individual_Wikipedia')
+        validate_dir(self.transcription_ind_wiki)
+
+        self.transcription_ind_prompt = os.path.join(self.dir_project,'Transcription','Individual_Prompt')
+        validate_dir(self.transcription_ind_prompt)
+        self.transcription_prompt = os.path.join(self.dir_project,'Transcription','Prompt_Template')
+        validate_dir(self.transcription_prompt)
+
+
+
        self.save_original = os.path.join(self.dir_project,'Original_Images')
        validate_dir(self.save_original)
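These new folders receive the per-image Wikipedia results and the per-image prompt text written by the handlers. The helper below is purely hypothetical (the filename pattern is assumed, not taken from the repository) and only illustrates how per-image paths could be derived from the new attributes.

import os

def build_individual_paths(dir_structure, image_stem):
    # Hypothetical helper; the real filename pattern is defined elsewhere in the repo.
    json_file_path_wiki = os.path.join(dir_structure.transcription_ind_wiki, f"{image_stem}.json")
    txt_file_path_ind_prompt = os.path.join(dir_structure.transcription_ind_prompt, f"{image_stem}.txt")
    return json_file_path_wiki, txt_file_path_ind_prompt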
vouchervision/prompt_catalog.py
CHANGED
@@ -1,6 +1,6 @@
from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
-import yaml, json
+import yaml, json, os, shutil

@dataclass
class PromptCatalog:

@@ -69,6 +69,23 @@ class PromptCatalog:
        # return prompt, self.PromptJSONModel, self.n_fields, xlsx_headers
        return prompt, self.dictionary_structure

+
+    def copy_prompt_template_to_new_dir(self, new_directory_path, rules_config_path):
+        # Ensure the target directory exists, create it if it doesn't
+        if not os.path.exists(new_directory_path):
+            os.makedirs(new_directory_path)
+
+        # Define the path for the new file location
+        new_file_path = os.path.join(new_directory_path, os.path.basename(rules_config_path))
+
+        # Copy the file to the new location
+        try:
+            shutil.copy(rules_config_path, new_file_path)
+            print(f"Prompt [{os.path.basename(rules_config_path)}] copied successfully to {new_file_path}")
+        except Exception as exc:
+            print(f"Error copying [{os.path.basename(rules_config_path)}] file: {exc}")
+
+
    def load_rules_config(self):
        with open(self.rules_config_path, 'r') as stream:
            try:
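copy_prompt_template_to_new_dir archives the active prompt YAML alongside a run's outputs. The standalone snippet below mirrors what the method does, with placeholder paths for illustration.

import os, shutil

# Placeholder paths for illustration only.
rules_config_path = 'custom_prompts/SLTPvA_long.yaml'
new_directory_path = 'demo_run/Transcription/Prompt_Template'

# Make sure the run's Prompt_Template folder exists, then copy the YAML into it.
os.makedirs(new_directory_path, exist_ok=True)
shutil.copy(rules_config_path, os.path.join(new_directory_path, os.path.basename(rules_config_path)))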
vouchervision/tool_wikipedia.py
ADDED
@@ -0,0 +1,581 @@
import itertools, yaml,wikipediaapi, requests, re, json
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
# from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun


class WikipediaLinks():


    def __init__(self, json_file_path_wiki) -> None:
        self.json_file_path_wiki = json_file_path_wiki
        self.wiki_wiki = wikipediaapi.Wikipedia(
            user_agent='VoucherVision ([email protected])',
            language='en'
        )
        self.property_to_rank = {
            'P225': 'Species',
            'P171': 'Family',
            'P105': 'Taxon rank',
            'P70': 'Genus',
            'P75': 'Clade',
            'P76': 'Subgenus',
            'P67': 'Subfamily',
            'P66': 'Tribe',
            'P71': 'Subtribe',
            'P61': 'Order',
            'P72': 'Suborder',
            'P73': 'Infraorder',
            'P74': 'Superfamily',
            'P142': 'Phylum',
            'P75': 'Clade',
            'P76': 'Subclass',
            'P77': 'Infraclass',
            'P78': 'Superorder',
            'P81': 'Class',
            'P82': 'Superclass',
            'P84': 'Kingdom',
            'P85': 'Superkingdom',
            'P86': 'Subkingdom',
            'P87': 'Infrakingdom',
            'P88': 'Parvkingdom',
            'P89': 'Domain',
            'P1421': 'GRIN',
            'P1070': 'KEW',
            'P5037': 'POWOID',
        }


    def get_label_for_entity_id(self, entity_id):
        url = "https://www.wikidata.org/w/api.php"
        params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": entity_id,
            "props": "labels",
            "languages": "en" # Assuming you want the label in English
        }
        response = requests.get(url, params=params)
        data = response.json()
        return data['entities'][entity_id]['labels']['en']['value'] if 'en' in data['entities'][entity_id]['labels'] else None


    def is_valid_url(self, url):
        try:
            response = requests.head(url, allow_redirects=True, timeout=5)
            # If the response status code is 200, the URL is reachable
            return response.status_code == 200
        except requests.RequestException as e:
            # If there was some issue with the request, such as the domain does not exist
            # print(f"URL {url} is not reachable. Error: {e}")
            return False

    # def get_infobar_data(self, wiki_page_title):
    #     # Step 1: Extract the Wikidata Item ID from the Wikipedia page
    #     wiki_api_url = "https://en.wikipedia.org/w/api.php"
    #     wiki_params = {
    #         "action": "query",
    #         "format": "json",
    #         "titles": wiki_page_title,
    #         "prop": "revisions",
    #         "rvprop": "content",
    #         "rvslots": "*"
    #     }

    #     wiki_response = requests.get(wiki_api_url, params=wiki_params)
    #     wiki_data = wiki_response.json()

    #     page_key = next(iter(wiki_data['query']['pages']))
    #     content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']

    #     infobox_pattern = re.compile(r'\{\{Infobox.*?\|title\}\}', re.DOTALL)
    #     match = infobox_pattern.search(content)
    #     if match:
    #         wikidata_id = match.group(1) # Returns the full match including the 'Infobox' braces
    #     else:
    #         return "Infobox not found"

    #     # Step 2: Fetch Data from Wikidata Using the Extracted ID
    #     wikidata_api_url = "https://www.wikidata.org/w/api.php"
    #     wikidata_params = {
    #         "action": "wbgetentities",
    #         "format": "json",
    #         "ids": wikidata_id,
    #         "props": "claims" # Adjust as needed to fetch the desired data
    #     }

    #     wikidata_response = requests.get(wikidata_api_url, params=wikidata_params)
    #     wikidata_content = wikidata_response.json()


    #     classification_full = {}
    #     classification = {}
    #     label_cache = {} # Cache for labels


    #     # Turn this on to see the available properties to decode
    #     for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
    #         # Assuming the main snak value is what we want
    #         value = claims[0]['mainsnak']['datavalue']['value']
    #         if isinstance(value, dict): # If the value is an entity ID
    #             # entity_id = value['id']
    #             # entity_id = value['id']
    #             if prop_id not in label_cache:
    #                 label_cache[prop_id] = self.get_label_for_entity_id(prop_id)
    #             classification_full[prop_id] = label_cache[prop_id]
    #         else:
    #             classification_full[prop_id] = value
    #     print(classification_full)
    # Map Wikidata properties to the corresponding taxonomic ranks

    def convert_to_decimal(self, coord_parts):
        lat_deg, lat_min, lat_dir, lon_deg, lon_min, lon_dir = coord_parts[:6]

        lat = float(lat_deg) + float(lat_min) / 60
        lon = float(lon_deg) + float(lon_min) / 60

        if lat_dir == 'S':
            lat = -lat
        if lon_dir == 'W':
            lon = -lon

        return f"{lat},{lon}"


    def extract_coordinates_and_region(self, coord_string):
        # Extract the coordinate parts and region info
        coord_parts = re.findall(r'(\d+|\w+)', coord_string)
        region_info = re.search(r'region:([^|]+)\|display', coord_string)

        if coord_parts and len(coord_parts) >= 6:
            # Convert to decimal coordinates
            decimal_coords = self.convert_to_decimal(coord_parts)
        else:
            decimal_coords = "Invalid coordinates format"

        region = region_info.group(1) if region_info else "Region not found"
        return decimal_coords, region


    def parse_infobox(self, infobox_string):
        # Split the string into lines
        lines = infobox_string.split('\n')

        # Dictionary to store the extracted data
        infobox_data = {}

        # Iterate over each line
        for line in lines:
            # Split the line into key and value
            parts = line.split('=', 1)

            # If the line is properly formatted with a key and value
            if len(parts) == 2:
                key = parts[0].strip()
                key = key.split(' ')[1]
                value = parts[1].strip()

                # Handling special cases like links or coordinates
                if value.startswith('[[') and value.endswith(']]'):
                    # Extracting linked article titles
                    value = value[2:-2].split('|')[0]
                elif value.startswith('{{coord') and value.endswith('}}'):
                    # Extracting coordinates
                    value = value[7:-2]
                elif value.startswith('[') and value.endswith(']') and ('http' in value):
                    value = value[1:-1]
                    url_parts = value.split(" ")
                    infobox_data['url_location'] = next((part for part in url_parts if 'http' in part), None)

                if key == 'coordinates':
                    decimal_coordinates, region = self.extract_coordinates_and_region(value)
                    infobox_data['region'] = region
                    infobox_data['decimal_coordinates'] = decimal_coordinates

                key = self.sanitize(key)
                value = self.sanitize(value)
                value = self.remove_html_and_wiki_markup(value)
                # Add to dictionary
                infobox_data[key] = value

        return infobox_data

    def get_infobox_data(self, wiki_page_title, opt=None):
        wiki_api_url = "https://en.wikipedia.org/w/api.php"
        wiki_params = {
            "action": "query",
            "format": "json",
            "titles": wiki_page_title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "*"
        }

        try:
            wiki_response = requests.get(wiki_api_url, params=wiki_params)
            wiki_response.raise_for_status() # Check for HTTP errors
        except requests.RequestException as e:
            return f"Error fetching data: {e}"

        wiki_data = wiki_response.json()

        page_key = next(iter(wiki_data['query']['pages']), None)
        if page_key is None or "missing" in wiki_data['query']['pages'][page_key]:
            return "Page not found"

        content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']

        infobox_pattern = re.compile(r'\{\{Infobox.*?\}\}', re.DOTALL)
        match = infobox_pattern.search(content)

        if match:
            infobox_content = match.group()
        else:
            self.infobox_data = {}
            self.infobox_data_locality = {}
            return "Infobox not found"

        if opt is None:
            self.infobox_data = self.parse_infobox(infobox_content)
        else:
            self.infobox_data_locality = self.parse_infobox(infobox_content)



    # Example usage

    # for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
    #     # Get the taxonomic rank from the mapping
    #     rank = self.property_to_rank.get(prop_id)
    #     if rank:
    #         value = claims[0]['mainsnak']['datavalue']['value']
|
252 |
+
# if isinstance(value, dict): # If the value is an entity ID
|
253 |
+
# entity_id = value['id']
|
254 |
+
# if entity_id not in label_cache:
|
255 |
+
# label_cache[entity_id] = self.get_label_for_entity_id(entity_id)
|
256 |
+
# classification[rank] = label_cache[entity_id]
|
257 |
+
# else:
|
258 |
+
# classification[rank] = value
|
259 |
+
|
260 |
+
# try:
|
261 |
+
# unknown_link = "https://powo.science.kew.org/taxon/" + classification['POWOID']
|
262 |
+
# if self.is_valid_url(unknown_link):
|
263 |
+
# classification['POWOID'] = unknown_link
|
264 |
+
# classification['POWOID_syn'] = unknown_link + '#synonyms'
|
265 |
+
# except:
|
266 |
+
# pass
|
267 |
+
# return classification
|
268 |
+
|
269 |
+
|
270 |
+
|
271 |
+
def get_taxonbar_data(self, wiki_page_title):
|
272 |
+
# Step 1: Extract the Wikidata Item ID from the Wikipedia page
|
273 |
+
wiki_api_url = "https://en.wikipedia.org/w/api.php"
|
274 |
+
wiki_params = {
|
275 |
+
"action": "query",
|
276 |
+
"format": "json",
|
277 |
+
"titles": wiki_page_title,
|
278 |
+
"prop": "revisions",
|
279 |
+
"rvprop": "content",
|
280 |
+
"rvslots": "*"
|
281 |
+
}
|
282 |
+
|
283 |
+
wiki_response = requests.get(wiki_api_url, params=wiki_params)
|
284 |
+
wiki_data = wiki_response.json()
|
285 |
+
|
286 |
+
page_key = next(iter(wiki_data['query']['pages']))
|
287 |
+
content = wiki_data['query']['pages'][page_key]['revisions'][0]['slots']['main']['*']
|
288 |
+
|
289 |
+
taxonbar_match = re.search(r'\{\{Taxonbar\|from=(Q\d+)\}\}', content)
|
290 |
+
if not taxonbar_match:
|
291 |
+
return "Taxonbar not found"
|
292 |
+
|
293 |
+
wikidata_id = taxonbar_match.group(1)
|
294 |
+
|
295 |
+
# Step 2: Fetch Data from Wikidata Using the Extracted ID
|
296 |
+
wikidata_api_url = "https://www.wikidata.org/w/api.php"
|
297 |
+
wikidata_params = {
|
298 |
+
"action": "wbgetentities",
|
299 |
+
"format": "json",
|
300 |
+
"ids": wikidata_id,
|
301 |
+
"props": "claims" # Adjust as needed to fetch the desired data
|
302 |
+
}
|
303 |
+
|
304 |
+
wikidata_response = requests.get(wikidata_api_url, params=wikidata_params)
|
305 |
+
wikidata_content = wikidata_response.json()
|
306 |
+
|
307 |
+
|
308 |
+
classification_full = {}
|
309 |
+
classification = {}
|
310 |
+
label_cache = {} # Cache for labels
|
311 |
+
|
312 |
+
|
313 |
+
# Turn this on to see the available properties to decode
|
314 |
+
# for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
|
315 |
+
# # Assuming the main snak value is what we want
|
316 |
+
# value = claims[0]['mainsnak']['datavalue']['value']
|
317 |
+
# if isinstance(value, dict): # If the value is an entity ID
|
318 |
+
# # entity_id = value['id']
|
319 |
+
# # entity_id = value['id']
|
320 |
+
# if prop_id not in label_cache:
|
321 |
+
# label_cache[prop_id] = self.get_label_for_entity_id(prop_id)
|
322 |
+
# classification_full[prop_id] = label_cache[prop_id]
|
323 |
+
# else:
|
324 |
+
# classification_full[prop_id] = value
|
325 |
+
# print(classification_full)
|
326 |
+
# Map Wikidata properties to the corresponding taxonomic ranks
|
327 |
+
|
328 |
+
|
329 |
+
for prop_id, claims in wikidata_content['entities'][wikidata_id]['claims'].items():
|
330 |
+
# Get the taxonomic rank from the mapping
|
331 |
+
rank = self.property_to_rank.get(prop_id)
|
332 |
+
if rank:
|
333 |
+
value = claims[0]['mainsnak']['datavalue']['value']
|
334 |
+
if isinstance(value, dict): # If the value is an entity ID
|
335 |
+
entity_id = value['id']
|
336 |
+
if entity_id not in label_cache:
|
337 |
+
label_cache[entity_id] = self.get_label_for_entity_id(entity_id)
|
338 |
+
classification[rank] = label_cache[entity_id]
|
339 |
+
else:
|
340 |
+
classification[rank] = value
|
341 |
+
|
342 |
+
try:
|
343 |
+
unknown_link = "https://powo.science.kew.org/taxon/" + classification['POWOID']
|
344 |
+
if self.is_valid_url(unknown_link):
|
345 |
+
classification['POWOID'] = unknown_link
|
346 |
+
classification['POWOID_syn'] = unknown_link + '#synonyms'
|
347 |
+
except:
|
348 |
+
pass
|
349 |
+
return classification
|
350 |
+
|
351 |
+
|
352 |
+
def extract_page_title(self, result_string):
|
353 |
+
first_line = result_string.split('\n')[0]
|
354 |
+
page_title = first_line.replace('Page: ', '').strip()
|
355 |
+
return page_title
|
356 |
+
|
357 |
+
|
358 |
+
def get_wikipedia_url(self, page_title):
|
359 |
+
page = self.wiki_wiki.page(page_title)
|
360 |
+
if page.exists():
|
361 |
+
return page.fullurl
|
362 |
+
else:
|
363 |
+
return None
|
364 |
+
|
365 |
+
|
366 |
+
def extract_info_taxa(self, page):
|
367 |
+
links = []
|
368 |
+
self.info_packet['WIKI_TAXA']['LINKS'] = {}
|
369 |
+
self.info_packet['WIKI_TAXA']['DATA'] = {}
|
370 |
+
|
371 |
+
self.info_packet['WIKI_TAXA']['DATA'].update(self.get_taxonbar_data(page.title))
|
372 |
+
|
373 |
+
for back in page.backlinks:
|
374 |
+
back = self.sanitize(back)
|
375 |
+
if ':' not in back:
|
376 |
+
link = self.sanitize(self.get_wikipedia_url(back))
|
377 |
+
if link not in links:
|
378 |
+
links.append(link)
|
379 |
+
self.info_packet['WIKI_TAXA']['LINKS'][back] = link
|
380 |
+
|
381 |
+
|
382 |
+
def extract_info_geo(self, page, opt=None):
|
383 |
+
links = []
|
384 |
+
self.info_packet['WIKI_GEO']['LINKS'] = {}
|
385 |
+
if opt is None:
|
386 |
+
self.get_infobox_data(page.title)
|
387 |
+
else:
|
388 |
+
self.get_infobox_data(page.title,opt=opt)
|
389 |
+
|
390 |
+
for back in itertools.islice(page.backlinks, 10):
|
391 |
+
back = self.sanitize(back)
|
392 |
+
if ':' not in back:
|
393 |
+
link = self.sanitize(self.get_wikipedia_url(back))
|
394 |
+
if link not in links:
|
395 |
+
links.append(link)
|
396 |
+
self.info_packet['WIKI_GEO']['LINKS'][back] = link
|
397 |
+
|
398 |
+
|
399 |
+
def gather_geo(self, query,opt=None):
|
400 |
+
if opt is None:
|
401 |
+
self.info_packet['WIKI_GEO']['DATA'] = {}
|
402 |
+
else:
|
403 |
+
self.info_packet['WIKI_LOCALITY']['DATA'] = {}
|
404 |
+
|
405 |
+
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
|
406 |
+
|
407 |
+
result = wikipedia.run(query)
|
408 |
+
summary = result.split('Summary:')[1]
|
409 |
+
summary = self.sanitize(summary)
|
410 |
+
# print(result)
|
411 |
+
page_title = self.extract_page_title(result)
|
412 |
+
|
413 |
+
page = self.wiki_wiki.page(page_title)
|
414 |
+
|
415 |
+
# Do these first, they are less likely to fail
|
416 |
+
if opt is None:
|
417 |
+
self.info_packet['WIKI_GEO']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
|
418 |
+
self.info_packet['WIKI_GEO']['PAGE_TITLE'] = page_title
|
419 |
+
self.info_packet['WIKI_GEO']['SUMMARY'] = summary
|
420 |
+
|
421 |
+
else:
|
422 |
+
self.info_packet['WIKI_LOCALITY']['PAGE_TITLE'] = page_title
|
423 |
+
self.info_packet['WIKI_LOCALITY']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
|
424 |
+
self.info_packet['WIKI_LOCALITY']['SUMMARY'] = summary
|
425 |
+
|
426 |
+
|
427 |
+
# Check if the page exists, get the more complex data. Do it last in case of failure ########################## This might not be useful enough to justify the time
|
428 |
+
# if page.exists():
|
429 |
+
# if opt is None:
|
430 |
+
# self.extract_info_geo(page)
|
431 |
+
# else:
|
432 |
+
# self.extract_info_geo(page, opt=opt)
|
433 |
+
|
434 |
+
if opt is None:
|
435 |
+
self.info_packet['WIKI_GEO']['DATA'].update(self.infobox_data)
|
436 |
+
else:
|
437 |
+
self.info_packet['WIKI_LOCALITY']['DATA'].update(self.infobox_data_locality)
|
438 |
+
|
439 |
+
|
440 |
+
def gather_taxonomy(self, query):
|
441 |
+
wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
|
442 |
+
|
443 |
+
# query = "Tracaulon sagittatum Tracaulon sagittatum"
|
444 |
+
result = wikipedia.run(query)
|
445 |
+
summary = result.split('Summary:')[1]
|
446 |
+
summary = self.sanitize(summary)
|
447 |
+
# print(result)
|
448 |
+
page_title = self.extract_page_title(result)
|
449 |
+
|
450 |
+
page = self.wiki_wiki.page(page_title)
|
451 |
+
|
452 |
+
# Check if the page exists
|
453 |
+
if page.exists():
|
454 |
+
self.extract_info_taxa(page)
|
455 |
+
|
456 |
+
self.info_packet['WIKI_TAXA']['PAGE_TITLE'] = page_title
|
457 |
+
self.info_packet['WIKI_TAXA']['PAGE_LINK'] = self.get_wikipedia_url(page_title)
|
458 |
+
self.info_packet['WIKI_TAXA']['SUMMARY'] = summary
|
459 |
+
return self.info_packet
|
460 |
+
|
461 |
+
|
462 |
+
def gather_wikipedia_results(self, output):
|
463 |
+
self.info_packet = {}
|
464 |
+
self.info_packet['WIKI_TAXA'] = {}
|
465 |
+
self.info_packet['WIKI_GEO'] = {}
|
466 |
+
self.info_packet['WIKI_LOCALITY'] = {}
|
467 |
+
|
468 |
+
municipality = output.get('municipality','')
|
469 |
+
county = output.get('county','')
|
470 |
+
stateProvince = output.get('stateProvince','')
|
471 |
+
country = output.get('country','')
|
472 |
+
|
473 |
+
locality = output.get('locality','')
|
474 |
+
|
475 |
+
order = output.get('order','')
|
476 |
+
family = output.get('family','')
|
477 |
+
scientificName = output.get('scientificName','')
|
478 |
+
genus = output.get('genus','')
|
479 |
+
specificEpithet = output.get('specificEpithet','')
|
480 |
+
|
481 |
+
|
482 |
+
query_geo = ' '.join([municipality, county, stateProvince, country]).strip()
|
483 |
+
query_locality = locality.strip()
|
484 |
+
query_taxa_primary = scientificName.strip()
|
485 |
+
query_taxa_secondary = ' '.join([genus, specificEpithet]).strip()
|
486 |
+
query_taxa_tertiary = ' '.join([order, family, genus, specificEpithet]).strip()
|
487 |
+
|
488 |
+
# query_taxa = "Tracaulon sagittatum Tracaulon sagittatum"
|
489 |
+
# query_geo = "Indiana Porter Co."
|
490 |
+
# query_locality = "Mical Springs edge"
|
491 |
+
|
492 |
+
if query_geo:
|
493 |
+
try:
|
494 |
+
self.gather_geo(query_geo)
|
495 |
+
except:
|
496 |
+
pass
|
497 |
+
|
498 |
+
if query_locality:
|
499 |
+
try:
|
500 |
+
self.gather_geo(query_locality,'locality')
|
501 |
+
except:
|
502 |
+
pass
|
503 |
+
|
504 |
+
queries_taxa = [query_taxa_primary, query_taxa_secondary, query_taxa_tertiary]
|
505 |
+
for q in queries_taxa:
|
506 |
+
if q:
|
507 |
+
try:
|
508 |
+
self.gather_taxonomy(q)
|
509 |
+
break
|
510 |
+
except:
|
511 |
+
pass
|
512 |
+
|
513 |
+
# print(self.info_packet)
|
514 |
+
# return self.info_packet
|
515 |
+
# self.gather_geo(query_geo)
|
516 |
+
try:
|
517 |
+
with open(self.json_file_path_wiki, 'w', encoding='utf-8') as file:
|
518 |
+
json.dump(self.info_packet, file, indent=4)
|
519 |
+
except:
|
520 |
+
sanitized_data = self.sanitize(self.info_packet)
|
521 |
+
with open(self.json_file_path_wiki, 'w', encoding='utf-8') as file:
|
522 |
+
json.dump(sanitized_data, file, indent=4)
|
523 |
+
|
524 |
+
|
525 |
+
def sanitize(self, data):
|
526 |
+
if isinstance(data, dict):
|
527 |
+
return {self.sanitize(key): self.sanitize(value) for key, value in data.items()}
|
528 |
+
elif isinstance(data, list):
|
529 |
+
return [self.sanitize(element) for element in data]
|
530 |
+
elif isinstance(data, str):
|
531 |
+
return data.encode('utf-8', 'ignore').decode('utf-8')
|
532 |
+
else:
|
533 |
+
return data
|
534 |
+
|
535 |
+
def remove_html_and_wiki_markup(self, text):
|
536 |
+
# Remove HTML tags
|
537 |
+
clean_text = re.sub(r'<.*?>', '', text)
|
538 |
+
|
539 |
+
# Remove Wiki links but keep the text inside
|
540 |
+
# For example, '[[Greg Abbott]]' becomes 'Greg Abbott'
|
541 |
+
clean_text = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', clean_text)
|
542 |
+
|
543 |
+
# Remove Wiki template markup, e.g., '{{nowrap|text}}' becomes 'text'
|
544 |
+
clean_text = re.sub(r'\{\{(?:[^\}|]*\|)?([^\}|]*)\}\}', r'\1', clean_text)
|
545 |
+
|
546 |
+
return clean_text
|
547 |
+
|
548 |
+
|
549 |
+
if __name__ == '__main__':
|
550 |
+
test_output = {
|
551 |
+
"filename": "MICH_7375774_Polygonaceae_Persicaria_",
|
552 |
+
"catalogNumber": "1439649",
|
553 |
+
"order": "",
|
554 |
+
"family": "",
|
555 |
+
"scientificName": "Tracaulon sagittatum",
|
556 |
+
"scientificNameAuthorship": "",
|
557 |
+
"genus": "Tracaulon",
|
558 |
+
"subgenus": "",
|
559 |
+
"specificEpithet": "sagittatum",
|
560 |
+
"infraspecificEpithet": "",
|
561 |
+
"identifiedBy": "",
|
562 |
+
"recordedBy": "Marcus W. Lyon, Jr.",
|
563 |
+
"recordNumber": "TX 11",
|
564 |
+
"verbatimEventDate": "1927",
|
565 |
+
"eventDate": "1927-00-00",
|
566 |
+
"habitat": "wet subdunal woods",
|
567 |
+
"occurrenceRemarks": "Flowers pink",
|
568 |
+
"country": "Indiana",
|
569 |
+
"stateProvince": "Porter Co.",
|
570 |
+
"county": "",
|
571 |
+
"municipality": "",
|
572 |
+
"locality": "Mical Springs edge",
|
573 |
+
"degreeOfEstablishment": "",
|
574 |
+
"decimalLatitude": "",
|
575 |
+
"decimalLongitude": "",
|
576 |
+
"verbatimCoordinates": "",
|
577 |
+
"minimumElevationInMeters": "",
|
578 |
+
"maximumElevationInMeters": ""
|
579 |
+
}
|
580 |
+
Wiki = WikipediaLinks()
|
581 |
+
info_packet= Wiki.gather_wikipedia_results(test_output)
|
vouchervision/utils_LLM.py
CHANGED
@@ -1,5 +1,5 @@
 # Helper funcs for LLM_XXXXX.py
-import tiktoken, json, os
+import tiktoken, json, os, yaml
 from langchain_core.output_parsers.format_instructions import JSON_FORMAT_INSTRUCTIONS
 from transformers import AutoTokenizer
 import GPUtil
@@ -7,6 +7,12 @@ import time
 import psutil
 import threading
 import torch
+from datetime import datetime
+
+def save_individual_prompt(prompt_template, txt_file_path_ind_prompt):
+    with open(txt_file_path_ind_prompt, 'w', encoding='utf-8') as file:
+        file.write(prompt_template)
+
 
 
 def remove_colons_and_double_apostrophes(text):
@@ -45,6 +51,7 @@ class SystemLoadMonitor():
         self.logger = logger
         self.gpu_usage = {'max_cpu_usage': 0, 'max_load': 0, 'max_vram_usage': 0, "max_ram_usage": 0, 'monitoring': True}
         self.start_time = None
+        self.tool_start_time = None
         self.has_GPU = torch.cuda.is_available()
         self.monitor_interval = 2
 
@@ -53,6 +60,12 @@ class SystemLoadMonitor():
         self.monitoring_thread = threading.Thread(target=self.monitor_usage, args=(self.monitor_interval,))
         self.monitoring_thread.start()
 
+    def stop_inference_timer(self):
+        # Stop inference timer and record elapsed time
+        self.inference_time = time.time() - self.start_time
+        # Immediately start the tool timer
+        self.tool_start_time = time.time()
+
     def monitor_usage(self, interval):
         while self.gpu_usage['monitoring']:
             # GPU monitoring
@@ -73,18 +86,49 @@ class SystemLoadMonitor():
             self.gpu_usage['max_cpu_usage'] = max(self.gpu_usage.get('max_cpu_usage', 0), cpu_usage)
             time.sleep(interval)
 
+    def get_current_datetime(self):
+        # Get the current date and time
+        now = datetime.now()
+        # Format it as a string, replacing colons with underscores
+        datetime_iso = now.strftime('%Y_%m_%dT%H_%M_%S')
+        return datetime_iso
+
     def stop_monitoring_report_usage(self):
+        report = {}
+
         self.gpu_usage['monitoring'] = False
         self.monitoring_thread.join()
+        # Calculate tool time by checking if tool_start_time is set
+        if self.tool_start_time:
+            tool_time = time.time() - self.tool_start_time
+        else:
+            tool_time = 0
+
+        report = {'inference_time_s': str(round(self.inference_time, 2)),
+                  'tool_time_s': str(round(tool_time, 2)),
+                  'max_cpu': str(round(self.gpu_usage['max_cpu_usage'], 2)),
+                  'max_ram_gb': str(round(self.gpu_usage['max_ram_usage'], 2)),
+                  'current_time': self.get_current_datetime(),
+                  }
+        self.logger.info(f"Inference Time: {round(self.inference_time,2)} seconds")
+        self.logger.info(f"Tool Time: {round(tool_time,2)} seconds")
 
         self.logger.info(f"Max CPU Usage: {round(self.gpu_usage['max_cpu_usage'],2)}%")
         self.logger.info(f"Max RAM Usage: {round(self.gpu_usage['max_ram_usage'],2)}GB")
 
         if self.has_GPU:
+            report.update({'max_gpu_load': str(round(self.gpu_usage['max_load']*100, 2))})
+            report.update({'max_gpu_vram_gb': str(round(self.gpu_usage['max_vram_usage'], 2))})
+
             self.logger.info(f"Max GPU Load: {round(self.gpu_usage['max_load']*100,2)}%")
             self.logger.info(f"Max GPU Memory Usage: {round(self.gpu_usage['max_vram_usage'],2)}GB")
+        else:
+            report.update({'max_gpu_load': str(0)})
+            report.update({'max_gpu_vram_gb': str(0)})
+
+        return report
 
 
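Not part of the commit: a minimal sketch of how an LLM handler is expected to drive the updated SystemLoadMonitor. The name of the method that starts the monitoring thread is not visible in this hunk, so `start_monitoring_usage()` below is an assumption; the rest follows the methods shown above.

# Illustrative only; start_monitoring_usage is an assumed starter-method name.
import logging, time

logger = logging.getLogger("demo")
monitor = SystemLoadMonitor(logger)
monitor.start_monitoring_usage()       # assumed: spawns the monitoring thread, sets start_time

time.sleep(0.1)                        # ... the LLM call happens here ...
monitor.stop_inference_timer()         # freezes inference_time, starts the tool timer

time.sleep(0.1)                        # ... WFO / geolocation / Wikipedia tools run here ...
usage_report = monitor.stop_monitoring_report_usage()
# usage_report holds inference_time_s, tool_time_s, max_cpu, max_ram_gb,
# current_time, and the GPU fields, all as strings ready for the XLSX row.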
vouchervision/utils_LLM_JSON_validation.py
CHANGED
@@ -14,18 +14,20 @@ def validate_and_align_JSON_keys_with_template(data, JSON_dict_structure):
         if value.lower() in ['unknown', 'not provided', 'missing', 'na', 'none', 'n/a', 'null',
                              'not provided in the text', 'not found in the text',
                              'not in the text', 'not provided', 'not found',
-                             'not provided in the
-                             'not in the
-                             'not provided in the
+                             'not provided in the ocr', 'not found in the ocr',
+                             'not in the ocr',
+                             'not provided in the ocr text', 'not found in the ocr text',
                              "not specified in the given text.",
                              "not specified in the given text",
                              "not specified in the text.",
                              "not specified in the text",
                              "not specified in text.",
                              "not specified in text",
-                             "not specified in
+                             "not specified in ocr",
                              "not specified",
-                             'not in the
+                             'not in the ocr text',
+                             'Not provided in ocr text',
+                             'not provided in ocr text',
                              'n/a n/a','n/a, n/a',
                              'n/a, n/a, n/a','n/a n/a, n/a','n/a, n/a n/a','n/a n/a n/a',
                              'n/a, n/a, n/a, n/a','n/a n/a n/a n/a','n/a n/a, n/a, n/a','n/a, n/a n/a, n/a','n/a, n/a, n/a n/a',
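Not part of the commit: the rule this growing list implements, reduced to a standalone sketch. The real validate_and_align_JSON_keys_with_template does more (key alignment against the template); the set below is a small hypothetical subset of the placeholders listed above.

# Illustrative only: collapse any LLM placeholder string to an empty value.
PLACEHOLDER_VALUES = {
    'unknown', 'n/a', 'none', 'null',
    'not provided in the ocr', 'not found in the ocr text',
    'not specified in ocr', 'not in the ocr text',
}

def normalize_placeholder(value: str) -> str:
    # Case-insensitive match against the known placeholder phrases
    return '' if value.strip().lower() in PLACEHOLDER_VALUES else value

assert normalize_placeholder('Not provided in the OCR') == ''
assert normalize_placeholder('Michigan') == 'Michigan'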
vouchervision/utils_VoucherVision.py
CHANGED
@@ -46,6 +46,7 @@ class VoucherVision():
 
         # self.trOCR_model_version = "microsoft/trocr-large-handwritten"
         self.trOCR_model_version = "microsoft/trocr-base-handwritten"
+        # self.trOCR_model_version = "dh-unibe/trocr-medieval-escriptmask"
         self.trOCR_processor = None
         self.trOCR_model = None
 
@@ -76,10 +77,12 @@
             "GEO_decimal_long","GEO_city", "GEO_county", "GEO_state",
             "GEO_state_code", "GEO_country", "GEO_country_code", "GEO_continent",]
 
+        self.usage_headers = ["current_time", "inference_time_s", "tool_time_s","max_cpu", "max_ram_gb", "max_gpu_load", "max_gpu_vram_gb",]
+
         self.wfo_headers = ["WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement"]
         self.wfo_headers_no_lists = ["WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_placement"]
 
-        self.utility_headers = ["filename"] + self.wfo_headers + self.geo_headers + ["tokens_in", "tokens_out", "path_to_crop","path_to_original","path_to_content","path_to_helper",]
+        self.utility_headers = ["filename"] + self.wfo_headers + self.geo_headers + self.usage_headers + ["prompt", "LLM", "tokens_in", "tokens_out", "path_to_crop","path_to_original","path_to_content","path_to_helper",]
         # "WFO_override_OCR", "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement",
 
         # "GEO_override_OCR", "GEO_method", "GEO_formatted_full_string", "GEO_decimal_lat",
@@ -294,7 +297,7 @@
                     break
 
 
-    def add_data_to_excel_from_response(self, path_transcription, response, WFO_record, GEO_record, filename_without_extension, path_to_crop, path_to_content, path_to_helper, nt_in, nt_out):
+    def add_data_to_excel_from_response(self, path_transcription, response, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, path_to_content, path_to_helper, nt_in, nt_out):
 
 
        wb = openpyxl.load_workbook(path_transcription)
@@ -359,6 +362,8 @@
                 sheet.cell(row=next_row, column=i, value=nt_out)
             elif header.value == "filename":
                 sheet.cell(row=next_row, column=i, value=filename_without_extension)
+            elif header.value == "prompt":
+                sheet.cell(row=next_row, column=i, value=os.path.basename(self.path_custom_prompts))
 
             # "WFO_exact_match","WFO_exact_match_name","WFO_best_match","WFO_candidate_names","WFO_placement"
             elif header.value in self.wfo_headers_no_lists:
@@ -385,6 +390,12 @@
             elif header.value in self.geo_headers:
                 sheet.cell(row=next_row, column=i, value=GEO_record.get(header.value, ''))
 
+            elif header.value in self.usage_headers:
+                sheet.cell(row=next_row, column=i, value=usage_report.get(header.value, ''))
+
+            elif header.value == "LLM":
+                sheet.cell(row=next_row, column=i, value=MODEL_NAME_FORMATTED)
+
         # save the workbook
         wb.save(path_transcription)
 
@@ -396,7 +407,7 @@
             return False
 
 
-    def get_google_credentials(self):
+    def get_google_credentials(self): # Also used for google drive
         if self.is_hf:
             creds_json_str = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
             credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
@@ -651,6 +662,9 @@
         name_parts = model_name.split("_")
 
         self.setup_JSON_dict_structure()
+
+        Copy_Prompt = PromptCatalog()
+        Copy_Prompt.copy_prompt_template_to_new_dir(self.Dirs.transcription_prompt, self.path_custom_prompts)
 
         json_report.set_text(text_main=f'Loading {MODEL_NAME_FORMATTED}')
         json_report.set_JSON({}, {}, {})
@@ -666,7 +680,7 @@
             paths = self.generate_paths(path_to_crop, i)
             self.path_to_crop = path_to_crop
 
-            filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper = paths
+            filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt = paths
             json_report.set_text(text_main='Starting OCR')
             self.perform_OCR_and_save_results(i, jpg_file_path_OCR_helper, txt_file_path_OCR, txt_file_path_OCR_bounds)
             json_report.set_text(text_main='Finished OCR')
@@ -685,22 +699,22 @@
             self.logger.info(f'Waiting for {model_name} API call --- Using {MODEL_NAME_FORMATTED}')
 
             if 'PALM2' in name_parts:
-                response_candidate, nt_in, nt_out, WFO_record, GEO_record = llm_model.call_llm_api_GooglePalm2(prompt, json_report)
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_GooglePalm2(prompt, json_report, paths)
 
             elif 'GEMINI' in name_parts:
-                response_candidate, nt_in, nt_out, WFO_record, GEO_record = llm_model.call_llm_api_GoogleGemini(prompt, json_report)
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_GoogleGemini(prompt, json_report, paths)
 
             elif 'MISTRAL' in name_parts and ('LOCAL' not in name_parts):
-                response_candidate, nt_in, nt_out, WFO_record, GEO_record = llm_model.call_llm_api_MistralAI(prompt, json_report)
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_MistralAI(prompt, json_report, paths)
 
             elif 'LOCAL' in name_parts:
                 if 'MISTRAL' in name_parts or 'MIXTRAL' in name_parts:
                     if 'CPU' in name_parts:
-                        response_candidate, nt_in, nt_out, WFO_record, GEO_record = llm_model.call_llm_local_cpu_MistralAI(prompt, json_report)
+                        response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_local_cpu_MistralAI(prompt, json_report, paths)
                     else:
-                        response_candidate, nt_in, nt_out, WFO_record, GEO_record = llm_model.call_llm_local_MistralAI(prompt, json_report)
+                        response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_local_MistralAI(prompt, json_report, paths)
             else:
-                response_candidate, nt_in, nt_out, WFO_record, GEO_record = llm_model.call_llm_api_OpenAI(prompt, json_report)
+                response_candidate, nt_in, nt_out, WFO_record, GEO_record, usage_report = llm_model.call_llm_api_OpenAI(prompt, json_report, paths)
 
             self.n_failed_LLM_calls += 1 if response_candidate is None else 0
 
@@ -710,7 +724,7 @@
 
             self.update_token_counters(nt_in, nt_out)
 
-            final_JSON_response, final_WFO_record, final_GEO_record = self.update_final_response(response_candidate, WFO_record, GEO_record, paths, path_to_crop, nt_in, nt_out)
+            final_JSON_response, final_WFO_record, final_GEO_record = self.update_final_response(response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, paths, path_to_crop, nt_in, nt_out)
 
             self.log_completion_info(final_JSON_response)
 
@@ -779,14 +793,14 @@
         self.total_tokens_out += nt_out
 
 
-    def update_final_response(self, response_candidate, WFO_record, GEO_record, paths, path_to_crop, nt_in, nt_out):
-        filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper = paths
+    def update_final_response(self, response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, paths, path_to_crop, nt_in, nt_out):
+        filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt = paths
         # Saving the JSON and XLSX files with the response and updating the final JSON response
         if response_candidate is not None:
-            final_JSON_response_updated = self.save_json_and_xlsx(response_candidate, WFO_record, GEO_record, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
+            final_JSON_response_updated = self.save_json_and_xlsx(response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
             return final_JSON_response_updated, WFO_record, GEO_record
         else:
-            final_JSON_response_updated = self.save_json_and_xlsx(response_candidate, WFO_record, GEO_record, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
+            final_JSON_response_updated = self.save_json_and_xlsx(response_candidate, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
             return final_JSON_response_updated, WFO_record, GEO_record
 
 
@@ -814,13 +828,15 @@
         txt_file_path_OCR = os.path.join(self.Dirs.transcription_ind_OCR, filename_without_extension + '.json')
         txt_file_path_OCR_bounds = os.path.join(self.Dirs.transcription_ind_OCR_bounds, filename_without_extension + '.json')
         jpg_file_path_OCR_helper = os.path.join(self.Dirs.transcription_ind_OCR_helper, filename_without_extension + '.jpg')
+        json_file_path_wiki = os.path.join(self.Dirs.transcription_ind_wiki, filename_without_extension + '.json')
+        txt_file_path_ind_prompt = os.path.join(self.Dirs.transcription_ind_prompt, filename_without_extension + '.txt')
 
         self.logger.info(f'Working on {i+1}/{len(self.img_paths)} --- {filename_without_extension}')
 
-        return filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper
+        return filename_without_extension, txt_file_path, txt_file_path_OCR, txt_file_path_OCR_bounds, jpg_file_path_OCR_helper, json_file_path_wiki, txt_file_path_ind_prompt
 
 
-    def save_json_and_xlsx(self, response, WFO_record, GEO_record, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out):
+    def save_json_and_xlsx(self, response, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out):
         if response is None:
             response = self.JSON_dict_structure
             # Insert 'filename' as the first key
@@ -829,14 +845,14 @@
 
             # Then add the null info to the spreadsheet
             response_null = self.create_null_row(filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper)
-            self.add_data_to_excel_from_response(self.path_transcription, response_null, WFO_record, GEO_record, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in=0, nt_out=0)
+            self.add_data_to_excel_from_response(self.path_transcription, response_null, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in=0, nt_out=0)
 
         ### Set completed JSON
         else:
             response = self.clean_catalog_number(response, filename_without_extension)
             self.write_json_to_file(txt_file_path, response)
             # add to the xlsx file
-            self.add_data_to_excel_from_response(self.path_transcription, response, WFO_record, GEO_record, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
+            self.add_data_to_excel_from_response(self.path_transcription, response, WFO_record, GEO_record, usage_report, MODEL_NAME_FORMATTED, filename_without_extension, path_to_crop, txt_file_path, jpg_file_path_OCR_helper, nt_in, nt_out)
         return response
 
 
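Not part of the commit: a trimmed-down sketch of the header-driven row writing that add_data_to_excel_from_response performs, using openpyxl directly. The header names match the lists defined above; the file path and record values are made up.

# Illustrative only: append one record to an existing transcription workbook.
import openpyxl

def append_row_by_header(xlsx_path, record: dict):
    wb = openpyxl.load_workbook(xlsx_path)
    sheet = wb.active
    headers = [cell.value for cell in sheet[1]]       # first row holds the column names
    next_row = sheet.max_row + 1
    for i, header in enumerate(headers, start=1):
        # Unknown headers simply get an empty cell, mirroring the .get(..., '') pattern above
        sheet.cell(row=next_row, column=i, value=record.get(header, ''))
    wb.save(xlsx_path)

# append_row_by_header('transcription.xlsx',
#                      {'filename': 'MICH_123', 'LLM': 'GPT_4', 'tokens_in': 1200, 'tokens_out': 380})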
vouchervision/utils_hf.py
CHANGED
@@ -99,42 +99,60 @@ def check_prompt_yaml_filename(fname):
     return False
 
 # Function to upload files to Google Drive
-def upload_to_drive(filepath, filename):
-    # Get the folder ID from the environment variable
-    folder_id = os.environ.get('GDRIVE_FOLDER_ID') # Renamed for clarity
-
-    if folder_id:
-        file_metadata = {
-            'name': filename,
-            'parents': [folder_id]
-        }
-
-        # Determine the mimetype based on the file extension
-        if filename.endswith('.yaml') or filename.endswith('.yml'):
-            mimetype = 'application/x-yaml'
-        elif filename.endswith('.zip'):
-            mimetype = 'application/zip'
-        else:
+def upload_to_drive(filepath, filename, is_hf=True, cfg_private=None, do_upload = True):
+    if do_upload:
+        creds = get_google_credentials(is_hf=is_hf, cfg_private=cfg_private)
+        if creds:
+            service = build('drive', 'v3', credentials=creds)
+
+            # Get the folder ID from the environment variable
+            if is_hf:
+                folder_id = os.environ.get('GDRIVE_FOLDER_ID') # Renamed for clarity
+            else:
+                folder_id = cfg_private['google']['GDRIVE_FOLDER_ID'] # Renamed for clarity
+
+
+            if folder_id:
+                file_metadata = {
+                    'name': filename,
+                    'parents': [folder_id]
+                }
+
+                # Determine the mimetype based on the file extension
+                if filename.endswith('.yaml') or filename.endswith('.yml') or filepath.endswith('.yaml') or filepath.endswith('.yml'):
+                    mimetype = 'application/x-yaml'
+                elif filepath.endswith('.zip'):
+                    mimetype = 'application/zip'
+                else:
+                    # Set a default mimetype if desired or handle the unsupported file type
+                    print("Unsupported file type")
+                    return None
+
+                # Upload the file
+                try:
+                    media = MediaFileUpload(filepath, mimetype=mimetype)
+                    file = service.files().create(
+                        body=file_metadata,
+                        media_body=media,
+                        fields='id'
+                    ).execute()
+                    print(f"Uploaded file with ID: {file.get('id')}")
+                except Exception as e:
+                    msg = f"If the following error is '404 cannot find file...' then you need to share the GDRIVE folder with your Google API service account's email address. Open your Google API JSON file, find the email account that ends with '@developer.gserviceaccount.com', go to your Google Drive, share the folder with this email account. {e}"
+                    print(msg)
+                    raise Exception(msg)
         else:
+            print("GDRIVE_API environment variable not set.")
+
+def get_google_credentials(is_hf=True, cfg_private=None): # Also used for google drive
+    if is_hf:
+        creds_json_str = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+        credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
+        return credentials
+    else:
+        with open(cfg_private['google']['GOOGLE_APPLICATION_CREDENTIALS'], 'r') as file:
+            data = json.load(file)
+        creds_json_str = json.dumps(data)
+        credentials = service_account.Credentials.from_service_account_info(json.loads(creds_json_str))
+        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = creds_json_str
+        return credentials
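Not part of the commit: how the new upload helper is meant to be called in the two deployment modes. The cfg_private keys follow the lookups in the code above; the file paths and folder ID are placeholders.

# Illustrative only.
# Hugging Face deployment: credentials and GDRIVE_FOLDER_ID come from env vars.
upload_to_drive('runs/demo_run.zip', 'demo_run', is_hf=True, do_upload=True)

# Local deployment: credentials and folder ID come from the private config file.
cfg_private = {
    'google': {
        'GOOGLE_APPLICATION_CREDENTIALS': '/path/to/service_account.json',
        'GDRIVE_FOLDER_ID': '<your-folder-id>',
    }
}
upload_to_drive('runs/demo_run.zip', 'demo_run', is_hf=False, cfg_private=cfg_private, do_upload=True)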
vouchervision/vouchervision_main.py
CHANGED
@@ -79,20 +79,28 @@ def voucher_vision(cfg_file_path, dir_home, path_custom_prompts, cfg_test, progr
     Voucher_Vision.close_logger_handlers()
 
     zip_filepath = None
+    # Create Hugging Face zip file
+    dir_to_zip = os.path.join(Dirs.dir_home, Dirs.run_name)
     zip_filename = Dirs.run_name
 
+    # Creating a zip file
+    zip_filepath = make_zipfile(dir_to_zip, zip_filename) # TODO Make this configurable
+    if is_hf:
+        upload_to_drive(zip_filepath, zip_filename, is_hf, cfg_private=Voucher_Vision.cfg_private, do_upload=True) # TODO Make this configurable
+    else:
+        upload_to_drive(zip_filepath, zip_filename, is_hf, cfg_private=Voucher_Vision.cfg_private, do_upload=False) # TODO Make this configurable
 
     return last_JSON_response, final_WFO_record, final_GEO_record, total_cost, Voucher_Vision.n_failed_OCR, Voucher_Vision.n_failed_LLM_calls, zip_filepath
 
-def make_zipfile(
+def make_zipfile(base_dir, output_filename):
+    # Determine the directory where the zip file should be saved
+    # Construct the full path for the zip file
+    full_output_path = os.path.join(base_dir, output_filename)
+    # Create the zip archive
+    shutil.make_archive(full_output_path, 'zip', base_dir)
+    # Return the full path of the created zip file
+    return os.path.join(base_dir, output_filename + '.zip')
 
 def voucher_vision_OCR_test(cfg_file_path, dir_home, cfg_test, path_to_crop):
     # get_n_overall = progress_report.get_n_overall()
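Not part of the commit: the shutil call used by make_zipfile above, spelled out on a throwaway directory so the output-path convention is clear. The directory name is hypothetical.

# Illustrative only.
import os, shutil

base_dir = '/tmp/demo_run'             # hypothetical finished run directory
output_filename = 'demo_run'
full_output_path = os.path.join(base_dir, output_filename)
# Creates /tmp/demo_run/demo_run.zip containing the contents of /tmp/demo_run,
# i.e. the archive is written inside the directory being zipped.
archive = shutil.make_archive(full_output_path, 'zip', base_dir)
print(archive)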