Consoli Sergio committed on
Commit 66b8c66 · 1 Parent(s): eff93c6

corrected bug causing double repetition in history

Files changed (4)
  1. app-demo-myMultiNER.py +103 -43
  2. nerBio.py +263 -118
  3. retrieverRAG_SF.py +114 -0
  4. virtuosoQueryRest.py +4 -0
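The fix in app-demo-myMultiNER.py stores the previous inputs (text, model selection, score filter, entity-linking flag, KG choices) together with the rendered HTML and the combined annotations in the Gradio state, and returns the cached output when nothing relevant has changed, so the same request is not annotated twice. Below is a minimal sketch of that reuse pattern, with hypothetical names (annotate_with_state, run_annotation) rather than the actual signatures in the diff:

def annotate_with_state(text, models, score_filt, entity_linking, kg_choices, state):
    # Reuse the cached output when the stored inputs match the current request.
    state = state or {}
    same_inputs = (
        state.get("text") == text
        and sorted(state.get("ModelsSelection", [])) == sorted(models)
        and float(state.get("ScoreFilt", -1)) == float(score_filt)
        and bool(state.get("EntityLinking", False)) == bool(entity_linking)
        and sorted(state.get("KGchoices", [])) == sorted(kg_choices)
    )
    if same_inputs and state.get("html_output") and state.get("df_annotated_combined_dict"):
        # Cache hit: return the rendered HTML and stored annotations, no second run.
        return state["html_output"], state["df_annotated_combined_dict"], state

    html_output, annotations = run_annotation(text, models, score_filt, entity_linking, kg_choices)
    state = {
        "text": text,
        "ModelsSelection": models,
        "ScoreFilt": score_filt,
        "EntityLinking": entity_linking,
        "KGchoices": kg_choices,
        "html_output": html_output,
        "df_annotated_combined_dict": annotations,
    }
    return html_output, annotations, state

def run_annotation(text, models, score_filt, entity_linking, kg_choices):
    # Stand-in for the real NER/NEL pipeline in app-demo-myMultiNER.py.
    return f"<div>{text}</div>", {"entities": []}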
app-demo-myMultiNER.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
2
 
3
- # os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
4
- #
5
- # os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
6
- # os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
7
- # os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
8
 
9
  from transformers import file_utils
10
  print(file_utils.default_cache_path)
@@ -76,8 +76,8 @@ examples = [
76
 
77
 
78
 
79
- #models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
80
- models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
81
  #models_List = ["NCBO/BioPortal" ]
82
 
83
  #categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
@@ -189,7 +189,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
189
  state = {
190
  "text": "",
191
  "df_annotated_dict": dict(),
192
- "KGchoices": KGchoices
193
  }
194
  return {"text": text, "entities": []}, html_output, state, [], ""
195
 
@@ -224,7 +229,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
224
  help="List of ontologies to which restrict the entity linking task.")
225
  #consose 20250502:
226
  if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
227
- parser.add_argument("--USE_CACHE", type=str, default="False",
228
  help="whether to use cache for the NER and NEL tasks or not")
229
  else:
230
  #print("Lists do not have the same elements")
@@ -237,6 +242,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
237
  help="whether to extract a readable context from the extracted triples for the concept")
238
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
239
  help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
240
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
241
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
242
 
@@ -257,7 +264,39 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
257
  if state:
258
  previous_text = state.get("text", "")
259
  previous_df_annotated_dict = state.get("df_annotated_dict", {})
260
  previous_kg_choices = state.get("KGchoices", [])
261
 
262
  #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
263
  #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
@@ -319,7 +358,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
319
  state = {
320
  "text": text,
321
  "df_annotated_dict": df_annotated.to_dict(),
322
- "KGchoices": KGchoices
323
  }
324
 
325
  else:
@@ -341,7 +385,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
341
  state = {
342
  "text": text,
343
  "df_annotated_dict": df_annotated.to_dict(),
344
- "KGchoices": KGchoices
345
  }
346
 
347
 
@@ -353,6 +402,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
353
  df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
354
  if df_annotated.empty and quoted_text==False:
355
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
356
  return {"text": text, "entities": []}, html_output, state, [], ""
357
 
358
  df_annotated_combined = pd.DataFrame()
@@ -360,6 +410,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
360
  df_annotated_combined = entitiesFusion(df_annotated,args)
361
  if df_annotated_combined.empty and quoted_text==False:
362
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
363
  return {"text": text, "entities": []}, html_output, state, [], ""
364
  else:
365
  if (not df_annotated.empty):
@@ -530,6 +581,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
530
  df_annotated_combined = df_annotated_combined[filter_mask]
531
  if df_annotated_combined.empty:
532
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
533
  return {"text": text, "entities": []}, html_output, state, [], ""
534
 
535
  ###
@@ -540,6 +592,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
540
  df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
541
  if df_annotated_combined.empty:
542
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
543
  return {"text": text, "entities": []}, html_output, state, [], ""
544
 
545
  dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
@@ -550,15 +603,15 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
550
 
551
  # # Create a new column for the entities with links
552
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
553
- #lambda row: (
554
- # f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
555
- # if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
556
- # 'word']
557
- #),
558
  lambda row: (
559
- f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
560
- if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
561
- 'word']
562
  ),
563
  axis=1
564
  )
@@ -641,17 +694,20 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
641
  words_for_dropdown = []
642
 
643
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
 
644
 
645
  #return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
646
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""
647
 
648
  else:
649
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
650
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""
651
 
652
  else:
653
 
654
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
655
  return {"text": text, "entities": []}, html_output, state, [], ""
656
 
657
 
@@ -663,28 +719,32 @@ def update_urls(selected_word, state):
663
  # Convert the state dictionary back into a DataFrame
664
  df = pd.DataFrame(state["df_annotated_combined_dict"])
665
 
666
- # # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
667
- # valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
668
- # # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
669
- valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip())) ))]
670
-
671
- # Check if the selected word is in the filtered DataFrame
672
- if selected_word in valid_entries['word'].values:
673
- urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
674
- if 'namedEntity' in df.columns:
675
- firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
676
- firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
677
- if firsturlinlist and firsturlinlist in urls:
678
- # Remove the URL from its current position
679
- urls.remove(firsturlinlist)
680
- # Insert the URL at the first position
681
- urls.insert(0, firsturlinlist)
682
-
683
- # Convert list of URLs to HTML string with clickable links
684
- #html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
685
- html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
686
- return html_links
687
- return ""
688
 
689
  else:
690
  return ""
@@ -768,7 +828,7 @@ with gr.Blocks(title="BioAnnotator") as demo:
768
  text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
769
  models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
770
  categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
771
- score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7)
772
  nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
773
  kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
774
  state = gr.State(value={})
@@ -824,4 +884,4 @@ with gr.Blocks(title="BioAnnotator") as demo:
824
 
825
 
826
  demo.launch()
827
- #demo.launch(share=True) # Share your demo with just 1 extra parameter
 
1
  import os
2
 
3
+ #os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
4
+
5
+ #os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
6
+ #os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
7
+ #os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
8
 
9
  from transformers import file_utils
10
  print(file_utils.default_cache_path)
 
76
 
77
 
78
 
79
+ models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
80
+ #models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
81
  #models_List = ["NCBO/BioPortal" ]
82
 
83
  #categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
 
189
  state = {
190
  "text": "",
191
  "df_annotated_dict": dict(),
192
+ "df_annotated_combined_dict": dict(),
193
+ "KGchoices": KGchoices,
194
+ "ModelsSelection": ModelsSelection,
195
+ "ScoreFilt": ScoreFilt,
196
+ "EntityLinking": EntityLinking,
197
+ "html_output": html_output
198
  }
199
  return {"text": text, "entities": []}, html_output, state, [], ""
200
 
 
229
  help="List of ontologies to which restrict the entity linking task.")
230
  #consose 20250502:
231
  if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
232
+ parser.add_argument("--USE_CACHE", type=str, default="True",
233
  help="whether to use cache for the NER and NEL tasks or not")
234
  else:
235
  #print("Lists do not have the same elements")
 
242
  help="whether to extract a readable context from the extracted triples for the concept")
243
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
244
  help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
245
+ parser.add_argument("--maxTriplesGlobalContext", type=int, default=20000,
246
+ help="maximum number of triples to consider for global context computation") # if 0 or None it is not considered
247
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
248
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
249
 
 
264
  if state:
265
  previous_text = state.get("text", "")
266
  previous_df_annotated_dict = state.get("df_annotated_dict", {})
267
+ previous_df_annotated_combined_dict = state.get("df_annotated_combined_dict", {})
268
  previous_kg_choices = state.get("KGchoices", [])
269
+ previous_ModelsSelection = state.get("ModelsSelection", [])
270
+ previous_ScoreFilt_from_state = float(state.get("ScoreFilt", ScoreFilt)) # Ensure ScoreFilt is a float
271
+ previous_EntityLinking_from_state = bool(state.get("EntityLinking", EntityLinking)) # Ensure EntityLinking is a boolean
272
+ previous_html_output = state.get("html_output", "")
273
+
274
+
275
+ if previous_html_output and (previous_df_annotated_dict) and (previous_df_annotated_combined_dict) and (previous_text == text) and (sorted(previous_kg_choices) == sorted(KGchoices)) and (sorted(previous_ModelsSelection) == sorted(ModelsSelection)) and (previous_ScoreFilt_from_state == ScoreFilt) and (previous_EntityLinking_from_state == EntityLinking):
276
+ ddf_annot_prev = pd.DataFrame(previous_df_annotated_combined_dict)
277
+ if 'ALLURIScontext' in ddf_annot_prev.columns:
278
+ # words_for_dropdown = df_annotated_combined[
279
+ # df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
280
+ # 'word'].unique().tolist()
281
+ words_for_dropdown = ddf_annot_prev[ddf_annot_prev['ALLURIScontext'].apply(
282
+ lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (
283
+ isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip()))))][
284
+ 'word'].unique().tolist()
285
+ words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
286
+ words_for_dropdown.insert(0, "")
287
+ else:
288
+ words_for_dropdown = []
289
+
290
+ dict_annotated_combined_NER = ddf_annot_prev[
291
+ ["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
292
+
293
+ # return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
294
+ return {"text": text, "entities": dict_annotated_combined_NER}, previous_html_output, state, gr.update(
295
+ choices=words_for_dropdown), ""
296
+
297
+
298
+
299
+
300
 
301
  #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
302
  #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
 
358
  state = {
359
  "text": text,
360
  "df_annotated_dict": df_annotated.to_dict(),
361
+ "df_annotated_combined_dict": dict(),
362
+ "KGchoices": KGchoices,
363
+ "ModelsSelection": ModelsSelection,
364
+ "ScoreFilt": ScoreFilt,
365
+ "EntityLinking": EntityLinking,
366
+ "html_output": ""
367
  }
368
 
369
  else:
 
385
  state = {
386
  "text": text,
387
  "df_annotated_dict": df_annotated.to_dict(),
388
+ "df_annotated_combined_dict": dict(),
389
+ "KGchoices": KGchoices,
390
+ "ModelsSelection": ModelsSelection,
391
+ "ScoreFilt": ScoreFilt,
392
+ "EntityLinking": EntityLinking,
393
+ "html_output": ""
394
  }
395
 
396
 
 
402
  df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
403
  if df_annotated.empty and quoted_text==False:
404
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
405
+ state["html_output"] = html_output
406
  return {"text": text, "entities": []}, html_output, state, [], ""
407
 
408
  df_annotated_combined = pd.DataFrame()
 
410
  df_annotated_combined = entitiesFusion(df_annotated,args)
411
  if df_annotated_combined.empty and quoted_text==False:
412
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
413
+ state["html_output"] = html_output
414
  return {"text": text, "entities": []}, html_output, state, [], ""
415
  else:
416
  if (not df_annotated.empty):
 
581
  df_annotated_combined = df_annotated_combined[filter_mask]
582
  if df_annotated_combined.empty:
583
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
584
+ state["html_output"] = html_output
585
  return {"text": text, "entities": []}, html_output, state, [], ""
586
 
587
  ###
 
592
  df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
593
  if df_annotated_combined.empty:
594
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
595
+ state["html_output"] = html_output
596
  return {"text": text, "entities": []}, html_output, state, [], ""
597
 
598
  dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
 
603
 
604
  # # Create a new column for the entities with links
605
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
606
+ # lambda row: (
607
+ # f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
608
+ # if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
609
+ # 'word']
610
+ # ),
611
  lambda row: (
612
+ f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
613
+ if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
614
+ 'word']
615
  ),
616
  axis=1
617
  )
 
694
  words_for_dropdown = []
695
 
696
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
697
+ state["html_output"] = html_output
698
 
699
  #return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
700
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""
701
 
702
  else:
703
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
704
+ state["html_output"] = html_output
705
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""
706
 
707
  else:
708
 
709
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
710
+ state["html_output"] = html_output
711
  return {"text": text, "entities": []}, html_output, state, [], ""
712
 
713
 
 
719
  # Convert the state dictionary back into a DataFrame
720
  df = pd.DataFrame(state["df_annotated_combined_dict"])
721
 
722
+ if 'ALLURIScontext' in df.columns:
723
+ # # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
724
+ # valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
725
+ # # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
726
+ valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip())) ))]
727
+
728
+ # Check if the selected word is in the filtered DataFrame
729
+ if selected_word in valid_entries['word'].values:
730
+ urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
731
+ if 'namedEntity' in df.columns:
732
+ firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
733
+ firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
734
+ if firsturlinlist and firsturlinlist in urls:
735
+ # Remove the URL from its current position
736
+ urls.remove(firsturlinlist)
737
+ # Insert the URL at the first position
738
+ urls.insert(0, firsturlinlist)
739
+
740
+ # Convert list of URLs to HTML string with clickable links
741
+ #html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
742
+ html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
743
+ return html_links
744
+ return ""
745
+ else:
746
+ return ""
747
+
748
 
749
  else:
750
  return ""
 
828
  text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
829
  models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
830
  categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
831
+ score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.05, label="Score", value=0.75)
832
  nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
833
  kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
834
  state = gr.State(value={})
 
884
 
885
 
886
  demo.launch()
887
+ #demo.launch(share=True) # Share your demo with just 1 extra parameter
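Both the dropdown population and update_urls above filter rows on the same dense ALLURIScontext condition: the value must be a non-empty list and must not be just a single blank string. A small standalone sketch of that predicate, using a hypothetical helper name:

def has_valid_uris(x):
    # Equivalent to the ALLURIScontext lambda used in the diff: keep only
    # non-empty lists that are not a single blank string.
    return (
        isinstance(x, list)
        and len(x) > 0
        and not (len(x) == 1 and not str(x[0]).strip())
    )

assert has_valid_uris(["http://example.org/concept/1"])
assert not has_valid_uris(None)
assert not has_valid_uris([])
assert not has_valid_uris([" "])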
nerBio.py CHANGED
@@ -65,7 +65,8 @@ import json
65
  import random
66
  import numpy as np
67
 
68
- from retrieverRAG_testing import RAG_retrieval_Base, RAG_retrieval_Z_scores, RAG_retrieval_Percentile, RAG_retrieval_TopK
 
69
 
70
  from joblib import Memory
71
 
@@ -957,135 +958,265 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
957
  word = word.lower()
958
  word = word.capitalize()
959
 
 
960
 
961
- if (strtobool(args.UseRetrieverForContextCreation)==True):
962
  labelTriples = ""
963
- passages = []
964
- nn = 200
965
 
966
- if len(labelTriplesLIST)<=nn:
 
967
  passages = []
968
  for i, triple in enumerate(labelTriplesLIST, start=1):
969
  # for triple in labelTriplesLIST:
970
  TriplesString = (" ".join(str(element).capitalize() for element in triple))
971
  passages.append(TriplesString)
972
 
973
- df_retrieved = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
974
- min_threshold=0.7)
 
 
 
 
 
 
 
 
 
975
 
976
- if not df_retrieved.empty:
977
- #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
978
- labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
979
- labelTriplesAPP = ". ".join(
980
- " ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
981
 
982
- if not labelTriples:
983
- labelTriples = labelTriplesAPP
984
- else:
985
- labelTriples = labelTriples + ". " + labelTriplesAPP
986
 
987
- else:
 
 
988
 
989
- OverallListRAGtriples = labelTriplesLIST.copy()
990
-
991
- while len(OverallListRAGtriples)>nn:
992
- Oinnerlistiterative=[]
993
- for i, triple in enumerate(OverallListRAGtriples, start=1):
994
- # for triple in labelTriplesLIST:
995
- TriplesString = (" ".join(str(element).capitalize() for element in triple))
996
- passages.append(TriplesString)
997
- # Check if the current index is a multiple of nn
998
- if i % nn == 0:
999
- # print("elaborate RAG triples")
1000
-
1001
- # df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1002
- # df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1003
- # df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1004
- df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
1005
- min_threshold=0.7)
1006
-
1007
- passages = []
1008
-
1009
- df_retrieved = df_retrievedtopk.copy()
1010
- if not df_retrieved.empty:
1011
- #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1012
- labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1013
- if not Oinnerlistiterative:
1014
- Oinnerlistiterative=labelTriplesLIST_RAGGED
1015
- else:
1016
- Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1017
-
1018
- if passages:
1019
- df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
1020
- min_threshold=0.7)
1021
-
1022
- df_retrieved = df_retrievedtopk.copy()
1023
- if not df_retrieved.empty:
1024
- #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1025
- labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1026
- if not Oinnerlistiterative:
1027
- Oinnerlistiterative = labelTriplesLIST_RAGGED
1028
- else:
1029
- Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1030
-
1031
- OverallListRAGtriples = Oinnerlistiterative.copy()
1032
-
1033
- if OverallListRAGtriples:
1034
- labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in OverallListRAGtriples)
1035
-
1036
- if not labelTriples:
1037
- labelTriples = labelTriplesAPP
1038
  else:
1039
- labelTriples = labelTriples + ". " + labelTriplesAPP
 
 
1040
 
1041
- labelTriples = labelTriples.strip().replace("..", ".").strip()
 
 
1042
 
1043
 
 
 
 
 
 
 
 
 
 
 
1044
 
1045
- # labelTriples = ""
1046
- # passages = []
1047
- # nn=200
1048
- # for i, triple in enumerate(labelTriplesLIST, start=1):
1049
- # #for triple in labelTriplesLIST:
1050
- # TriplesString = (" ".join(str(element).capitalize() for element in triple))
1051
- # passages.append(TriplesString)
1052
- # # Check if the current index is a multiple of nn
1053
- # if i % nn == 0:
1054
- # #print("elaborate RAG triples")
1055
- #
1056
- # #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1057
- # #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1058
- # #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1059
- # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1060
- #
1061
- # passages = []
1062
- #
1063
- # df_retrieved = df_retrievedtopk.copy()
1064
- # if not df_retrieved.empty:
1065
- # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1066
- # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1067
- # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1068
- #
1069
- # if not labelTriples:
1070
- # labelTriples =labelTriplesAPP
1071
- # else:
1072
- # labelTriples = labelTriples + ". " + labelTriplesAPP
1073
- #
1074
- # if passages:
1075
- # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1076
- #
1077
- # df_retrieved = df_retrievedtopk.copy()
1078
- # if not df_retrieved.empty:
1079
- # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1080
- # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1081
- # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1082
- # if not labelTriples:
1083
- # labelTriples = labelTriplesAPP
1084
- # else:
1085
- # labelTriples = labelTriples + ". " + labelTriplesAPP
1086
- #
1087
- # if labelTriples:
1088
- # labelTriples.strip().replace("..",".").strip()
1089
 
1090
 
1091
  else: # NO RAG on triples
@@ -1571,7 +1702,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1571
 
1572
  if entityBioeUrl:
1573
 
1574
- if strtobool(args.computeEntityContext):
1575
 
1576
  if strtobool(args.debug):
1577
  print("START computeEntityContext")
@@ -1706,6 +1837,8 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1706
 
1707
 
1708
  if not globalContext:
 
 
1709
  if unique_listGlobalTriples:
1710
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listGlobalTriples,
1711
  text_splitter, args,
@@ -1750,6 +1883,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1750
  return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
1751
 
1752
 
 
1753
  if not ALLURIScontext:
1754
  # Print the error message to stderr
1755
  print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
@@ -1879,9 +2013,16 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1879
  if unique_listLabelTriples:
1880
  unique_listGlobalTriples.extend(unique_listLabelTriples)
1881
 
1882
- # If I want to speed up, I can break here, but in this case I will not store the triples for the other uris in the cache, which maybe useful in the future
1883
- # if token_counter(str(unique_listGlobalTriples),args.model_name) > args.tokens_max:
1884
- # break # BREAK THE FOR LOOP IF THE GLOBAL CONTEXT IS ALREADY TOO BIG, BIGGER THAN tokens_max
 
1885
 
1886
 
1887
  if unique_listGlobalTriples:
@@ -1892,7 +2033,8 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1892
  if cache_map_virtuoso is not None:
1893
  if not word in cache_map_virtuoso:
1894
  cache_map_virtuoso[word] = {}
1895
- cache_map_virtuoso[word][("GlobalTriples"+" "+contextWordVirtuoso).strip()] = unique_listGlobalTriples
 
1896
 
1897
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word,
1898
  unique_listGlobalTriples,
@@ -1903,7 +2045,8 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1903
  if cache_map_virtuoso is not None:
1904
  if not word in cache_map_virtuoso:
1905
  cache_map_virtuoso[word] = {}
1906
- cache_map_virtuoso[word][("GlobalContext"+" "+contextWordVirtuoso).strip()] = globalContext
 
1907
 
1908
  if unique_listLabelTriples:
1909
  sssingleTriples = " ,., ".join(
@@ -2291,6 +2434,8 @@ if __name__ == '__main__':
2291
 
2292
  parser.add_argument("--computeEntityContext", type=str, default="False", help="whether to extract a readable context from the extracted triples for the concept")
2293
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False", help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
 
 
2294
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
2295
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
2296
 
 
65
  import random
66
  import numpy as np
67
 
68
+ #from retrieverRAG_testing import RAG_retrieval_Base, RAG_retrieval_Z_scores, RAG_retrieval_Percentile, RAG_retrieval_TopK, retrievePassageSimilarities
69
+ from retrieverRAG_SF import RAG_retrieval_Base
70
 
71
  from joblib import Memory
72
 
 
958
  word = word.lower()
959
  word = word.capitalize()
960
 
961
+ labelTriples=""
962
+
963
+ if labelTriplesLIST and getattr(args, 'maxTriplesContextComputation', None): # it means it exists
964
+ if args.maxTriplesContextComputation > 0:
965
+ if len(labelTriplesLIST) > args.maxTriplesContextComputation:
966
+ labelTriplesLIST = labelTriplesLIST[:args.maxTriplesContextComputation]
967
+
968
+ if (strtobool(args.UseRetrieverForContextCreation) == True):
969
+
970
+ # if strtobool(args.debug):
971
+ # print("Start reranking - num passages : ", len(labelTriplesLIST), "\n")
972
+ # startRerank = time.time()
973
+ #
974
+ # labelTriples = ""
975
+ # passages = []
976
+ # nn = 200
977
+ #
978
+ # OverallListRAGtriples = []
979
+ # labelTriplesLIST_RAGGED = []
980
+ #
981
+ # if len(labelTriplesLIST) <= nn:
982
+ # passages = []
983
+ # for i, triple in enumerate(labelTriplesLIST, start=1):
984
+ # # for triple in labelTriplesLIST:
985
+ # TriplesString = (" ".join(str(element).capitalize() for element in triple))
986
+ # passages.append(TriplesString)
987
+ #
988
+ # df_retrieved = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
989
+ # min_threshold=0.7)
990
+ #
991
+ # if not df_retrieved.empty:
992
+ # # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
993
+ # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
994
+ # labelTriplesAPP = ". ".join(
995
+ # " ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
996
+ #
997
+ # if not labelTriples:
998
+ # labelTriples = labelTriplesAPP
999
+ # else:
1000
+ # labelTriples = labelTriples + ". " + labelTriplesAPP
1001
+ #
1002
+ # else:
1003
+ #
1004
+ # OverallListRAGtriples = labelTriplesLIST.copy()
1005
+ #
1006
+ # while len(OverallListRAGtriples) > nn:
1007
+ # Oinnerlistiterative = []
1008
+ # for i, triple in enumerate(OverallListRAGtriples, start=1):
1009
+ # # for triple in labelTriplesLIST:
1010
+ # TriplesString = (" ".join(str(element).capitalize() for element in triple))
1011
+ # passages.append(TriplesString)
1012
+ # # Check if the current index is a multiple of nn
1013
+ # if i % nn == 0:
1014
+ # # print("elaborate RAG triples")
1015
+ #
1016
+ # # df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1017
+ # # df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1018
+ # # df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1019
+ # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1,
1020
+ # max_num_passages=20,
1021
+ # min_threshold=0.7)
1022
+ #
1023
+ # passages = []
1024
+ #
1025
+ # df_retrieved = df_retrievedtopk.copy()
1026
+ # if not df_retrieved.empty:
1027
+ # # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1028
+ # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1029
+ # if not Oinnerlistiterative:
1030
+ # Oinnerlistiterative = labelTriplesLIST_RAGGED
1031
+ # else:
1032
+ # Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1033
+ #
1034
+ # if passages:
1035
+ # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
1036
+ # min_threshold=0.7)
1037
+ #
1038
+ # df_retrieved = df_retrievedtopk.copy()
1039
+ # if not df_retrieved.empty:
1040
+ # # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1041
+ # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1042
+ # if not Oinnerlistiterative:
1043
+ # Oinnerlistiterative = labelTriplesLIST_RAGGED
1044
+ # else:
1045
+ # Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1046
+ #
1047
+ # OverallListRAGtriples = Oinnerlistiterative.copy()
1048
+ #
1049
+ # if OverallListRAGtriples:
1050
+ # labelTriplesAPP = ". ".join(
1051
+ # " ".join(str(element).capitalize() for element in triple) for triple in OverallListRAGtriples)
1052
+ #
1053
+ # if not labelTriples:
1054
+ # labelTriples = labelTriplesAPP
1055
+ # else:
1056
+ # labelTriples = labelTriples + ". " + labelTriplesAPP
1057
+ #
1058
+ # labelTriples = labelTriples.strip().replace("..", ".").strip()
1059
+ #
1060
+ # # labelTriples = ""
1061
+ # # passages = []
1062
+ # # nn=200
1063
+ # # for i, triple in enumerate(labelTriplesLIST, start=1):
1064
+ # # #for triple in labelTriplesLIST:
1065
+ # # TriplesString = (" ".join(str(element).capitalize() for element in triple))
1066
+ # # passages.append(TriplesString)
1067
+ # # # Check if the current index is a multiple of nn
1068
+ # # if i % nn == 0:
1069
+ # # #print("elaborate RAG triples")
1070
+ # #
1071
+ # # #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1072
+ # # #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1073
+ # # #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1074
+ # # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1075
+ # #
1076
+ # # passages = []
1077
+ # #
1078
+ # # df_retrieved = df_retrievedtopk.copy()
1079
+ # # if not df_retrieved.empty:
1080
+ # # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1081
+ # # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1082
+ # # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1083
+ # #
1084
+ # # if not labelTriples:
1085
+ # # labelTriples =labelTriplesAPP
1086
+ # # else:
1087
+ # # labelTriples = labelTriples + ". " + labelTriplesAPP
1088
+ # #
1089
+ # # if passages:
1090
+ # # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1091
+ # #
1092
+ # # df_retrieved = df_retrievedtopk.copy()
1093
+ # # if not df_retrieved.empty:
1094
+ # # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1095
+ # # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1096
+ # # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1097
+ # # if not labelTriples:
1098
+ # # labelTriples = labelTriplesAPP
1099
+ # # else:
1100
+ # # labelTriples = labelTriples + ". " + labelTriplesAPP
1101
+ # #
1102
+ # # if labelTriples:
1103
+ # # labelTriples.strip().replace("..",".").strip()
1104
+ #
1105
+ # if strtobool(args.debug):
1106
+ # numfinal = 0
1107
+ # if OverallListRAGtriples:
1108
+ # numfinal = len(OverallListRAGtriples)
1109
+ # elif labelTriplesLIST_RAGGED:
1110
+ # numfinal = len(labelTriplesLIST_RAGGED)
1111
+ # print("End reranking - found final passages : ", numfinal, "\n")
1112
+ # #
1113
+ # endRerank = time.time()
1114
+ # hours, rem = divmod(endRerank - startRerank, 3600)
1115
+ # minutes, seconds = divmod(rem, 60)
1116
+ # print("Rerank Time... {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds))
1117
+ # #
1118
+
1119
+ # if len(labelTriplesLIST) > 10000:
1120
+ # print("debug")
1121
+
1122
+ if strtobool(args.debug):
1123
+ print("Start reranking2 - num passages : ", len(labelTriplesLIST), "\n")
1124
+ startRerank2 = time.time()
1125
 
 
1126
  labelTriples = ""
 
 
1127
 
1128
+ try:
1129
+
1130
  passages = []
1131
  for i, triple in enumerate(labelTriplesLIST, start=1):
1132
  # for triple in labelTriplesLIST:
1133
  TriplesString = (" ".join(str(element).capitalize() for element in triple))
1134
  passages.append(TriplesString)
1135
 
1136
+ nback = 1
1137
+ if len(passages) <= 10:
1138
+ nback = len(passages)
1139
+ elif len(passages) <= 1000:
1140
+ nback = 10+int(0.1 * len(passages)) # 10% of the number of passages
1141
+ elif len(passages) <= 5000:
1142
+ nback = 200
1143
+ elif len(passages) <= 10000:
1144
+ nback = 300
1145
+ else:
1146
+ nback = 400
1147
 
1148
+ df_retrieved = RAG_retrieval_Base(questionText, passages, min_threshold=0, max_num_passages=nback)
 
 
 
 
1149
 
1150
+ if not df_retrieved.empty:
 
 
 
1151
 
1152
+ countRetr = 0
1153
+ min_threshold = 0.80
1154
+ countRetr = (df_retrieved['score'] > min_threshold).sum()
1155
 
1156
+ countRetrThreshold = int(nback / 2)
1157
+ if nback > 10:
1158
+ countRetrThreshold = 10
 
 
1159
  else:
1160
+ countRetrThreshold = int(nback/2)
1161
+ if countRetrThreshold <=0:
1162
+ countRetrThreshold = 1
1163
+
1164
+ while countRetr <= countRetrThreshold:
1165
+ min_threshold = min_threshold - 0.05
1166
+ countRetr = (df_retrieved['score'] >= min_threshold).sum()
1167
+ if min_threshold < 0.2:
1168
+ break
1169
+
1170
+ # countRetrThreshold = int(0.1 + nback)
1171
+ # if countRetrThreshold > 5:
1172
+ # countRetrThreshold = 5
1173
+ #
1174
+ # countRetr=0
1175
+ # min_threshold = 0.90
1176
+ # countRetr = (df_retrieved['score'] > min_threshold).sum()
1177
+ # while countRetr<=countRetrThreshold:
1178
+ # min_threshold = min_threshold - 0.05
1179
+ # if min_threshold<0.7:
1180
+ # countRetrThreshold=0
1181
+ # if min_threshold == 0:
1182
+ # min_threshold = 0.01
1183
+ # countRetr = (df_retrieved['score'] > min_threshold).sum()
1184
+ # if min_threshold <= 0.01:
1185
+ # break
1186
+
1187
+ if countRetr > 0:
1188
+ df_retrieved = df_retrieved[df_retrieved['score'] > min_threshold]
1189
+
1190
+ # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1191
+ labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1192
+ labelTriplesAPP = ". ".join(
1193
+ " ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1194
+
1195
+ if not labelTriples:
1196
+ labelTriples = labelTriplesAPP
1197
+ else:
1198
+ labelTriples = labelTriples + ". " + labelTriplesAPP
1199
 
1200
+ else:
1201
+ labelTriplesLIST_RAGGED = []
1202
+ labelTriples = ""
1203
 
1204
 
1205
+ if strtobool(args.debug):
1206
+ numfinal = 0
1207
+ if labelTriplesLIST_RAGGED:
1208
+ numfinal = len(labelTriplesLIST_RAGGED)
1209
+ print("End reranking2 - found final passages : ", numfinal, "\n")
1210
+ endRerank2 = time.time()
1211
+ hours, rem = divmod(endRerank2 - startRerank2, 3600)
1212
+ minutes, seconds = divmod(rem, 60)
1213
+ print("Rerank2 Time... {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds))
1214
+ #
1215
 
1216
+ except Exception as err:
1217
+ print("SOMETHING HAPPENED on PASSAGE RERANKING for Question :"+questionText+"\n")
1218
+ print(err)
1219
+ #status_code: 422, body: type='validation_error' url='https://www.mixedbread.ai/api-reference' message='Your request is invalid. Please check your input and try again.' details=[[{'type': 'too_long', 'loc': ['body', 'input', 'list[str]'], 'msg': 'List should have at most 1000 items after validation, not 4249',
 
 
1220
 
1221
 
1222
  else: # NO RAG on triples
 
1702
 
1703
  if entityBioeUrl:
1704
 
1705
+ if strtobool(args.computeEntityContext) and (strtobool(args.computeEntityGlobalContext)==False):
1706
 
1707
  if strtobool(args.debug):
1708
  print("START computeEntityContext")
 
1837
 
1838
 
1839
  if not globalContext:
1840
+
1841
+ BreakenBeforeAll = False
1842
  if unique_listGlobalTriples:
1843
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listGlobalTriples,
1844
  text_splitter, args,
 
1883
  return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
1884
 
1885
 
1886
+
1887
  if not ALLURIScontext:
1888
  # Print the error message to stderr
1889
  print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
 
2013
  if unique_listLabelTriples:
2014
  unique_listGlobalTriples.extend(unique_listLabelTriples)
2015
 
2016
+
2017
+ # This is done to speed up: break the global loop here, but in this case the triples for the other URIs are not stored in the cache, which may be useful in the future
2018
+ # #if token_counter(str(unique_listGlobalTriples),args.model_name) > args.tokens_max:
2019
+
2020
+ if getattr(args, 'maxTriplesContextComputation', None): #it means it exists
2021
+ if args.maxTriplesContextComputation > 0:
2022
+ if len(unique_listGlobalTriples) > args.maxTriplesContextComputation:
2023
+ unique_listGlobalTriples = unique_listGlobalTriples[:args.maxTriplesContextComputation]
2024
+ BreakenBeforeAll = True
2025
+ break # BREAK THE FOR LOOP IF THE GLOBAL CONTEXT IS ALREADY TOO BIG, BIGGER THAN tokens_max
2026
 
2027
 
2028
  if unique_listGlobalTriples:
 
2033
  if cache_map_virtuoso is not None:
2034
  if not word in cache_map_virtuoso:
2035
  cache_map_virtuoso[word] = {}
2036
+ if BreakenBeforeAll == False:
2037
+ cache_map_virtuoso[word][("GlobalTriples"+" "+contextWordVirtuoso).strip()] = unique_listGlobalTriples
2038
 
2039
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word,
2040
  unique_listGlobalTriples,
 
2045
  if cache_map_virtuoso is not None:
2046
  if not word in cache_map_virtuoso:
2047
  cache_map_virtuoso[word] = {}
2048
+ if BreakenBeforeAll == False:
2049
+ cache_map_virtuoso[word][("GlobalContext"+" "+contextWordVirtuoso).strip()] = globalContext
2050
 
2051
  if unique_listLabelTriples:
2052
  sssingleTriples = " ,., ".join(
 
2434
 
2435
  parser.add_argument("--computeEntityContext", type=str, default="False", help="whether to extract a readable context from the extracted triples for the concept")
2436
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False", help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
2437
+ parser.add_argument("--maxTriplesContextComputation", type=int, default=20000,
2438
+ help="maximum number of triples to consider for global context computation") # if 0 or None it is not considered
2439
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
2440
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
2441
 
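The new reranking path in getLinearTextualContextFromTriples sizes the number of passages requested from the reranker by corpus size (nback) and then lowers the score cut-off in 0.05 steps until enough passages pass, before joining the survivors into the context string. A compact sketch of that selection step, assuming a pandas DataFrame with the score and Passage columns that RAG_retrieval_Base returns:

import pandas as pd

def select_passages(df_retrieved, nback):
    # Start from a high score cut-off and relax it in 0.05 steps until enough
    # passages survive, mirroring the adaptive loop added above.
    target = 10 if nback > 10 else max(int(nback / 2), 1)
    min_threshold = 0.80
    count = (df_retrieved["score"] > min_threshold).sum()
    while count <= target and min_threshold >= 0.2:
        min_threshold -= 0.05
        count = (df_retrieved["score"] >= min_threshold).sum()
    if count == 0:
        return df_retrieved.iloc[0:0]  # nothing scored high enough
    return df_retrieved[df_retrieved["score"] > min_threshold]

# Example with made-up scores:
df = pd.DataFrame({"Passage": ["a", "b", "c"], "score": [0.9, 0.6, 0.3]})
print(select_passages(df, nback=3))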
retrieverRAG_SF.py ADDED
@@ -0,0 +1,114 @@
1
+
2
+ # https://www.mixedbread.ai/blog/mxbai-embed-large-v1
3
+ # https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
4
+
5
+ import os
6
+ import time
7
+ import pandas as pd
8
+ import numpy as np
9
+ from typing import Dict
10
+
11
+ import torch
12
+ from transformers import AutoModel, AutoTokenizer
13
+ from sentence_transformers.util import cos_sim
14
+ from accelerate import Accelerator # Import from accelerate
15
+ from scipy.stats import zscore
16
+
17
+ # Set up environment variables for Hugging Face caching
18
+ os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
19
+ os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
20
+ os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
21
+
22
+ # Initialize the Accelerator
23
+ accelerator = Accelerator()
24
+
25
+ # Use the device managed by Accelerator
26
+ device = accelerator.device
27
+ print("Using accelerator device =", device)
28
+
29
+
30
+ from sentence_transformers import CrossEncoder
31
+ model_sf_mxbai = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1" ,device=device)
32
+
33
+
34
+
35
+
36
+
37
+ def RAG_retrieval_Base(queryText ,passages, min_threshold=0.0, max_num_passages=None):
38
+
39
+ # # Example query
40
+ # query = "What is the capital of France?"
41
+ #
42
+ # # Example passages
43
+ # ppppassages = [
44
+ # "This is the first passage.",
45
+ # "The capital of France is Paris.",
46
+ # "This is the third passage.",
47
+ # "Paris is a beautiful city.",
48
+ # "The Eiffel Tower is in Paris."
49
+ # ]
50
+ #
51
+ # # Rank the passages with respect to the query
52
+ # ranked_passages = model_sf_mxbai.rank(query, ppppassages)
53
+
54
+ try:
55
+
56
+ df_filtered = pd.DataFrame()
57
+
58
+ if max_num_passages:
59
+ result_rerank = model_sf_mxbai.rank(queryText, passages, return_documents=False, top_k=max_num_passages)
60
+ else:
61
+ nback =int(0.1 *len(passages)) # 10% of the number of passages
62
+ if nback<=0:
63
+ nback=1
64
+ result_rerank = model_sf_mxbai.rank(queryText, passages, return_documents=False, top_k=nback)
65
+
66
+ if result_rerank:
67
+ df = pd.DataFrame(result_rerank) # corpus_id, score
68
+
69
+ if min_threshold >0:
70
+ df_filtered = df[df['score'] >= min_threshold]
71
+ else:
72
+ df_filtered =df.copy()
73
+
74
+ selected_passages = [passages[i] for i in df_filtered['corpus_id']]
75
+
76
+ # Add the selected passages as a new column "Passage" to the DataFrame
77
+ df_filtered['Passage'] = selected_passages
78
+
79
+ df_filtered = df_filtered.drop_duplicates(subset='Passage', keep='first')
80
+
81
+ # df_filtered = df_filtered.sort_values(by='score', ascending=False)
82
+
83
+ # Return the filtered DataFrame
84
+ return df_filtered
85
+
86
+ except Exception as e:
87
+ # Log the exception message or handle it as needed
88
+ print(f"An error occurred: {e}")
89
+ return pd.DataFrame() # Return an empty DataFrame in case of error
90
+
91
+
92
+
93
+
94
+
95
+ if __name__ == '__main__':
96
+
97
+ queryText = 'A man is eating a piece of bread'
98
+
99
+ # Define the passages list
100
+ passages = [
101
+ "A man is eating food.",
102
+ "A man is eating pasta.",
103
+ "The girl is carrying a baby.",
104
+ "A man is riding a horse.",
105
+ ]
106
+
107
+ df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0, max_num_passages=3)
108
+
109
+
110
+ print(df_retrieved)
111
+
112
+
113
+ print("end of computations")
114
+
virtuosoQueryRest.py CHANGED
@@ -3,10 +3,14 @@ from requests.auth import HTTPDigestAuth, HTTPBasicAuth
3
  import ssl
4
  import json
5
 
 
6
 
 
 
7
 
8
 
9
 
 
10
  def execute_query(endpoint, query, auth):
11
  headers = {
12
  'Content-Type': 'application/x-www-form-urlencoded',
 
3
  import ssl
4
  import json
5
 
6
+ from joblib import Memory
7
 
8
+ cachedir = 'cached'
9
+ mem = Memory(cachedir, verbose=False)
10
 
11
 
12
 
13
+ @mem.cache
14
  def execute_query(endpoint, query, auth):
15
  headers = {
16
  'Content-Type': 'application/x-www-form-urlencoded',
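virtuosoQueryRest.py now wraps execute_query with joblib's Memory, so identical SPARQL requests are answered from the on-disk 'cached' folder instead of hitting the endpoint again. A self-contained illustration of that caching behaviour, using a stand-in function and a placeholder endpoint URL rather than the real execute_query:

from joblib import Memory

mem = Memory("cached", verbose=0)
calls = {"n": 0}

@mem.cache
def fake_query(endpoint, query):
    # Stand-in for the real HTTP request to the Virtuoso endpoint.
    calls["n"] += 1
    return {"endpoint": endpoint, "query": query}

q = "SELECT * WHERE {?s ?p ?o} LIMIT 1"
fake_query("http://localhost:8890/sparql", q)   # executes and writes to ./cached
fake_query("http://localhost:8890/sparql", q)   # read back from disk, no new call
print(calls["n"])        # 1
mem.clear(warn=False)    # wipe the on-disk cache when stale results must be avoided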