Consoli Sergio committed on
Commit 66b8c66 · 1 Parent(s): eff93c6

corrected bug causing double repetition in history

Files changed (4)
  1. app-demo-myMultiNER.py +103 -43
  2. nerBio.py +263 -118
  3. retrieverRAG_SF.py +114 -0
  4. virtuosoQueryRest.py +4 -0
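The fix in app-demo-myMultiNER.py stores the previous inputs (text, model selection, score filter, entity-linking flag, KG choices) together with the rendered HTML and the combined annotations in the Gradio state, and returns the cached output when nothing relevant has changed, so the same request is not annotated twice. Below is a minimal sketch of that reuse pattern, with hypothetical names (annotate_with_state, run_annotation) rather than the actual signatures in the diff:

def annotate_with_state(text, models, score_filt, entity_linking, kg_choices, state):
    # Reuse the cached output when the stored inputs match the current request.
    state = state or {}
    same_inputs = (
        state.get("text") == text
        and sorted(state.get("ModelsSelection", [])) == sorted(models)
        and float(state.get("ScoreFilt", -1)) == float(score_filt)
        and bool(state.get("EntityLinking", False)) == bool(entity_linking)
        and sorted(state.get("KGchoices", [])) == sorted(kg_choices)
    )
    if same_inputs and state.get("html_output") and state.get("df_annotated_combined_dict"):
        # Cache hit: return the rendered HTML and stored annotations, no second run.
        return state["html_output"], state["df_annotated_combined_dict"], state

    html_output, annotations = run_annotation(text, models, score_filt, entity_linking, kg_choices)
    state = {
        "text": text,
        "ModelsSelection": models,
        "ScoreFilt": score_filt,
        "EntityLinking": entity_linking,
        "KGchoices": kg_choices,
        "html_output": html_output,
        "df_annotated_combined_dict": annotations,
    }
    return html_output, annotations, state

def run_annotation(text, models, score_filt, entity_linking, kg_choices):
    # Stand-in for the real NER/NEL pipeline in app-demo-myMultiNER.py.
    return f"<div>{text}</div>", {"entities": []}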
app-demo-myMultiNER.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
2
 
3
- # os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
4
- #
5
- # os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
6
- # os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
7
- # os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
8
 
9
  from transformers import file_utils
10
  print(file_utils.default_cache_path)
@@ -76,8 +76,8 @@ examples = [
76
 
77
 
78
 
79
- #models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
80
- models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
81
  #models_List = ["NCBO/BioPortal" ]
82
 
83
  #categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
@@ -189,7 +189,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
189
  state = {
190
  "text": "",
191
  "df_annotated_dict": dict(),
192
- "KGchoices": KGchoices
193
  }
194
  return {"text": text, "entities": []}, html_output, state, [], ""
195
 
@@ -224,7 +229,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
224
  help="List of ontologies to which restrict the entity linking task.")
225
  #consose 20250502:
226
  if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
227
- parser.add_argument("--USE_CACHE", type=str, default="False",
228
  help="whether to use cache for the NER and NEL tasks or not")
229
  else:
230
  #print("Lists do not have the same elements")
@@ -237,6 +242,8 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
237
  help="whether to extract a readable context from the extracted triples for the concept")
238
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
239
  help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
240
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
241
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
242
 
@@ -257,7 +264,39 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
257
  if state:
258
  previous_text = state.get("text", "")
259
  previous_df_annotated_dict = state.get("df_annotated_dict", {})
260
  previous_kg_choices = state.get("KGchoices", [])
261
 
262
  #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
263
  #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
@@ -319,7 +358,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
319
  state = {
320
  "text": text,
321
  "df_annotated_dict": df_annotated.to_dict(),
322
- "KGchoices": KGchoices
323
  }
324
 
325
  else:
@@ -341,7 +385,12 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
341
  state = {
342
  "text": text,
343
  "df_annotated_dict": df_annotated.to_dict(),
344
- "KGchoices": KGchoices
345
  }
346
 
347
 
@@ -353,6 +402,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
353
  df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
354
  if df_annotated.empty and quoted_text==False:
355
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
356
  return {"text": text, "entities": []}, html_output, state, [], ""
357
 
358
  df_annotated_combined = pd.DataFrame()
@@ -360,6 +410,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
360
  df_annotated_combined = entitiesFusion(df_annotated,args)
361
  if df_annotated_combined.empty and quoted_text==False:
362
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
363
  return {"text": text, "entities": []}, html_output, state, [], ""
364
  else:
365
  if (not df_annotated.empty):
@@ -530,6 +581,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
530
  df_annotated_combined = df_annotated_combined[filter_mask]
531
  if df_annotated_combined.empty:
532
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
533
  return {"text": text, "entities": []}, html_output, state, [], ""
534
 
535
  ###
@@ -540,6 +592,7 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
540
  df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
541
  if df_annotated_combined.empty:
542
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
543
  return {"text": text, "entities": []}, html_output, state, [], ""
544
 
545
  dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
@@ -550,15 +603,15 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
550
 
551
  # # Create a new column for the entities with links
552
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
553
- #lambda row: (
554
- # f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
555
- # if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
556
- # 'word']
557
- #),
558
  lambda row: (
559
- f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
560
- if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
561
- 'word']
562
  ),
563
  axis=1
564
  )
@@ -641,17 +694,20 @@ def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking,
641
  words_for_dropdown = []
642
 
643
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
 
644
 
645
  #return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
646
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""
647
 
648
  else:
649
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
650
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""
651
 
652
  else:
653
 
654
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
 
655
  return {"text": text, "entities": []}, html_output, state, [], ""
656
 
657
 
@@ -663,28 +719,32 @@ def update_urls(selected_word, state):
663
  # Convert the state dictionary back into a DataFrame
664
  df = pd.DataFrame(state["df_annotated_combined_dict"])
665
 
666
- # # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
667
- # valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
668
- # # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
669
- valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip())) ))]
670
-
671
- # Check if the selected word is in the filtered DataFrame
672
- if selected_word in valid_entries['word'].values:
673
- urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
674
- if 'namedEntity' in df.columns:
675
- firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
676
- firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
677
- if firsturlinlist and firsturlinlist in urls:
678
- # Remove the URL from its current position
679
- urls.remove(firsturlinlist)
680
- # Insert the URL at the first position
681
- urls.insert(0, firsturlinlist)
682
-
683
- # Convert list of URLs to HTML string with clickable links
684
- #html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
685
- html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
686
- return html_links
687
- return ""
688
 
689
  else:
690
  return ""
@@ -768,7 +828,7 @@ with gr.Blocks(title="BioAnnotator") as demo:
768
  text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
769
  models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
770
  categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
771
- score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7)
772
  nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
773
  kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
774
  state = gr.State(value={})
@@ -824,4 +884,4 @@ with gr.Blocks(title="BioAnnotator") as demo:
824
 
825
 
826
  demo.launch()
827
- #demo.launch(share=True) # Share your demo with just 1 extra parameter
 
1
  import os
2
 
3
+ #os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only
4
+
5
+ #os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
6
+ #os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
7
+ #os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
8
 
9
  from transformers import file_utils
10
  print(file_utils.default_cache_path)
 
76
 
77
 
78
 
79
+ models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
80
+ #models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5"
81
  #models_List = ["NCBO/BioPortal" ]
82
 
83
  #categories_List = ["MED","LOC","PER","ORG","DATE","MISC"]
 
189
  state = {
190
  "text": "",
191
  "df_annotated_dict": dict(),
192
+ "df_annotated_combined_dict": dict(),
193
+ "KGchoices": KGchoices,
194
+ "ModelsSelection": ModelsSelection,
195
+ "ScoreFilt": ScoreFilt,
196
+ "EntityLinking": EntityLinking,
197
+ "html_output": html_output
198
  }
199
  return {"text": text, "entities": []}, html_output, state, [], ""
200
 
 
229
  help="List of ontologies to which restrict the entity linking task.")
230
  #consose 20250502:
231
  if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
232
+ parser.add_argument("--USE_CACHE", type=str, default="True",
233
  help="whether to use cache for the NER and NEL tasks or not")
234
  else:
235
  #print("Lists do not have the same elements")
 
242
  help="whether to extract a readable context from the extracted triples for the concept")
243
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
244
  help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
245
+ parser.add_argument("--maxTriplesGlobalContext", type=int, default=20000,
246
+ help="maximum number of triples to consider for global context computation") # if 0 or None it is not considered
247
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
248
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
249
 
 
264
  if state:
265
  previous_text = state.get("text", "")
266
  previous_df_annotated_dict = state.get("df_annotated_dict", {})
267
+ previous_df_annotated_combined_dict = state.get("df_annotated_combined_dict", {})
268
  previous_kg_choices = state.get("KGchoices", [])
269
+ previous_ModelsSelection = state.get("ModelsSelection", [])
270
+ previous_ScoreFilt_from_state = float(state.get("ScoreFilt", ScoreFilt)) # Ensure ScoreFilt is a float
271
+ previous_EntityLinking_from_state = bool(state.get("EntityLinking", EntityLinking)) # Ensure EntityLinking is a boolean
272
+ previous_html_output = state.get("html_output", "")
273
+
274
+
275
+ if previous_html_output and (previous_df_annotated_dict) and (previous_df_annotated_combined_dict) and (previous_text == text) and (sorted(previous_kg_choices) == sorted(KGchoices)) and (sorted(previous_ModelsSelection) == sorted(ModelsSelection)) and (previous_ScoreFilt_from_state == ScoreFilt) and (previous_EntityLinking_from_state == EntityLinking):
276
+ ddf_annot_prev = pd.DataFrame(previous_df_annotated_combined_dict)
277
+ if 'ALLURIScontext' in ddf_annot_prev.columns:
278
+ # words_for_dropdown = df_annotated_combined[
279
+ # df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
280
+ # 'word'].unique().tolist()
281
+ words_for_dropdown = ddf_annot_prev[ddf_annot_prev['ALLURIScontext'].apply(
282
+ lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (
283
+ isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip()))))][
284
+ 'word'].unique().tolist()
285
+ words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
286
+ words_for_dropdown.insert(0, "")
287
+ else:
288
+ words_for_dropdown = []
289
+
290
+ dict_annotated_combined_NER = ddf_annot_prev[
291
+ ["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
292
+
293
+ # return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
294
+ return {"text": text, "entities": dict_annotated_combined_NER}, previous_html_output, state, gr.update(
295
+ choices=words_for_dropdown), ""
296
+
297
+
298
+
299
+
300
 
301
  #print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
302
  #if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
 
358
  state = {
359
  "text": text,
360
  "df_annotated_dict": df_annotated.to_dict(),
361
+ "df_annotated_combined_dict": dict(),
362
+ "KGchoices": KGchoices,
363
+ "ModelsSelection": ModelsSelection,
364
+ "ScoreFilt": ScoreFilt,
365
+ "EntityLinking": EntityLinking,
366
+ "html_output": ""
367
  }
368
 
369
  else:
 
385
  state = {
386
  "text": text,
387
  "df_annotated_dict": df_annotated.to_dict(),
388
+ "df_annotated_combined_dict": dict(),
389
+ "KGchoices": KGchoices,
390
+ "ModelsSelection": ModelsSelection,
391
+ "ScoreFilt": ScoreFilt,
392
+ "EntityLinking": EntityLinking,
393
+ "html_output": ""
394
  }
395
 
396
 
 
402
  df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])]
403
  if df_annotated.empty and quoted_text==False:
404
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
405
+ state["html_output"] = html_output
406
  return {"text": text, "entities": []}, html_output, state, [], ""
407
 
408
  df_annotated_combined = pd.DataFrame()
 
410
  df_annotated_combined = entitiesFusion(df_annotated,args)
411
  if df_annotated_combined.empty and quoted_text==False:
412
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
413
+ state["html_output"] = html_output
414
  return {"text": text, "entities": []}, html_output, state, [], ""
415
  else:
416
  if (not df_annotated.empty):
 
581
  df_annotated_combined = df_annotated_combined[filter_mask]
582
  if df_annotated_combined.empty:
583
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
584
+ state["html_output"] = html_output
585
  return {"text": text, "entities": []}, html_output, state, [], ""
586
 
587
  ###
 
592
  df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1]
593
  if df_annotated_combined.empty:
594
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
595
+ state["html_output"] = html_output
596
  return {"text": text, "entities": []}, html_output, state, [], ""
597
 
598
  dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
 
603
 
604
  # # Create a new column for the entities with links
605
  df_annotated_combined['entity_with_link'] = df_annotated_combined.apply(
606
+ # lambda row: (
607
+ # f"<a href='https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={row['namedEntity']}' target='_blank'>{row['word']}</a>"
608
+ # if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
609
+ # 'word']
610
+ # ),
611
  lambda row: (
612
+ f"<a href='https://api-vast.jrc.service.ec.europa.eu/describe//?url={row['namedEntity']}' target='_blank'>{row['word']}</a>"
613
+ if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[
614
+ 'word']
615
  ),
616
  axis=1
617
  )
 
694
  words_for_dropdown = []
695
 
696
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text_with_links}</div>"
697
+ state["html_output"] = html_output
698
 
699
  #return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
700
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""
701
 
702
  else:
703
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
704
+ state["html_output"] = html_output
705
  return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""
706
 
707
  else:
708
 
709
  html_output = f"<div class='gr-textbox' style='white-space: pre-wrap; overflow-wrap: break-word; padding: 10px; border: 1px solid #ddd; border-radius: 5px; font-family: monospace; font-size: 12px; line-height: 24px;'>{text}</div>"
710
+ state["html_output"] = html_output
711
  return {"text": text, "entities": []}, html_output, state, [], ""
712
 
713
 
 
719
  # Convert the state dictionary back into a DataFrame
720
  df = pd.DataFrame(state["df_annotated_combined_dict"])
721
 
722
+ if 'ALLURIScontext' in df.columns:
723
+ # # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
724
+ # valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
725
+ # # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
726
+ valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip())) ))]
727
+
728
+ # Check if the selected word is in the filtered DataFrame
729
+ if selected_word in valid_entries['word'].values:
730
+ urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]
731
+ if 'namedEntity' in df.columns:
732
+ firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
733
+ firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
734
+ if firsturlinlist and firsturlinlist in urls:
735
+ # Remove the URL from its current position
736
+ urls.remove(firsturlinlist)
737
+ # Insert the URL at the first position
738
+ urls.insert(0, firsturlinlist)
739
+
740
+ # Convert list of URLs to HTML string with clickable links
741
+ #html_links = "<br>".join([f'<a href="https://expl-rels-dev-vast.apps.ocpt.jrc.ec.europa.eu/?concept={url}" target="_blank">{url}</a>' for url in urls])
742
+ html_links = "<br>".join([f'<a href="https://api-vast.jrc.service.ec.europa.eu/describe//?url={url}" target="_blank">{url}</a>' for url in urls])
743
+ return html_links
744
+ return ""
745
+ else:
746
+ return ""
747
+
748
 
749
  else:
750
  return ""
 
828
  text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
829
  models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
830
  categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
831
+ score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.05, label="Score", value=0.75)
832
  nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
833
  kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
834
  state = gr.State(value={})
 
884
 
885
 
886
  demo.launch()
887
+ #demo.launch(share=True) # Share your demo with just 1 extra parameter
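Both the dropdown population and update_urls above filter rows on the same dense ALLURIScontext condition: the value must be a non-empty list and must not be just a single blank string. A small standalone sketch of that predicate, using a hypothetical helper name:

def has_valid_uris(x):
    # Equivalent to the ALLURIScontext lambda used in the diff: keep only
    # non-empty lists that are not a single blank string.
    return (
        isinstance(x, list)
        and len(x) > 0
        and not (len(x) == 1 and not str(x[0]).strip())
    )

assert has_valid_uris(["http://example.org/concept/1"])
assert not has_valid_uris(None)
assert not has_valid_uris([])
assert not has_valid_uris([" "])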
nerBio.py CHANGED
@@ -65,7 +65,8 @@ import json
65
  import random
66
  import numpy as np
67
 
68
- from retrieverRAG_testing import RAG_retrieval_Base, RAG_retrieval_Z_scores, RAG_retrieval_Percentile, RAG_retrieval_TopK
 
69
 
70
  from joblib import Memory
71
 
@@ -957,135 +958,265 @@ def getLinearTextualContextFromTriples(word,labelTriplesLIST, text_splitter, arg
957
  word = word.lower()
958
  word = word.capitalize()
959
 
 
960
 
961
- if (strtobool(args.UseRetrieverForContextCreation)==True):
962
  labelTriples = ""
963
- passages = []
964
- nn = 200
965
 
966
- if len(labelTriplesLIST)<=nn:
 
967
  passages = []
968
  for i, triple in enumerate(labelTriplesLIST, start=1):
969
  # for triple in labelTriplesLIST:
970
  TriplesString = (" ".join(str(element).capitalize() for element in triple))
971
  passages.append(TriplesString)
972
 
973
- df_retrieved = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
974
- min_threshold=0.7)
 
 
 
 
 
 
 
 
 
975
 
976
- if not df_retrieved.empty:
977
- #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
978
- labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
979
- labelTriplesAPP = ". ".join(
980
- " ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
981
 
982
- if not labelTriples:
983
- labelTriples = labelTriplesAPP
984
- else:
985
- labelTriples = labelTriples + ". " + labelTriplesAPP
986
 
987
- else:
 
 
988
 
989
- OverallListRAGtriples = labelTriplesLIST.copy()
990
-
991
- while len(OverallListRAGtriples)>nn:
992
- Oinnerlistiterative=[]
993
- for i, triple in enumerate(OverallListRAGtriples, start=1):
994
- # for triple in labelTriplesLIST:
995
- TriplesString = (" ".join(str(element).capitalize() for element in triple))
996
- passages.append(TriplesString)
997
- # Check if the current index is a multiple of nn
998
- if i % nn == 0:
999
- # print("elaborate RAG triples")
1000
-
1001
- # df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1002
- # df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1003
- # df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1004
- df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
1005
- min_threshold=0.7)
1006
-
1007
- passages = []
1008
-
1009
- df_retrieved = df_retrievedtopk.copy()
1010
- if not df_retrieved.empty:
1011
- #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1012
- labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1013
- if not Oinnerlistiterative:
1014
- Oinnerlistiterative=labelTriplesLIST_RAGGED
1015
- else:
1016
- Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1017
-
1018
- if passages:
1019
- df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
1020
- min_threshold=0.7)
1021
-
1022
- df_retrieved = df_retrievedtopk.copy()
1023
- if not df_retrieved.empty:
1024
- #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1025
- labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1026
- if not Oinnerlistiterative:
1027
- Oinnerlistiterative = labelTriplesLIST_RAGGED
1028
- else:
1029
- Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1030
-
1031
- OverallListRAGtriples = Oinnerlistiterative.copy()
1032
-
1033
- if OverallListRAGtriples:
1034
- labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in OverallListRAGtriples)
1035
-
1036
- if not labelTriples:
1037
- labelTriples = labelTriplesAPP
1038
  else:
1039
- labelTriples = labelTriples + ". " + labelTriplesAPP
 
 
1040
 
1041
- labelTriples = labelTriples.strip().replace("..", ".").strip()
 
 
1042
 
1043
 
 
 
 
 
 
 
 
 
 
 
1044
 
1045
- # labelTriples = ""
1046
- # passages = []
1047
- # nn=200
1048
- # for i, triple in enumerate(labelTriplesLIST, start=1):
1049
- # #for triple in labelTriplesLIST:
1050
- # TriplesString = (" ".join(str(element).capitalize() for element in triple))
1051
- # passages.append(TriplesString)
1052
- # # Check if the current index is a multiple of nn
1053
- # if i % nn == 0:
1054
- # #print("elaborate RAG triples")
1055
- #
1056
- # #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1057
- # #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1058
- # #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1059
- # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1060
- #
1061
- # passages = []
1062
- #
1063
- # df_retrieved = df_retrievedtopk.copy()
1064
- # if not df_retrieved.empty:
1065
- # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1066
- # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1067
- # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1068
- #
1069
- # if not labelTriples:
1070
- # labelTriples =labelTriplesAPP
1071
- # else:
1072
- # labelTriples = labelTriples + ". " + labelTriplesAPP
1073
- #
1074
- # if passages:
1075
- # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1076
- #
1077
- # df_retrieved = df_retrievedtopk.copy()
1078
- # if not df_retrieved.empty:
1079
- # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1080
- # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1081
- # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1082
- # if not labelTriples:
1083
- # labelTriples = labelTriplesAPP
1084
- # else:
1085
- # labelTriples = labelTriples + ". " + labelTriplesAPP
1086
- #
1087
- # if labelTriples:
1088
- # labelTriples.strip().replace("..",".").strip()
1089
 
1090
 
1091
  else: # NO RAG on triples
@@ -1571,7 +1702,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1571
 
1572
  if entityBioeUrl:
1573
 
1574
- if strtobool(args.computeEntityContext):
1575
 
1576
  if strtobool(args.debug):
1577
  print("START computeEntityContext")
@@ -1706,6 +1837,8 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1706
 
1707
 
1708
  if not globalContext:
 
 
1709
  if unique_listGlobalTriples:
1710
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listGlobalTriples,
1711
  text_splitter, args,
@@ -1750,6 +1883,7 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1750
  return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
1751
 
1752
 
 
1753
  if not ALLURIScontext:
1754
  # Print the error message to stderr
1755
  print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
@@ -1879,9 +2013,16 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1879
  if unique_listLabelTriples:
1880
  unique_listGlobalTriples.extend(unique_listLabelTriples)
1881
 
1882
- # If I want to speed up, I can break here, but in this case I will not store the triples for the other uris in the cache, which maybe useful in the future
1883
- # if token_counter(str(unique_listGlobalTriples),args.model_name) > args.tokens_max:
1884
- # break # BREAK THE FOR LOOP IF THE GLOBAL CONTEXT IS ALREADY TOO BIG, BIGGER THAN tokens_max
 
1885
 
1886
 
1887
  if unique_listGlobalTriples:
@@ -1892,7 +2033,8 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1892
  if cache_map_virtuoso is not None:
1893
  if not word in cache_map_virtuoso:
1894
  cache_map_virtuoso[word] = {}
1895
- cache_map_virtuoso[word][("GlobalTriples"+" "+contextWordVirtuoso).strip()] = unique_listGlobalTriples
 
1896
 
1897
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word,
1898
  unique_listGlobalTriples,
@@ -1903,7 +2045,8 @@ def virtuoso_api_call(word, text_splitter, args, key_virtuoso, cache_map_virtuos
1903
  if cache_map_virtuoso is not None:
1904
  if not word in cache_map_virtuoso:
1905
  cache_map_virtuoso[word] = {}
1906
- cache_map_virtuoso[word][("GlobalContext"+" "+contextWordVirtuoso).strip()] = globalContext
 
1907
 
1908
  if unique_listLabelTriples:
1909
  sssingleTriples = " ,., ".join(
@@ -2291,6 +2434,8 @@ if __name__ == '__main__':
2291
 
2292
  parser.add_argument("--computeEntityContext", type=str, default="False", help="whether to extract a readable context from the extracted triples for the concept")
2293
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False", help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
 
 
2294
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
2295
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
2296
 
 
65
  import random
66
  import numpy as np
67
 
68
+ #from retrieverRAG_testing import RAG_retrieval_Base, RAG_retrieval_Z_scores, RAG_retrieval_Percentile, RAG_retrieval_TopK, retrievePassageSimilarities
69
+ from retrieverRAG_SF import RAG_retrieval_Base
70
 
71
  from joblib import Memory
72
 
 
958
  word = word.lower()
959
  word = word.capitalize()
960
 
961
+ labelTriples=""
962
+
963
+ if labelTriplesLIST and getattr(args, 'maxTriplesContextComputation', None): # it means it exists
964
+ if args.maxTriplesContextComputation > 0:
965
+ if len(labelTriplesLIST) > args.maxTriplesContextComputation:
966
+ labelTriplesLIST = labelTriplesLIST[:args.maxTriplesContextComputation]
967
+
968
+ if (strtobool(args.UseRetrieverForContextCreation) == True):
969
+
970
+ # if strtobool(args.debug):
971
+ # print("Start reranking - num passages : ", len(labelTriplesLIST), "\n")
972
+ # startRerank = time.time()
973
+ #
974
+ # labelTriples = ""
975
+ # passages = []
976
+ # nn = 200
977
+ #
978
+ # OverallListRAGtriples = []
979
+ # labelTriplesLIST_RAGGED = []
980
+ #
981
+ # if len(labelTriplesLIST) <= nn:
982
+ # passages = []
983
+ # for i, triple in enumerate(labelTriplesLIST, start=1):
984
+ # # for triple in labelTriplesLIST:
985
+ # TriplesString = (" ".join(str(element).capitalize() for element in triple))
986
+ # passages.append(TriplesString)
987
+ #
988
+ # df_retrieved = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
989
+ # min_threshold=0.7)
990
+ #
991
+ # if not df_retrieved.empty:
992
+ # # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
993
+ # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
994
+ # labelTriplesAPP = ". ".join(
995
+ # " ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
996
+ #
997
+ # if not labelTriples:
998
+ # labelTriples = labelTriplesAPP
999
+ # else:
1000
+ # labelTriples = labelTriples + ". " + labelTriplesAPP
1001
+ #
1002
+ # else:
1003
+ #
1004
+ # OverallListRAGtriples = labelTriplesLIST.copy()
1005
+ #
1006
+ # while len(OverallListRAGtriples) > nn:
1007
+ # Oinnerlistiterative = []
1008
+ # for i, triple in enumerate(OverallListRAGtriples, start=1):
1009
+ # # for triple in labelTriplesLIST:
1010
+ # TriplesString = (" ".join(str(element).capitalize() for element in triple))
1011
+ # passages.append(TriplesString)
1012
+ # # Check if the current index is a multiple of nn
1013
+ # if i % nn == 0:
1014
+ # # print("elaborate RAG triples")
1015
+ #
1016
+ # # df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1017
+ # # df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1018
+ # # df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1019
+ # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1,
1020
+ # max_num_passages=20,
1021
+ # min_threshold=0.7)
1022
+ #
1023
+ # passages = []
1024
+ #
1025
+ # df_retrieved = df_retrievedtopk.copy()
1026
+ # if not df_retrieved.empty:
1027
+ # # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1028
+ # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1029
+ # if not Oinnerlistiterative:
1030
+ # Oinnerlistiterative = labelTriplesLIST_RAGGED
1031
+ # else:
1032
+ # Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1033
+ #
1034
+ # if passages:
1035
+ # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20,
1036
+ # min_threshold=0.7)
1037
+ #
1038
+ # df_retrieved = df_retrievedtopk.copy()
1039
+ # if not df_retrieved.empty:
1040
+ # # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1041
+ # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1042
+ # if not Oinnerlistiterative:
1043
+ # Oinnerlistiterative = labelTriplesLIST_RAGGED
1044
+ # else:
1045
+ # Oinnerlistiterative.extend(labelTriplesLIST_RAGGED)
1046
+ #
1047
+ # OverallListRAGtriples = Oinnerlistiterative.copy()
1048
+ #
1049
+ # if OverallListRAGtriples:
1050
+ # labelTriplesAPP = ". ".join(
1051
+ # " ".join(str(element).capitalize() for element in triple) for triple in OverallListRAGtriples)
1052
+ #
1053
+ # if not labelTriples:
1054
+ # labelTriples = labelTriplesAPP
1055
+ # else:
1056
+ # labelTriples = labelTriples + ". " + labelTriplesAPP
1057
+ #
1058
+ # labelTriples = labelTriples.strip().replace("..", ".").strip()
1059
+ #
1060
+ # # labelTriples = ""
1061
+ # # passages = []
1062
+ # # nn=200
1063
+ # # for i, triple in enumerate(labelTriplesLIST, start=1):
1064
+ # # #for triple in labelTriplesLIST:
1065
+ # # TriplesString = (" ".join(str(element).capitalize() for element in triple))
1066
+ # # passages.append(TriplesString)
1067
+ # # # Check if the current index is a multiple of nn
1068
+ # # if i % nn == 0:
1069
+ # # #print("elaborate RAG triples")
1070
+ # #
1071
+ # # #df_retrieved_Base = RAG_retrieval_Base(questionText, passages, min_threshold=0.7, max_num_passages=20)
1072
+ # # #df_retrievedZscore = RAG_retrieval_Z_scores(questionText, passages, z_threshold=1.0, max_num_passages=20, min_threshold=0.7)
1073
+ # # #df_retrievedPercentile = RAG_retrieval_Percentile(questionText, passages, percentile=90, max_num_passages=20, min_threshold=0.7)
1074
+ # # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1075
+ # #
1076
+ # # passages = []
1077
+ # #
1078
+ # # df_retrieved = df_retrievedtopk.copy()
1079
+ # # if not df_retrieved.empty:
1080
+ # # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1081
+ # # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1082
+ # # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1083
+ # #
1084
+ # # if not labelTriples:
1085
+ # # labelTriples =labelTriplesAPP
1086
+ # # else:
1087
+ # # labelTriples = labelTriples + ". " + labelTriplesAPP
1088
+ # #
1089
+ # # if passages:
1090
+ # # df_retrievedtopk = RAG_retrieval_TopK(questionText, passages, top_fraction=0.1, max_num_passages=20, min_threshold=0.7)
1091
+ # #
1092
+ # # df_retrieved = df_retrievedtopk.copy()
1093
+ # # if not df_retrieved.empty:
1094
+ # # #labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1095
+ # # labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1096
+ # # labelTriplesAPP = ". ".join(" ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1097
+ # # if not labelTriples:
1098
+ # # labelTriples = labelTriplesAPP
1099
+ # # else:
1100
+ # # labelTriples = labelTriples + ". " + labelTriplesAPP
1101
+ # #
1102
+ # # if labelTriples:
1103
+ # # labelTriples.strip().replace("..",".").strip()
1104
+ #
1105
+ # if strtobool(args.debug):
1106
+ # numfinal = 0
1107
+ # if OverallListRAGtriples:
1108
+ # numfinal = len(OverallListRAGtriples)
1109
+ # elif labelTriplesLIST_RAGGED:
1110
+ # numfinal = len(labelTriplesLIST_RAGGED)
1111
+ # print("End reranking - found final passages : ", numfinal, "\n")
1112
+ # #
1113
+ # endRerank = time.time()
1114
+ # hours, rem = divmod(endRerank - startRerank, 3600)
1115
+ # minutes, seconds = divmod(rem, 60)
1116
+ # print("Rerank Time... {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds))
1117
+ # #
1118
+
1119
+ # if len(labelTriplesLIST) > 10000:
1120
+ # print("debug")
1121
+
1122
+ if strtobool(args.debug):
1123
+ print("Start reranking2 - num passages : ", len(labelTriplesLIST), "\n")
1124
+ startRerank2 = time.time()
1125
 
 
1126
  labelTriples = ""
 
 
1127
 
1128
+ try:
1129
+
1130
  passages = []
1131
  for i, triple in enumerate(labelTriplesLIST, start=1):
1132
  # for triple in labelTriplesLIST:
1133
  TriplesString = (" ".join(str(element).capitalize() for element in triple))
1134
  passages.append(TriplesString)
1135
 
1136
+ nback = 1
1137
+ if len(passages) <= 10:
1138
+ nback = len(passages)
1139
+ elif len(passages) <= 1000:
1140
+ nback = 10+int(0.1 * len(passages)) # 10% of the number of passages
1141
+ elif len(passages) <= 5000:
1142
+ nback = 200
1143
+ elif len(passages) <= 10000:
1144
+ nback = 300
1145
+ else:
1146
+ nback = 400
1147
 
1148
+ df_retrieved = RAG_retrieval_Base(questionText, passages, min_threshold=0, max_num_passages=nback)
 
 
 
 
1149
 
1150
+ if not df_retrieved.empty:
 
 
 
1151
 
1152
+ countRetr = 0
1153
+ min_threshold = 0.80
1154
+ countRetr = (df_retrieved['score'] > min_threshold).sum()
1155
 
1156
+ countRetrThreshold = int(nback / 2)
1157
+ if nback > 10:
1158
+ countRetrThreshold = 10
 
 
1159
  else:
1160
+ countRetrThreshold = int(nback/2)
1161
+ if countRetrThreshold <=0:
1162
+ countRetrThreshold = 1
1163
+
1164
+ while countRetr <= countRetrThreshold:
1165
+ min_threshold = min_threshold - 0.05
1166
+ countRetr = (df_retrieved['score'] >= min_threshold).sum()
1167
+ if min_threshold < 0.2:
1168
+ break
1169
+
1170
+ # countRetrThreshold = int(0.1 + nback)
1171
+ # if countRetrThreshold > 5:
1172
+ # countRetrThreshold = 5
1173
+ #
1174
+ # countRetr=0
1175
+ # min_threshold = 0.90
1176
+ # countRetr = (df_retrieved['score'] > min_threshold).sum()
1177
+ # while countRetr<=countRetrThreshold:
1178
+ # min_threshold = min_threshold - 0.05
1179
+ # if min_threshold<0.7:
1180
+ # countRetrThreshold=0
1181
+ # if min_threshold == 0:
1182
+ # min_threshold = 0.01
1183
+ # countRetr = (df_retrieved['score'] > min_threshold).sum()
1184
+ # if min_threshold <= 0.01:
1185
+ # break
1186
+
1187
+ if countRetr > 0:
1188
+ df_retrieved = df_retrieved[df_retrieved['score'] > min_threshold]
1189
+
1190
+ # labelTriplesLIST_RAGGED = df_retrieved.to_records(index=False).tolist()
1191
+ labelTriplesLIST_RAGGED = df_retrieved['Passage'].apply(lambda x: (x,)).tolist()
1192
+ labelTriplesAPP = ". ".join(
1193
+ " ".join(str(element).capitalize() for element in triple) for triple in labelTriplesLIST_RAGGED)
1194
+
1195
+ if not labelTriples:
1196
+ labelTriples = labelTriplesAPP
1197
+ else:
1198
+ labelTriples = labelTriples + ". " + labelTriplesAPP
1199
 
1200
+ else:
1201
+ labelTriplesLIST_RAGGED = []
1202
+ labelTriples = ""
1203
 
1204
 
1205
+ if strtobool(args.debug):
1206
+ numfinal = 0
1207
+ if labelTriplesLIST_RAGGED:
1208
+ numfinal = len(labelTriplesLIST_RAGGED)
1209
+ print("End reranking2 - found final passages : ", numfinal, "\n")
1210
+ endRerank2 = time.time()
1211
+ hours, rem = divmod(endRerank2 - startRerank2, 3600)
1212
+ minutes, seconds = divmod(rem, 60)
1213
+ print("Rerank2 Time... {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds))
1214
+ #
1215
 
1216
+ except Exception as err:
1217
+ print("SOMETHING HAPPENED on PASSAGE RERANKING for Question :"+questionText+"\n")
1218
+ print(err)
1219
+ #status_code: 422, body: type='validation_error' url='https://www.mixedbread.ai/api-reference' message='Your request is invalid. Please check your input and try again.' details=[[{'type': 'too_long', 'loc': ['body', 'input', 'list[str]'], 'msg': 'List should have at most 1000 items after validation, not 4249',
 
 
1220
 
1221
 
1222
  else: # NO RAG on triples
 
1702
 
1703
  if entityBioeUrl:
1704
 
1705
+ if strtobool(args.computeEntityContext) and (strtobool(args.computeEntityGlobalContext)==False):
1706
 
1707
  if strtobool(args.debug):
1708
  print("START computeEntityContext")
 
1837
 
1838
 
1839
  if not globalContext:
1840
+
1841
+ BreakenBeforeAll = False
1842
  if unique_listGlobalTriples:
1843
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word, unique_listGlobalTriples,
1844
  text_splitter, args,
 
1883
  return None, None, None, None, None, None, cache_map_virtuoso, load_map_query_input_output
1884
 
1885
 
1886
+
1887
  if not ALLURIScontext:
1888
  # Print the error message to stderr
1889
  print("THIS CASE SHOULD NEVER HAPPEN NOW!!!! Check what's happening...exiting now...")
 
2013
  if unique_listLabelTriples:
2014
  unique_listGlobalTriples.extend(unique_listLabelTriples)
2015
 
2016
+
2017
+ # This is done to speed up: break the global loop here, but in this case the triples for the other URIs are not stored in the cache, which may be useful in the future
2018
+ # #if token_counter(str(unique_listGlobalTriples),args.model_name) > args.tokens_max:
2019
+
2020
+ if getattr(args, 'maxTriplesContextComputation', None): #it means it exists
2021
+ if args.maxTriplesContextComputation > 0:
2022
+ if len(unique_listGlobalTriples) > args.maxTriplesContextComputation:
2023
+ unique_listGlobalTriples = unique_listGlobalTriples[:args.maxTriplesContextComputation]
2024
+ BreakenBeforeAll = True
2025
+ break # BREAK THE FOR LOOP IF THE GLOBAL CONTEXT IS ALREADY TOO BIG, BIGGER THAN tokens_max
2026
 
2027
 
2028
  if unique_listGlobalTriples:
 
2033
  if cache_map_virtuoso is not None:
2034
  if not word in cache_map_virtuoso:
2035
  cache_map_virtuoso[word] = {}
2036
+ if BreakenBeforeAll == False:
2037
+ cache_map_virtuoso[word][("GlobalTriples"+" "+contextWordVirtuoso).strip()] = unique_listGlobalTriples
2038
 
2039
  globalContext, load_map_query_input_output = getLinearTextualContextFromTriples(word,
2040
  unique_listGlobalTriples,
 
2045
  if cache_map_virtuoso is not None:
2046
  if not word in cache_map_virtuoso:
2047
  cache_map_virtuoso[word] = {}
2048
+ if BreakenBeforeAll == False:
2049
+ cache_map_virtuoso[word][("GlobalContext"+" "+contextWordVirtuoso).strip()] = globalContext
2050
 
2051
  if unique_listLabelTriples:
2052
  sssingleTriples = " ,., ".join(
 
2434
 
2435
  parser.add_argument("--computeEntityContext", type=str, default="False", help="whether to extract a readable context from the extracted triples for the concept")
2436
  parser.add_argument("--computeEntityGlobalContext", type=str, default="False", help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
2437
+ parser.add_argument("--maxTriplesContextComputation", type=int, default=20000,
2438
+ help="maximum number of triples to consider for global context computation") # if 0 or None it is not considered
2439
  parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
2440
  help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
2441
 
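The new reranking path in getLinearTextualContextFromTriples sizes the number of passages requested from the reranker by corpus size (nback) and then lowers the score cut-off in 0.05 steps until enough passages pass, before joining the survivors into the context string. A compact sketch of that selection step, assuming a pandas DataFrame with the score and Passage columns that RAG_retrieval_Base returns:

import pandas as pd

def select_passages(df_retrieved, nback):
    # Start from a high score cut-off and relax it in 0.05 steps until enough
    # passages survive, mirroring the adaptive loop added above.
    target = 10 if nback > 10 else max(int(nback / 2), 1)
    min_threshold = 0.80
    count = (df_retrieved["score"] > min_threshold).sum()
    while count <= target and min_threshold >= 0.2:
        min_threshold -= 0.05
        count = (df_retrieved["score"] >= min_threshold).sum()
    if count == 0:
        return df_retrieved.iloc[0:0]  # nothing scored high enough
    return df_retrieved[df_retrieved["score"] > min_threshold]

# Example with made-up scores:
df = pd.DataFrame({"Passage": ["a", "b", "c"], "score": [0.9, 0.6, 0.3]})
print(select_passages(df, nback=3))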
retrieverRAG_SF.py ADDED
@@ -0,0 +1,114 @@
1
+
2
+ # https://www.mixedbread.ai/blog/mxbai-embed-large-v1
3
+ # https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1
4
+
5
+ import os
6
+ import time
7
+ import pandas as pd
8
+ import numpy as np
9
+ from typing import Dict
10
+
11
+ import torch
12
+ from transformers import AutoModel, AutoTokenizer
13
+ from sentence_transformers.util import cos_sim
14
+ from accelerate import Accelerator # Import from accelerate
15
+ from scipy.stats import zscore
16
+
17
+ # Set up environment variables for Hugging Face caching
18
+ os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
19
+ os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
20
+ os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub"
21
+
22
+ # Initialize the Accelerator
23
+ accelerator = Accelerator()
24
+
25
+ # Use the device managed by Accelerator
26
+ device = accelerator.device
27
+ print("Using accelerator device =", device)
28
+
29
+
30
+ from sentence_transformers import CrossEncoder
31
+ model_sf_mxbai = CrossEncoder("mixedbread-ai/mxbai-rerank-large-v1" ,device=device)
32
+
33
+
34
+
35
+
36
+
37
+ def RAG_retrieval_Base(queryText ,passages, min_threshold=0.0, max_num_passages=None):
38
+
39
+ # # Example query
40
+ # query = "What is the capital of France?"
41
+ #
42
+ # # Example passages
43
+ # ppppassages = [
44
+ # "This is the first passage.",
45
+ # "The capital of France is Paris.",
46
+ # "This is the third passage.",
47
+ # "Paris is a beautiful city.",
48
+ # "The Eiffel Tower is in Paris."
49
+ # ]
50
+ #
51
+ # # Rank the passages with respect to the query
52
+ # ranked_passages = model_sf_mxbai.rank(query, ppppassages)
53
+
54
+ try:
55
+
56
+ df_filtered = pd.DataFrame()
57
+
58
+ if max_num_passages:
59
+ result_rerank = model_sf_mxbai.rank(queryText, passages, return_documents=False, top_k=max_num_passages)
60
+ else:
61
+ nback =int(0.1 *len(passages)) # 10% of the number of passages
62
+ if nback<=0:
63
+ nback=1
64
+ result_rerank = model_sf_mxbai.rank(queryText, passages, return_documents=False, top_k=nback)
65
+
66
+ if result_rerank:
67
+ df = pd.DataFrame(result_rerank) # corpus_id, score
68
+
69
+ if min_threshold >0:
70
+ df_filtered = df[df['score'] >= min_threshold]
71
+ else:
72
+ df_filtered =df.copy()
73
+
74
+ selected_passages = [passages[i] for i in df_filtered['corpus_id']]
75
+
76
+ # Add the selected passages as a new column "Passage" to the DataFrame
77
+ df_filtered['Passage'] = selected_passages
78
+
79
+ df_filtered = df_filtered.drop_duplicates(subset='Passage', keep='first')
80
+
81
+ # df_filtered = df_filtered.sort_values(by='score', ascending=False)
82
+
83
+ # Return the filtered DataFrame
84
+ return df_filtered
85
+
86
+ except Exception as e:
87
+ # Log the exception message or handle it as needed
88
+ print(f"An error occurred: {e}")
89
+ return pd.DataFrame() # Return an empty DataFrame in case of error
90
+
91
+
92
+
93
+
94
+
95
+ if __name__ == '__main__':
96
+
97
+ queryText = 'A man is eating a piece of bread'
98
+
99
+ # Define the passages list
100
+ passages = [
101
+ "A man is eating food.",
102
+ "A man is eating pasta.",
103
+ "The girl is carrying a baby.",
104
+ "A man is riding a horse.",
105
+ ]
106
+
107
+ df_retrieved = RAG_retrieval_Base(queryText, passages, min_threshold=0, max_num_passages=3)
108
+
109
+
110
+ print(df_retrieved)
111
+
112
+
113
+ print("end of computations")
114
+
virtuosoQueryRest.py CHANGED
@@ -3,10 +3,14 @@ from requests.auth import HTTPDigestAuth, HTTPBasicAuth
3
  import ssl
4
  import json
5
 
 
6
 
 
 
7
 
8
 
9
 
 
10
  def execute_query(endpoint, query, auth):
11
  headers = {
12
  'Content-Type': 'application/x-www-form-urlencoded',
 
3
  import ssl
4
  import json
5
 
6
+ from joblib import Memory
7
 
8
+ cachedir = 'cached'
9
+ mem = Memory(cachedir, verbose=False)
10
 
11
 
12
 
13
+ @mem.cache
14
  def execute_query(endpoint, query, auth):
15
  headers = {
16
  'Content-Type': 'application/x-www-form-urlencoded',
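virtuosoQueryRest.py now wraps execute_query with joblib's Memory, so identical SPARQL requests are answered from the on-disk 'cached' folder instead of hitting the endpoint again. A self-contained illustration of that caching behaviour, using a stand-in function and a placeholder endpoint URL rather than the real execute_query:

from joblib import Memory

mem = Memory("cached", verbose=0)
calls = {"n": 0}

@mem.cache
def fake_query(endpoint, query):
    # Stand-in for the real HTTP request to the Virtuoso endpoint.
    calls["n"] += 1
    return {"endpoint": endpoint, "query": query}

q = "SELECT * WHERE {?s ?p ?o} LIMIT 1"
fake_query("http://localhost:8890/sparql", q)   # executes and writes to ./cached
fake_query("http://localhost:8890/sparql", q)   # read back from disk, no new call
print(calls["n"])        # 1
mem.clear(warn=False)    # wipe the on-disk cache when stale results must be avoided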