import gradio as gr import numpy as np from usearch.index import Index from sentence_transformers import SentenceTransformer from datasets import load_dataset from sentencex import segment from usearch.index import search, MetricKind, Matches, BatchMatches import csv HTML_Output = """
""" model = SentenceTransformer("Corran/SciGenNomicEmbed",trust_remote_code=True) rf = load_dataset("Corran/RhetoricFunctionsList")['train']['rhetoric_function'] rf = list(rf) rf_emb = model.encode(rf) def get_matches(inputs): global index, model, rf paragraph_matches = [] for input in inputs: embs = model.encode(input,batch_size=128) matches = search(rf_emb, embs, 3, MetricKind.L2sq, exact=True) sentence_matches = [] for match_ in matches: sentence_matches.append((rf[match_.key],str(round(match_.distance,2)))) paragraph_matches.append(sentence_matches) return paragraph_matches def return_rf_scores(abstract): sentences = list(segment("en", abstract)) matches = get_matches(sentences) output = HTML_Output for s,m in zip(sentences,matches): tooltip = [f"{mm[0]} : {mm[1]})