"""Gradio demo: retrieve the hadith matns most similar to a query text.

At import time the script downloads the matn embeddings and the hadith
metadata from the Hugging Face Hub, joins them into one DataFrame, loads the
sentence-embedding model and launches a small Gradio UI.
"""
import os
import re

import gradio as gr
import numpy as np
import pandas as pd
import pyarabic.araby as araby
from datasets import Dataset
from datasets import Features
from datasets import Value
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Hugging Face access token, read from the environment (None when unset).
Secret_token = os.getenv('HF_token')

# --- Load the raw tables -----------------------------------------------------
dataset = load_dataset("FDSRashid/embed_matn", token=Secret_token)
books = load_dataset(
    'FDSRashid/Hadith_info', data_files='Books.csv', token=Secret_token
)['train'].to_pandas()
df = dataset["train"].to_pandas()

# Read every All_Matns.csv column as a string; numeric columns are derived
# explicitly below.
features = Features({
    'matn': Value('string'),
    'taraf_ID': Value('string'),
    'bookid_hadithid': Value('string'),
})
dataset = load_dataset(
    "FDSRashid/hadith_info",
    data_files='All_Matns.csv',
    token=Secret_token,
    features=features,
)
matn_info = dataset['train'].to_pandas()

# --- Clean and enrich the matn metadata ---------------------------------------
# Two rows are removed by index label.
# NOTE(review): the reason is not visible here -- presumably malformed rows;
# confirm against the dataset.
matn_info = matn_info.drop(97550)
matn_info = matn_info.drop(307206)

# 'KeyAbsent' marks a missing taraf id; map it to -1 so the column can be
# cast to int.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)

# 'bookid_hadithid' is split on '_': first field -> Book_ID, second field ->
# Hadith Number.
matn_info['Book_ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))
matn_info = pd.merge(matn_info, books, on='Book_ID')

# --- Attach the embeddings ------------------------------------------------------
# Only bring over the embedding-table columns that matn_info does not already
# have, so the join produces no duplicated column names.
cols_to_use = df.columns.difference(matn_info.columns)
# NOTE(review): the column-on-column merge with `books` above yields a fresh
# 0..N-1 index, so `left_index=True` matches on the row position after that
# merge, not on the original All_Matns row label -- confirm this is what
# '__index_level_0__' refers to.
joined_df = matn_info.merge(df[cols_to_use], left_index=True, right_on='__index_level_0__')
df = joined_df.copy()

model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
# One row per hadith; compared against the query embedding in
# find_most_similar_matn.
arr = np.array(df['embed'].to_list())


def find_most_similar_matn(text, n):
    """Return the ``n`` rows of ``df`` whose embeddings are closest to ``text``.

    Diacritics are stripped from ``text`` before it is encoded with ``model``;
    cosine similarity is then computed against every row of ``arr``.

    Args:
        text: Query text.
        n: Number of rows to return. Coerced with ``int()`` because the UI
            slider may deliver a float; values <= 0 yield an empty result.

    Returns:
        A DataFrame with the columns Book_Name, matn, taraf_ID, Book_ID,
        Hadith Number, Author and Similarity, ordered by ascending similarity
        (the best match is the last row).
    """
    # A float cannot be used as a slice bound, so normalise first.
    n = int(n)
    embed_text = model.encode(araby.strip_diacritics(text))
    cos_sim = cosine_similarity(embed_text.reshape(1, -1), arr)
    order = np.argsort(cos_sim)[0]
    # `order[-0:]` would select every row, so handle n <= 0 explicitly.
    indices = order[-n:] if n > 0 else order[:0]
    # Copy so adding a column never writes into (a view of) the shared `df`.
    matns = df.iloc[indices].copy()
    matns['Similarity'] = cos_sim[0][indices]
    return matns[['Book_Name', 'matn', 'taraf_ID', 'Book_ID', 'Hadith Number', 'Author', 'Similarity']]


with gr.Blocks() as demo:
    text_input = gr.Textbox()
    # step=1 keeps the slider on whole numbers, since the value is a row count.
    num_hadith = gr.Slider(
        1,
        50,
        value=5,
        step=1,
        label='Num Hadith',
        info='Choose the number of Hadith to Return',
    )
    text_output = gr.DataFrame()
    text_button = gr.Button("Retrieve")
    text_button.click(find_most_similar_matn, inputs=[text_input, num_hadith], outputs=text_output)

demo.launch()