Spaces:

FDSRashid
/

Taraf_Semantic_Similarity_Visualizer

Running

File size: 2,510 Bytes

a110314
 
 
 
 
 
913f06b
d199ed5
a110314
d199ed5
 
 
 
bdfb520
a110314
bdfb520
0da7061
a110314
 
c068b81
 
 
 
 
8f36d42
c068b81
cc6b2ae
37a7276
cc6b2ae
ab0e6d7
 
 
 
16bb533
ab0e6d7
c068b81
16bb533
c068b81
913f06b
fbd7946
9412d3c
fbd7946
913f06b
 
9bbd6fb
 
 
 
 
1f77093
d199ed5
a110314
 
913f06b
a110314
 
5ad39da
a110314


import numpy as np
import gradio as gr
import os
import pandas as pd
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Features, Value
import plotly.express as px

# Explicit schema for All_Matns.csv: every column is loaded as a string.
# 'taraf_ID' is converted to int further down, after its 'KeyAbsent'
# placeholder has been replaced.
features = Features({'matn': Value('string'), 'taraf_ID': Value('string'), 'bookid_hadithid': Value('string')})


# Hugging Face access token read from the environment; os.getenv returns
# None when 'HF_token' is not set.
Secret_token = os.getenv('HF_token')

# `df` holds the "train" split of FDSRashid/embed_matn; `books` holds Books.csv.
dataset = load_dataset("FDSRashid/embed_matn", token = Secret_token)
books = load_dataset('FDSRashid/Hadith_info', data_files='Books.csv', token=Secret_token)['train'].to_pandas()
df = dataset["train"].to_pandas()

# NOTE: `dataset` is rebound here; the embeddings were already extracted into `df`.
dataset = load_dataset("FDSRashid/hadith_info", data_files = 'All_Matns.csv',token = Secret_token, features = features)
matn_info = dataset['train'].to_pandas()
# Drop two rows by index label.
# NOTE(review): the reason for these specific labels is not visible here —
# presumably malformed rows; confirm they are still correct if the CSV changes.
matn_info = matn_info.drop(97550)
matn_info = matn_info.drop(307206)
# Map the 'KeyAbsent' placeholder to -1 so the column can be cast to int.
matn_info['taraf_ID'] = matn_info['taraf_ID'].replace('KeyAbsent', -1)
matn_info['taraf_ID'] = matn_info['taraf_ID'].astype(int)

# 'bookid_hadithid' is split on '_': first part -> Book_ID, second -> Hadith Number.
matn_info['Book_ID'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[0]))
matn_info['Hadith Number'] = matn_info['bookid_hadithid'].apply(lambda x: int(x.split('_')[1]))
# pd.merge defaults to an inner join, so rows whose Book_ID has no match in
# `books` are dropped here.
matn_info = pd.merge(matn_info, books, on='Book_ID')


# reset_index() (drop=False) adds an 'index' column to both frames.
matn_info = matn_info.reset_index()
df = df.reset_index()
# Take from `df` only the columns that `matn_info` does not already have.
cols_to_use = df.columns.difference(matn_info.columns)
# Index-on-index join: row i of matn_info is paired with row i of df.
# NOTE(review): this assumes both frames have the same row order after the
# drops and the book merge above — confirm against the datasets.
joined_df = pd.merge(matn_info,df[cols_to_use],left_index=True, right_index=True)
df = joined_df.copy()
# Largest taraf_ID present in the joined data.
taraf_max = np.max(df['taraf_ID'].unique())

def plot_similarity_score(taraf_num):
    """Build the similarity visualisations for a single taraf.

    Selects the rows of the module-level ``df`` whose ``taraf_ID`` equals
    ``taraf_num`` and computes the pairwise cosine similarity of their
    ``embed`` values (assumes each entry is an equal-length numeric
    vector — TODO confirm against the embeddings dataset).

    Args:
        taraf_num: The taraf ID to visualise.

    Returns:
        A 3-tuple ``(fig, fig_dis, table)``:
        - ``fig``: heatmap of the full cosine-similarity matrix.
        - ``fig_dis``: 20-bin histogram of the strictly-lower-triangle
          scores, i.e. each unordered pair once, self-similarity excluded.
        - ``table``: the selected rows with a 0-based ``Number`` column
          (the row/column position in the heatmap) plus descriptive columns.

    Raises:
        gr.Error: If no row has the requested taraf ID.
    """
    # Copy the selection: a new column is added below, and assigning into a
    # filtered view of the shared frame triggers SettingWithCopyWarning.
    taraf_df = df[df['taraf_ID'] == taraf_num].copy()
    if taraf_df.empty:
        # Without this guard, cosine_similarity fails on the empty input with
        # an error that means nothing to the UI user.
        raise gr.Error(f'No hadiths found for Taraf {taraf_num}.')

    taraf_df['Number'] = np.arange(len(taraf_df))
    cos_score = cosine_similarity(taraf_df['embed'].to_list())
    fig = px.imshow(cos_score)

    # k=-1 keeps only entries strictly below the diagonal, so the symmetric
    # duplicates and the diagonal are left out of the distribution.
    below_diagonal = np.tril(np.ones(cos_score.shape, dtype=bool), k=-1)
    pair_scores = cos_score[below_diagonal]  # boolean indexing is already 1-D
    fig_dis = px.histogram(
        x=pair_scores,
        title=f'Similarity Distribution for Taraf {taraf_num}',
        labels={'x': 'Similarity Score'},
        nbins=20,
        template='ggplot2',
    )
    return fig, fig_dis, taraf_df[['matn', 'Number', 'Book_Name', 'Author', 'Hadith Number']]

# UI: a slider picks the taraf ID, and the button renders the similarity
# heatmap, the score histogram and the table of matching hadiths.
with gr.Blocks() as demo:
    gr.Markdown('# Semantic Similarity  Visualizer')
    # taraf_max is a NumPy scalar; pass a plain int to the component.
    max_taraf = int(taraf_max)
    taraf_number = gr.Slider(
        1,
        max_taraf,
        # Keep the default inside the slider range on smaller datasets.
        value=min(10000, max_taraf),
        label="Taraf",
        info="Choose the Taraf to Input",
        step=1,
    )
    btn = gr.Button('Submit')
    # Output components are created after the button so they render below it,
    # in the same order as before.
    heatmap_plot = gr.Plot()
    histogram_plot = gr.Plot()
    hadith_table = gr.DataFrame(wrap=True)
    btn.click(
        fn=plot_similarity_score,
        inputs=[taraf_number],
        outputs=[heatmap_plot, histogram_plot, hadith_table],
    )

# Launch once the Blocks context has closed and the layout is complete.
demo.launch()