FDSRashid committed on
Commit
0627860
·
verified ·
1 Parent(s): 3bbd7f9

added cross encoder

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
@@ -7,6 +7,7 @@ from datasets import Features
7
  from datasets import Value
8
  from datasets import Dataset
9
  from sentence_transformers import SentenceTransformer
 
10
  from sklearn.metrics.pairwise import cosine_similarity
11
  import os
12
  import gradio as gr
@@ -39,14 +40,20 @@ df = joined_df.copy()
39
 
40
 
41
  model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
 
42
  arr = np.array(df['embed'].to_list())
43
 
44
  def find_most_similar_matn(text, n):
45
- embed_text = model.encode(araby.strip_diacritics(text))
 
46
  cos_sim = cosine_similarity(embed_text.reshape(1, -1), arr)
47
  indices = np.argsort(cos_sim)[0][-n:]
48
  matns = df.iloc[indices]
49
  matns['Similarity'] = cos_sim[0][indices]
 
 
 
 
50
  return matns[['Book_Name', 'matn', 'taraf_ID', 'Book_ID', 'Hadith Number', 'Author', 'Similarity']]
51
 
52
  with gr.Blocks() as demo:
 
7
  from datasets import Value
8
  from datasets import Dataset
9
  from sentence_transformers import SentenceTransformer
10
+ from sentence_transformers.cross_encoder import CrossEncoder
11
  from sklearn.metrics.pairwise import cosine_similarity
12
  import os
13
  import gradio as gr
 
40
 
41
 
42
  model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
43
+ model_CE = CrossEncoder('FDSRashid/QulBERT-CE-2.0', token=Secret_token)
44
  arr = np.array(df['embed'].to_list())
45
 
46
  def find_most_similar_matn(text, n):
47
+ prep_text = araby.strip_diacritics(text)
48
+ embed_text = model.encode(prep_text)
49
  cos_sim = cosine_similarity(embed_text.reshape(1, -1), arr)
50
  indices = np.argsort(cos_sim)[0][-n:]
51
  matns = df.iloc[indices]
52
  matns['Similarity'] = cos_sim[0][indices]
53
+ matns_prep = [araby.strip_diacritics(text) for text in matns['matn']]
54
+ to_compare = [(i, prep_text) for i in matns_prep]
55
+ is_taraf = model_CE.predict(to_compare)
56
+ matns = matns[is_taraf> .5]
57
  return matns[['Book_Name', 'matn', 'taraf_ID', 'Book_ID', 'Hadith Number', 'Author', 'Similarity']]
58
 
59
  with gr.Blocks() as demo: