Adr740 committed on
Commit
8a2e2aa
·
1 Parent(s): 06ce63f

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +46 -0
  2. data.py +2 -0
  3. get_hadiths.py +67 -0
  4. pickle_ebd.pkl +3 -0
  5. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from get_hadiths import HadithSearch
3
+ import os
4
+ from functools import partial
5
+
6
# Single shared search engine for the whole app. The OpenAI key is read from
# the "apk" environment variable; it is None when the variable is not set.
hadith_search = HadithSearch(api_key=os.getenv("apk"))
7
+
8
# Static user-facing Markdown shown by the UI.

# Page title, also used as the top-level heading.
title = "Smart Hadith"

# Short introduction rendered under the heading.
# NOTE(review): the contact address looks redacted in this copy of the file —
# confirm the intended e-mail before publishing.
desc = (
    "### This is a simple AI tool. Write your problem or situation (personal or not) "
    "and quickly find relevant hadiths on this topic or a problem you have. "
    "Just type in plain English how you feel. "
    "Contact suggestions/questions: [email protected]\n\n"
)

# Usage warning and legal disclaimer shown inside the collapsible accordion.
# The dataset URL uses plain "/" separators: the previous "\/" form is not a
# valid Python escape sequence and left literal backslashes in the link.
warning = (
    "Warning!\n **PLEASE READ THE DISCLAIMER BELOW** "
    "This isn't a 100% accurate tool; not all Hadiths are present in the database, "
    "and some results might be repetitive. "
    "If that's the case, try generating more Hadiths with the selector.\n"
    "More informations describing how the tool works are coming soon\n\n"
    "## DISCLAIMER\n\n"
    "THIS TOOL IS INTENDED FOR REFERENCE PURPOSES ONLY AND IS NOT INTENDED TO BE TAKEN AS RELIGIOUS ADVICE. "
    "THE HADITHS DISPLAYED BY THIS TOOL ARE NOT INTENDED TO BE USED AS A SOLE SOURCE OF RELIGIOUS GUIDANCE. "
    "USERS ARE RESPONSIBLE FOR CONDUCTING THEIR OWN RESEARCH AND SEEKING GUIDANCE FROM RELIGIOUS SCHOLARS.\n\n"
    "PLEASE NOTE THAT THE CONTENT DISPLAYED BY THIS TOOL IS NOT GUARANTEED TO BE ACCURATE, COMPLETE, OR UP-TO-DATE.\n\n"
    "THE DEVELOPERS OF THIS TOOL WILL NOT BE HELD RESPONSIBLE FOR ANY DECISIONS MADE BY THE USERS OF THIS TOOL "
    "THAT ARE BASED ON THE CONTENT DISPLAYED BY THIS TOOL.\n\n"
    "Hadiths gathered from this repository: https://www.kaggle.com/datasets/fahd09/hadith-dataset"
)
13
+
14
def update_results(text_area, number_to_display=10):
    """Run a hadith search for the submitted text.

    Args:
        text_area: Free-text description typed by the user.
        number_to_display: How many hadiths to return (defaults to 10).

    Returns:
        Whatever ``hadith_search.search_hadiths`` returns for these inputs.
    """
    return hadith_search.search_hadiths(text_area, number_to_display)
17
+
18
+
19
# Page layout: heading, collapsible disclaimer, query row, results panel.
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown(desc)

    # Disclaimer is collapsed by default so it does not push the form down.
    with gr.Accordion("READ BEFORE USING THE TOOL!", open=False):
        gr.Markdown(warning)

    with gr.Row():
        with gr.Column(scale=4):
            text_area = gr.Textbox(
                placeholder="I heard it was advised to drink water in three segments",
                lines=3,
                label="Describe your situation/problem with your own words (story, keywords, topics, examples etc...)",
            )
        with gr.Column(scale=1):
            # precision=0 makes the component hand an int to the callback
            # instead of a float such as 10.0.
            number_to_display = gr.Number(
                value=10,
                precision=0,
                label="Number of hadiths to display",
            )
            submit_button = gr.Button(value="Find hadiths relevant to my story")

    # Label fixed: the closing parenthesis was missing.
    with gr.Accordion("Relevant Hadiths (Rerun if results are not satisfying)"):
        response_display = gr.Markdown("Empty")

    # update_results is passed directly; wrapping it in functools.partial
    # with no bound arguments added nothing.
    submit_button.click(
        fn=update_results,
        inputs=[text_area, number_to_display],
        outputs=[response_display],
    )

demo.launch(max_threads=40)
41
+
42
+
43
+
44
+
45
+
46
+
data.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
from pathlib import Path

import pandas as pd

# Hadith table with precomputed embeddings, loaded once at import time.
# The path is resolved next to this module so the load does not depend on the
# process's current working directory.
# SECURITY: unpickling executes arbitrary code; only ever load the pickle that
# ships with this repository, never a user-supplied file.
data = pd.read_pickle(Path(__file__).resolve().parent / "pickle_ebd.pkl")
get_hadiths.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import numpy as np
4
+ from openai import OpenAI
5
+ from data import data as df
6
+
7
class HadithSearch:
    """Semantic search over a table of hadiths with precomputed embeddings.

    A query is first condensed into search topics by a chat model, the topics
    are embedded, and every row of the table is ranked by cosine similarity
    between its stored embedding and the query embedding.
    """

    def __init__(self, api_key):
        """Create the OpenAI client and attach the module-level hadith table.

        Args:
            api_key: API key forwarded to ``OpenAI``.
        """
        self.client = OpenAI(api_key=api_key)
        self.data = df

    def _cosine_similarity(self, a, b):
        """Return the cosine similarity between vectors ``a`` and ``b``.

        Returns 0.0 when either vector has zero norm, rather than dividing by
        zero and propagating NaN into the ranking.
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return np.dot(a, b) / denominator

    def _get_embedding(self, text, model="text-embedding-ada-002"):
        """Turn a free-text situation into an embedding vector.

        The text is sent to a chat model that rewrites it as a short list of
        search topics; that list (not the raw text) is then embedded.

        Args:
            text: The user's description. Newlines are flattened to spaces
                when it is a string; any other value is passed on unchanged.
            model: Name of the embedding model to use.

        Returns:
            The embedding of the generated topic list, as returned by the
            embeddings endpoint.
        """
        if isinstance(text, str):
            text = text.replace("\n", " ")

        completion = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "Your task is to transform a described situation into a list of the top 3 most important things to look for in a database of Islamic hadith that could be helpful to bring answers. \n\nIt should be very specific and formatted with only the list and remove all occurences of the word 'Hadiths', just the topics sought. JSON FORMAT!\n\nThe goal is to use this list to perform cosine similarity embedding search on the hadith database.",
                },
                {
                    "role": "user",
                    "content": text,
                },
            ],
            temperature=1,
            max_tokens=684,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        topics = completion.choices[0].message.content
        return self.client.embeddings.create(input=f"{topics}", model=model).data[0].embedding

    def search_hadiths(self, user_input, num_hadiths=10):
        """Return the hadiths most similar to ``user_input`` as Markdown.

        Args:
            user_input: Free-text description of the situation or topic.
            num_hadiths: Number of hadiths to return. Values that cannot be
                converted to an int (for example ``None`` from a cleared
                input field) fall back to 10; negative values are treated
                as 0.

        Returns:
            A Markdown string with one section per hadith, best match first.

        Raises:
            ValueError: If no data table is attached to the instance.
        """
        if self.data is None:
            raise ValueError("Data not loaded.")

        try:
            count = max(int(num_hadiths), 0)
        except (TypeError, ValueError):
            count = 10

        embedding_column_name = "embeddings"
        # Cells may be stored wrapped as {"embeding": vector}; unwrap them once.
        # After the first successful unwrap the cells are plain vectors, the
        # string lookup fails, and the column is deliberately left untouched.
        try:
            self.data[embedding_column_name] = self.data[embedding_column_name].apply(
                lambda cell: cell["embeding"]
            )
        except (TypeError, KeyError, IndexError):
            pass

        query_embedding = self._get_embedding(user_input, model="text-embedding-ada-002")

        # Scores stay in a local array instead of being written to a
        # "similarities" column on the shared table: one instance serves
        # concurrent requests, and a shared column lets one request overwrite
        # another's scores between scoring and sorting.
        scores = self.data[embedding_column_name].apply(
            lambda vector: self._cosine_similarity(vector, query_embedding)
        ).to_numpy(dtype=float)
        best_positions = np.argsort(-scores, kind="stable")[:count]

        top_rows = self.data.iloc[best_positions].assign(similarities=scores[best_positions])
        return self._format_results(top_rows.to_dict(orient="records"))

    def _format_results(self, results):
        """Render search results as a single Markdown string.

        Args:
            results: Iterable of row dicts with the keys ``source``,
                ``chapter``, ``chapter_no``, ``similarities``, ``chain_indx``,
                ``text_en`` and ``text_ar``.

        Returns:
            One Markdown section per result, in the given order, each followed
            by a separator line. Backticks are stripped from every section.
            An empty string when ``results`` is empty.
        """
        separator = "\n\n-----------------------------------------------------------------------------------------------------\n\n"
        sections = []
        for r in results:
            # NOTE(review): "hadith_no" is assumed to be the hadith-number
            # column of the source dataset — confirm against the pickle.
            # Rows without it keep printing the chapter number as before.
            hadith_number = r.get("hadith_no", r["chapter_no"])
            section = (
                f"### Source: {r['source']} | Chapter name : {r['chapter']}"
                f" | Chapter number: {r['chapter_no']} | Hadith number : {hadith_number}\n\n"
                f"Similarity with query: {round(r['similarities'] * 100, 2)}%"
                f" | Chain index: {r['chain_indx']}\n\n"
                f"### Hadith content:\n\n{r['text_en']}\n\n"
                f"Arabic version: \n\n{r['text_ar']}"
                f"{separator}"
            )
            sections.append(section.replace("`", ""))
        # Join every section; previously the accumulator was reset on each
        # iteration, so at most one hadith was ever returned.
        return "".join(sections)
67
+
pickle_ebd.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0d207391f0488a92360e7f0365c51b0888a5281220d05602982e2027773f92b
3
+ size 502886347
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ openai
2
+ gradio
3
+ pandas
4
+ numpy