mohamedashraf11 committed on
Commit b2718af · verified · 1 Parent(s): 7c6d871

Update app.py

Files changed (1):
  1. app.py +27 -36

app.py CHANGED
@@ -1,14 +1,17 @@
-# Necessary imports
-from langchain.vectorstores import Chroma
-from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import HuggingFaceHub
+from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.prompts import PromptTemplate
 from langchain.chains.question_answering import load_qa_chain
 from datasets import load_dataset
 import pandas as pd
 from functools import lru_cache
-from huggingface_hub import InferenceClient
 import gradio as gr
+from huggingface_hub import InferenceClient
+
+# Ensure you have set your Hugging Face API token here or as an environment variable
+HUGGINGFACEHUB_API_TOKEN = "your_huggingface_api_token"  # Replace with your actual Hugging Face token
 
 # Initialize the Hugging Face Inference Client
 client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
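A note on the token added above: committing a literal token to app.py is risky on a public Space. A minimal sketch of reading it from the environment instead (the secret name HF_TOKEN is an assumption; any name configured in the Space settings works):

```python
import os

# Read the token from an environment variable / Space secret instead of
# hardcoding it in the source (HF_TOKEN is a hypothetical secret name).
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HF_TOKEN")
if not HUGGINGFACEHUB_API_TOKEN:
    raise RuntimeError("Set the HF_TOKEN environment variable, e.g. as a Space secret.")
```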
@@ -17,34 +20,26 @@ client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 dataset = load_dataset('arbml/LK_Hadith')
 df = pd.DataFrame(dataset['train'])
 
-# Filter data (Only retain Hadiths with non-weak grades)
+# Filter data
 filtered_df = df[df['Arabic_Grade'] != 'ضعيف']
 documents = list(filtered_df['Arabic_Matn'])
 metadatas = [{"Hadith_Grade": grade} for grade in filtered_df['Arabic_Grade']]
 
-# Text splitter (using a smaller chunk size for memory efficiency)
-text_splitter = CharacterTextSplitter(chunk_size=1000)
+# Use CharacterTextSplitter
+text_splitter = CharacterTextSplitter(chunk_size=10000)
 nltk_chunks = text_splitter.create_documents(documents, metadatas=metadatas)
 
-# LLM (Replace Ollama with a Hugging Face Hub model)
-from langchain.llms import HuggingFaceHub
-llm = HuggingFaceHub(repo_id="salmatrafi/acegpt:7b")
+# LLM - Using HuggingFaceHub with API token
+llm = HuggingFaceHub(repo_id="salmatrafi/acegpt:7b", huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN)
 
-# Create an embedding model (Hugging Face transformer model for embeddings)
-embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
+# Create an embedding model
+embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base", huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN)
 
-# Generate document embeddings
 docs_text = [doc.page_content for doc in nltk_chunks]
-try:
-    docs_embedding = embeddings.embed_documents(docs_text)
-except Exception as e:
-    print(f"Error in embedding generation: {str(e)}")
+docs_embedding = embeddings.embed_documents(docs_text)
 
-# Create Chroma vector store with embeddings
-try:
-    vector_store = Chroma.from_documents(nltk_chunks, embedding=embeddings)
-except Exception as e:
-    print(f"Error in creating vector store: {str(e)}")
+# Create Chroma vector store
+vector_store = Chroma.from_documents(nltk_chunks, embedding=embeddings)
 
 # Question answering prompt template
 qna_template = "\n".join([
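Two observations on the hunk above. The new code computes `docs_embedding` but never uses it: in LangChain, `Chroma.from_documents` calls the embedding model itself, so the chunks are embedded inside that call and the separate `embed_documents` pass appears redundant. Also, this `HuggingFaceEmbeddings` class runs the model locally via sentence-transformers; to my knowledge it does not accept a `huggingfacehub_api_token` argument (the token-authenticated variant is `HuggingFaceHubEmbeddings`), so that keyword may be rejected at construction time. A minimal retrieval sketch against the store, assuming the `vector_store` defined above (the query string is illustrative only):

```python
# Query the Chroma store built from the Hadith chunks; similarity_search
# embeds the query with the same model and returns the k nearest chunks.
query = "illustrative user question"  # hypothetical query
top_docs = vector_store.similarity_search(query, k=3)
for doc in top_docs:
    print(doc.metadata["Hadith_Grade"], doc.page_content[:80])
```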
@@ -141,19 +136,16 @@ def respond(
 
     response = ""
 
-    try:
-        for msg in client.chat_completion(
-            messages,
-            max_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            token = msg.choices[0].delta.content
-            response += token
-            yield response
-    except Exception as e:
-        yield f"An error occurred during chat completion: {str(e)}"
+    for msg in client.chat_completion(
+        messages,
+        max_tokens=max_tokens,
+        stream=True,
+        temperature=temperature,
+        top_p=top_p,
+    ):
+        token = msg.choices[0].delta.content
+        response += token
+        yield response
 
 # Gradio Chat Interface
 demo = gr.ChatInterface(
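This hunk drops the try/except around the streaming loop. One failure mode worth guarding even without the broad except: with `stream=True`, the final chunk's `delta.content` can be `None` on some backends, which makes `response += token` raise a `TypeError`. A self-contained sketch of a defensive version of the same loop:

```python
from huggingface_hub import InferenceClient

client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
messages = [{"role": "user", "content": "Hello"}]  # illustrative message list

response = ""
for msg in client.chat_completion(messages, max_tokens=64, stream=True):
    token = msg.choices[0].delta.content or ""  # final chunk may carry None
    response += token
print(response)
```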
@@ -172,6 +164,5 @@ demo = gr.ChatInterface(
     ],
 )
 
-# Launch the Gradio interface
 if __name__ == "__main__":
     demo.launch()
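The body of `gr.ChatInterface(...)` is elided in this hunk. For reference, a minimal self-contained sketch of a ChatInterface wired to a streaming handler with slider inputs; the handler name, slider ranges, defaults, and labels here are assumptions, not the Space's actual values:

```python
import gradio as gr

def respond_stub(message, history, max_tokens, temperature, top_p):
    # Placeholder with the same signature shape as the app's respond();
    # a real handler would stream tokens from the model here.
    yield f"echo (max_tokens={max_tokens}): {message}"

demo = gr.ChatInterface(
    respond_stub,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```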
 
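On the `chunk_size=1000` to `chunk_size=10000` change: `CharacterTextSplitter` splits on a separator (`"\n\n"` by default) and then packs the pieces into chunks up to `chunk_size`, so a piece with no separator inside it is kept whole even past the limit; the setting is a soft cap, not a hard truncation. A small self-contained sketch of that packing behavior:

```python
from langchain.text_splitter import CharacterTextSplitter

# Split on spaces and pack words into chunks of at most ~100 characters;
# a single piece longer than chunk_size is kept whole (with a warning).
splitter = CharacterTextSplitter(separator=" ", chunk_size=100, chunk_overlap=0)
chunks = splitter.split_text("word " * 200)
print(len(chunks), max(len(c) for c in chunks))
```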