Deepak Yadav committed
Commit 7f98036 · 1 Parent(s): 7cf558f

replaced the LLM model with a GGUF-format model

Files changed (4)
  1. .gitignore +2 -0
  2. app.py +23 -11
  3. requirements.txt +2 -1
  4. services/llm.py +14 -7
.gitignore CHANGED
@@ -1,4 +1,6 @@
  # Byte-compiled / optimized / DLL files
+ docs/
+ myenv/
  __pycache__/
  *.py[cod]
  *$py.class
app.py CHANGED
@@ -7,12 +7,12 @@ from services.pdf_processing import load_and_split_pdf
  from utils.helpers import extract_thoughts, response_generator
  import subprocess
 
- try:
-     print("🚀 Checking and starting Ollama...")
-     subprocess.run(["bash", "install_ollama.sh"], check=True)
-     print("✅ Ollama is running!")
- except subprocess.CalledProcessError as e:
-     print(f"❌ Error: {e}")
+ # try:
+ #     print("🚀 Checking and starting Ollama...")
+ #     subprocess.run(["bash", "install_ollama.sh"], check=True)
+ #     print("✅ Ollama is running!")
+ # except subprocess.CalledProcessError as e:
+ #     print(f"❌ Error: {e}")
 
 
  # Custom CSS for chat styling
@@ -70,7 +70,7 @@ st.sidebar.write("---")
  # Hyperparameters
  temperature = st.sidebar.slider("Temperature", 0.0, 1.0, 0.7, 0.1)
  top_p = st.sidebar.slider("Top-p (Nucleus Sampling)", 0.0, 1.0, 0.9, 0.05)
- max_tokens = st.sidebar.number_input("Max Tokens", 10, 2048, 256, 10)
+ max_tokens = st.sidebar.number_input("Max Tokens", 10, 2048, 1024, 10)
  st.sidebar.write("---")
 
  # File Upload
@@ -111,12 +111,15 @@ if "messages" not in st.session_state:
 
  # Display previous messages
  for message in st.session_state.messages:
+     if message['thinking_part']:
+         with st.expander("💭 Thought Process"):
+             st.markdown(message['thinking_part'])
      with st.chat_message(message["role"]):
          st.markdown(message["content"])
 
  # Chat Input
  if user_input := st.chat_input("💬 Ask something..."):
-     st.session_state.messages.append({"role": "user", "content": user_input})
+     st.session_state.messages.append({"role": "user", "content": user_input, "thinking_part": False})
 
      with st.chat_message("user"):
          st.markdown(user_input)
@@ -127,13 +130,22 @@ if user_input := st.chat_input("💬 Ask something..."):
      # Generate response
      context = retrive_vector_store(retriever, user_input) if retriever else "No context"
      query = generate_prompt(context=context, question=user_input)
-     response = llm.invoke(query)
+     # response = llm.invoke(query)
+
+     response = llm.create_chat_completion(
+         messages=[
+             {
+                 "role": "user",
+                 "content": f"{query}"
+             }
+         ]
+     )
 
      # Calculate response time
      response_time = round(time.time() - start_time, 2)
 
      # Extract thoughts and main answer
-     thinking_part, main_answer = extract_thoughts(response)
+     thinking_part, main_answer = extract_thoughts(response['choices'][0]['message']['content'])
 
      # Display AI response
      with st.chat_message("assistant"):
@@ -150,4 +162,4 @@ if user_input := st.chat_input("💬 Ask something..."):
          st.markdown(formatted_response, unsafe_allow_html=True)
 
          # Save to session history
-         st.session_state.messages.append({"role": "assistant", "content": formatted_response})
+         st.session_state.messages.append({"role": "assistant", "content": formatted_response, "thinking_part": thinking_part})
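
Note: create_chat_completion returns an OpenAI-style response dict, which is why the answer is now read from response['choices'][0]['message']['content']. With this change the sidebar temperature, top_p, and max_tokens values are no longer applied at generation time (they only influence loading via n_ctx in services/llm.py). A minimal sketch of a possible follow-up, assuming llm is the llama_cpp.Llama instance returned by initialize_llm, would be to forward them as sampling parameters:

    # Hypothetical variant of the call above: pass the sidebar hyperparameters
    # through to llama-cpp-python's chat completion API at generation time.
    response = llm.create_chat_completion(
        messages=[{"role": "user", "content": query}],
        temperature=temperature,  # sampling randomness
        top_p=top_p,              # nucleus sampling cutoff
        max_tokens=max_tokens,    # cap on generated tokens for this reply
    )
    answer_text = response["choices"][0]["message"]["content"]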
requirements.txt CHANGED
@@ -9,4 +9,5 @@ faiss-cpu
  pymupdf
  ollama
  langchain_ollama
- langchain_huggingface
+ langchain_huggingface
+ llama-cpp-python
services/llm.py CHANGED
@@ -1,17 +1,24 @@
  from langchain_ollama import OllamaLLM
+ from llama_cpp import Llama
  from langchain_huggingface import HuggingFaceEmbeddings
  import streamlit as st
 
 
  @st.cache_resource
  def initialize_llm(model_name, temperature, top_p, max_tokens):
-     # Configure the LLM with additional parameters
-     llm = OllamaLLM(
-         model=model_name,
-         base_url="https://deepak7376-ollama-server.hf.space",
-         temperature=temperature, # Controls randomness (0 = deterministic, 1 = max randomness)
-         max_tokens=max_tokens, # Limit the number of tokens in the output
-         top_p=top_p # Nucleus sampling for controlling diversity
+     # # Configure the LLM with additional parameters
+     # llm = OllamaLLM(
+     #     model=model_name,
+     #     base_url="https://deepak7376-ollama-server.hf.space",
+     #     temperature=temperature, # Controls randomness (0 = deterministic, 1 = max randomness)
+     #     max_tokens=max_tokens, # Limit the number of tokens in the output
+     #     top_p=top_p # Nucleus sampling for controlling diversity
+     # )
+     llm = Llama.from_pretrained(
+         repo_id="bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
+         filename="DeepSeek-R1-Distill-Qwen-1.5B-IQ4_XS.gguf",
+         n_ctx=max_tokens
      )
      return llm
 
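
Note: Llama.from_pretrained downloads the GGUF file from the Hugging Face Hub on first use (it needs the huggingface_hub package installed), and n_ctx sets the model's context window (prompt plus generation) rather than a per-response token limit, so reusing max_tokens here is a slight misnomer. A minimal sketch of an equivalent load from disk, assuming the same GGUF file has already been downloaded to a hypothetical models/ directory:

    from llama_cpp import Llama

    # Load the quantized GGUF directly from a local path instead of the Hub.
    llm = Llama(
        model_path="models/DeepSeek-R1-Distill-Qwen-1.5B-IQ4_XS.gguf",  # hypothetical local path
        n_ctx=2048,  # context window size, not a cap on generated tokens
    )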