Deepak Yadav committed
Commit 7f98036 · 1 Parent(s): 7cf558f

replaced the LLM model with a GGUF-format model

Files changed (4)
  1. .gitignore +2 -0
  2. app.py +23 -11
  3. requirements.txt +2 -1
  4. services/llm.py +14 -7
.gitignore CHANGED
@@ -1,4 +1,6 @@
  # Byte-compiled / optimized / DLL files
+ docs/
+ myenv/
  __pycache__/
  *.py[cod]
  *$py.class
app.py CHANGED
@@ -7,12 +7,12 @@ from services.pdf_processing import load_and_split_pdf
  from utils.helpers import extract_thoughts, response_generator
  import subprocess
 
- try:
-     print("🚀 Checking and starting Ollama...")
-     subprocess.run(["bash", "install_ollama.sh"], check=True)
-     print("✅ Ollama is running!")
- except subprocess.CalledProcessError as e:
-     print(f"❌ Error: {e}")
+ # try:
+ #     print("🚀 Checking and starting Ollama...")
+ #     subprocess.run(["bash", "install_ollama.sh"], check=True)
+ #     print("✅ Ollama is running!")
+ # except subprocess.CalledProcessError as e:
+ #     print(f"❌ Error: {e}")
 
 
  # Custom CSS for chat styling
@@ -70,7 +70,7 @@ st.sidebar.write("---")
  # Hyperparameters
  temperature = st.sidebar.slider("Temperature", 0.0, 1.0, 0.7, 0.1)
  top_p = st.sidebar.slider("Top-p (Nucleus Sampling)", 0.0, 1.0, 0.9, 0.05)
- max_tokens = st.sidebar.number_input("Max Tokens", 10, 2048, 256, 10)
+ max_tokens = st.sidebar.number_input("Max Tokens", 10, 2048, 1024, 10)
  st.sidebar.write("---")
 
  # File Upload
@@ -111,12 +111,15 @@ if "messages" not in st.session_state:
 
  # Display previous messages
  for message in st.session_state.messages:
+     if message['thinking_part']:
+         with st.expander("💭 Thought Process"):
+             st.markdown(message['thinking_part'])
      with st.chat_message(message["role"]):
          st.markdown(message["content"])
 
  # Chat Input
  if user_input := st.chat_input("💬 Ask something..."):
-     st.session_state.messages.append({"role": "user", "content": user_input})
+     st.session_state.messages.append({"role": "user", "content": user_input, "thinking_part": False})
 
      with st.chat_message("user"):
          st.markdown(user_input)
@@ -127,13 +130,22 @@ if user_input := st.chat_input("💬 Ask something..."):
      # Generate response
      context = retrive_vector_store(retriever, user_input) if retriever else "No context"
      query = generate_prompt(context=context, question=user_input)
-     response = llm.invoke(query)
+     # response = llm.invoke(query)
+
+     response = llm.create_chat_completion(
+         messages=[
+             {
+                 "role": "user",
+                 "content": f"{query}"
+             }
+         ]
+     )
 
      # Calculate response time
      response_time = round(time.time() - start_time, 2)
 
      # Extract thoughts and main answer
-     thinking_part, main_answer = extract_thoughts(response)
+     thinking_part, main_answer = extract_thoughts(response['choices'][0]['message']['content'])
 
      # Display AI response
      with st.chat_message("assistant"):
@@ -150,4 +162,4 @@ if user_input := st.chat_input("💬 Ask something..."):
          st.markdown(formatted_response, unsafe_allow_html=True)
 
          # Save to session history
-         st.session_state.messages.append({"role": "assistant", "content": formatted_response})
+         st.session_state.messages.append({"role": "assistant", "content": formatted_response, "thinking_part": thinking_part})
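
Note: create_chat_completion returns an OpenAI-style response dict, which is why the answer is now read from response['choices'][0]['message']['content']. With this change the sidebar temperature, top_p, and max_tokens values are no longer applied at generation time (they only influence loading via n_ctx in services/llm.py). A minimal sketch of a possible follow-up, assuming llm is the llama_cpp.Llama instance returned by initialize_llm, would be to forward them as sampling parameters:

    # Hypothetical variant of the call above: pass the sidebar hyperparameters
    # through to llama-cpp-python's chat completion API at generation time.
    response = llm.create_chat_completion(
        messages=[{"role": "user", "content": query}],
        temperature=temperature,  # sampling randomness
        top_p=top_p,              # nucleus sampling cutoff
        max_tokens=max_tokens,    # cap on generated tokens for this reply
    )
    answer_text = response["choices"][0]["message"]["content"]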
requirements.txt CHANGED
@@ -9,4 +9,5 @@ faiss-cpu
  pymupdf
  ollama
  langchain_ollama
- langchain_huggingface
+ langchain_huggingface
+ llama-cpp-python
services/llm.py CHANGED
@@ -1,17 +1,24 @@
  from langchain_ollama import OllamaLLM
+ from llama_cpp import Llama
  from langchain_huggingface import HuggingFaceEmbeddings
  import streamlit as st
 
 
  @st.cache_resource
  def initialize_llm(model_name, temperature, top_p, max_tokens):
-     # Configure the LLM with additional parameters
-     llm = OllamaLLM(
-         model=model_name,
-         base_url="https://deepak7376-ollama-server.hf.space",
-         temperature=temperature, # Controls randomness (0 = deterministic, 1 = max randomness)
-         max_tokens=max_tokens, # Limit the number of tokens in the output
-         top_p=top_p # Nucleus sampling for controlling diversity
+     # # Configure the LLM with additional parameters
+     # llm = OllamaLLM(
+     #     model=model_name,
+     #     base_url="https://deepak7376-ollama-server.hf.space",
+     #     temperature=temperature, # Controls randomness (0 = deterministic, 1 = max randomness)
+     #     max_tokens=max_tokens, # Limit the number of tokens in the output
+     #     top_p=top_p # Nucleus sampling for controlling diversity
+     # )
+     llm = Llama.from_pretrained(
+         repo_id="bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF",
+         filename="DeepSeek-R1-Distill-Qwen-1.5B-IQ4_XS.gguf",
+         n_ctx=max_tokens
      )
      return llm
 
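
Note: Llama.from_pretrained downloads the GGUF file from the Hugging Face Hub on first use (it needs the huggingface_hub package installed), and n_ctx sets the model's context window (prompt plus generation) rather than a per-response token limit, so reusing max_tokens here is a slight misnomer. A minimal sketch of an equivalent load from disk, assuming the same GGUF file has already been downloaded to a hypothetical models/ directory:

    from llama_cpp import Llama

    # Load the quantized GGUF directly from a local path instead of the Hub.
    llm = Llama(
        model_path="models/DeepSeek-R1-Distill-Qwen-1.5B-IQ4_XS.gguf",  # hypothetical local path
        n_ctx=2048,  # context window size, not a cap on generated tokens
    )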