danishjameel003 committed (verified)
Commit 3455401 · Parent(s): 14e71a0

Update app.py

Files changed (1): app.py (+72 -51)
app.py CHANGED
@@ -15,50 +15,56 @@ st.set_page_config(page_title="Chat with Notes and AI", page_icon=":books:", lay
  # Load environment variables
  load_dotenv()

- # Optimized pipeline setup
+ # Dolly-v2-3b model pipeline
  @st.cache_resource
  def load_pipeline():
-     # Use a smaller model for faster performance
-     model_name = "databricks/dolly-v2-1b" # Switch to a lighter model
-
-     # Load tokenizer and model
+     model_name = "databricks/dolly-v2-3b"
+
+     # Load tokenizer
      tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True)
+
+     # Load model with offload folder for disk storage of weights
      model = AutoModelForCausalLM.from_pretrained(
          model_name,
-         torch_dtype=torch.float32, # Use float32 for CPU compatibility
-         device_map="auto", # Automatically map devices
-         trust_remote_code=True
+         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, # Use float16 for GPU, float32 for CPU
+         device_map="auto", # Automatically map model to available devices (e.g., GPU if available)
+         trust_remote_code=True,
+         offload_folder="./offload_weights" # Folder to store offloaded weights
      )

-     # Return text-generation pipeline with full-text output
+     # Return text-generation pipeline
      return pipeline(
          task="text-generation",
          model=model,
          tokenizer=tokenizer,
-         torch_dtype=torch.float32, # Ensure compatibility with CPU
+         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
          device_map="auto",
-         return_full_text=True,
-         max_new_tokens=100 # Limit response length
+         return_full_text=True
      )

- # Initialize pipeline
+ # Initialize Dolly pipeline
  generate_text = load_pipeline()

- # LangChain Integration
+ # Create a HuggingFace pipeline wrapper for LangChain
  hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

- # Templates for prompts
- prompt = PromptTemplate(input_variables=["instruction"], template="{instruction}")
+ # Template for instruction-only prompts
+ prompt = PromptTemplate(
+     input_variables=["instruction"],
+     template="{instruction}"
+ )
+
+ # Template for prompts with context
  prompt_with_context = PromptTemplate(
      input_variables=["instruction", "context"],
      template="{instruction}\n\nInput:\n{context}"
  )

- # LangChain LLM chains
+ # Create LLM chains
  llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt)
  llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)

- # Extract content from .txt files
+ # Extracting text from .txt files
  def get_text_files_content(folder):
      text = ""
      for filename in os.listdir(folder):
@@ -67,92 +73,107 @@ def get_text_files_content(folder):
                  text += file.read() + "\n"
      return text

- # Convert text into chunks for vectorization
+ # Converting text to chunks
  def get_chunks(raw_text):
      from langchain.text_splitter import CharacterTextSplitter
      text_splitter = CharacterTextSplitter(
          separator="\n",
-         chunk_size=500, # Smaller chunks for faster processing
-         chunk_overlap=50 # Minimal overlap
+         chunk_size=1000, # Reduced chunk size for faster processing
+         chunk_overlap=200, # Smaller overlap for efficiency
+         length_function=len
      )
-     return text_splitter.split_text(raw_text)
+     chunks = text_splitter.split_text(raw_text)
+     return chunks

- # Create FAISS vectorstore for embeddings
+ # Using Hugging Face embeddings model and FAISS to create vectorstore
  def get_vectorstore(chunks):
      embeddings = HuggingFaceEmbeddings(
-         model_name="sentence-transformers/all-MiniLM-L6-v2", # Lightweight embeddings
-         model_kwargs={'device': 'cpu'} # Ensure embeddings run on CPU
+         model_name="sentence-transformers/all-MiniLM-L6-v2",
+         model_kwargs={'device': 'cpu'} # Ensure embeddings use CPU
      )
-     return FAISS.from_texts(texts=chunks, embedding=embeddings)
+     vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
+     return vectorstore

- # Handle user queries
+ # Generating response from user queries
  def handle_question(question, vectorstore=None):
      if vectorstore:
-         # Retrieve the most relevant chunk
-         documents = vectorstore.similarity_search(question, k=1) # Retrieve fewer chunks
-         context = "\n".join([doc.page_content for doc in documents])[:500] # Short context for efficiency
+         # Reduce the number of retrieved chunks for faster processing
+         documents = vectorstore.similarity_search(question, k=2)
+         context = "\n".join([doc.page_content for doc in documents])
+
+         # Limit context to 1000 characters to speed up model inference
+         context = context[:1000]

          if context:
-             return llm_context_chain.predict(instruction=question, context=context).strip()
+             result_with_context = llm_context_chain.invoke({"instruction": question, "context": context})
+             return result_with_context

-     # Fallback to instruction-only chain if no context
-     return llm_chain.predict(instruction=question).strip()
+     # Fallback to instruction-only chain if no context is found
+     return llm_chain.invoke({"instruction": question})

  def main():
      st.title("Chat with Notes :books:")

-     # Session state for vectorstore
+     # Initialize session state
      if "vectorstore" not in st.session_state:
          st.session_state.vectorstore = None

-     # Data folders
-     data_folder = "data" # Folder for Current Affairs
-     essay_folder = "essays" # Folder for Essays
+     # Define folders for Current Affairs and Essays
+     data_folder = "data" # Current Affairs folders
+     essay_folder = "essays" # Essays folder

      # Content type selection
      content_type = st.sidebar.radio("Select Content Type:", ["Current Affairs", "Essays"])

-     # Subjects based on content type
+     # Handle Current Affairs (each subject has its own folder)
      if content_type == "Current Affairs":
-         subjects = [f for f in os.listdir(data_folder) if os.path.isdir(os.path.join(data_folder, f))] if os.path.exists(data_folder) else []
-     else:
-         subjects = [f.replace(".txt", "") for f in os.listdir(essay_folder) if f.endswith('.txt')] if os.path.exists(essay_folder) else []
+         if os.path.exists(data_folder):
+             subjects = [f for f in os.listdir(data_folder) if os.path.isdir(os.path.join(data_folder, f))]
+         else:
+             subjects = []
+     # Handle Essays (all essays are in a single folder)
+     elif content_type == "Essays":
+         if os.path.exists(essay_folder):
+             subjects = [f.replace(".txt", "") for f in os.listdir(essay_folder) if f.endswith('.txt')]
+         else:
+             subjects = []

      # Subject selection
      selected_subject = st.sidebar.selectbox("Select a Subject:", subjects)

-     # Load content based on selection
+     # Process selected subject
      raw_text = ""
      if content_type == "Current Affairs" and selected_subject:
          subject_folder = os.path.join(data_folder, selected_subject)
          raw_text = get_text_files_content(subject_folder)
      elif content_type == "Essays" and selected_subject:
-         subject_file = os.path.join(essay_folder, f"{selected_subject}.txt")
+         subject_file = os.path.join(essay_folder, selected_subject + ".txt")
          if os.path.exists(subject_file):
              with open(subject_file, "r", encoding="utf-8") as file:
                  raw_text = file.read()

-     # Display preview and create vectorstore
+     # Display preview of notes
      if raw_text:
          st.subheader("Preview of Notes")
-         st.text_area("Preview Content:", value=raw_text[:1000], height=300, disabled=True)
+         st.text_area("Preview Content:", value=raw_text[:2000], height=300, disabled=True) # Show a snippet of the notes

-         if "vectorstore" not in st.session_state or st.session_state.vectorstore is None:
-             chunks = get_chunks(raw_text)
-             st.session_state.vectorstore = get_vectorstore(chunks)
+         # Create vectorstore for Current Affairs or Essays
+         text_chunks = get_chunks(raw_text)
+         vectorstore = get_vectorstore(text_chunks)
+         st.session_state.vectorstore = vectorstore
      else:
          st.warning("No content available for the selected subject.")

-     # Question and response
+     # Chat interface
      st.subheader("Ask Your Question")
      question = st.text_input("Ask a question about your selected subject:")
      if question:
          if st.session_state.vectorstore:
              response = handle_question(question, st.session_state.vectorstore)
              st.subheader("Answer:")
-             st.write(response or "No response found.")
+             st.write(response.get("text", "No response found."))
          else:
              st.warning("Please load the content for the selected subject before asking a question.")

- if __name__ == "__main__":
+ if __name__ == '__main__':
  main()
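
One change worth calling out: handle_question now calls LLMChain.invoke instead of LLMChain.predict, and the UI reads response.get("text", ...). In the LangChain versions that expose invoke, an LLMChain returns a dict containing the chain inputs plus the generated output under its output key ("text" by default), rather than a bare string. A minimal sketch of that calling pattern, reusing the llm_chain defined in app.py (the question string is illustrative):

    # Sketch only: llm_chain is the LLMChain(llm=hf_pipeline, prompt=prompt) built in app.py
    result = llm_chain.invoke({"instruction": "Summarise the selected notes in one sentence."})

    # invoke returns a dict such as {"instruction": "...", "text": "..."};
    # the generated answer lives under the chain's output key, "text" by default
    answer = result.get("text", "No response found.")
    print(answer)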