Spaces:

danishjameel003
/

CSSChatbot

Sleeping

App Files Files Community

CSSChatbot / app.py

danishjameel003

Update app.py

8e786ac verified 6 months ago

raw

history blame

6.06 kB

	import os
	import torch
	import streamlit as st
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain_core.prompts import PromptTemplate
	from langchain.chains import LLMChain
	from langchain_community.llms import HuggingFacePipeline
	from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
	from dotenv import load_dotenv

	# Configure the browser tab title, icon and wide layout. This is the first
	# Streamlit call made by the script, ahead of every other st.* use below.
	st.set_page_config(page_title="Chat with Notes and AI", page_icon=":books:", layout="wide")

	# Load variables from a .env file into the process environment (python-dotenv).
	# NOTE(review): nothing in this file reads an environment variable directly;
	# presumably this is for tokens consumed by the Hugging Face libraries - confirm.
	load_dotenv()

	# Dolly-v2 text-generation pipeline, built once per server process
	@st.cache_resource
	def load_pipeline():
	    """Load the tokenizer and causal LM and wrap them in a generation pipeline.

	    The function is decorated with ``st.cache_resource``, so Streamlit reuses
	    the returned pipeline instead of rebuilding it on each script rerun.

	    Returns:
	        A ``transformers`` text-generation pipeline limited to 50 new tokens
	        that returns only the generated continuation (not the prompt).
	    """
	    # NOTE(review): confirm this checkpoint id exists on the Hugging Face Hub;
	    # the published Dolly v2 sizes appear to be 3b / 7b / 12b.
	    checkpoint = "databricks/dolly-v2-1b"

	    tok = AutoTokenizer.from_pretrained(
	        checkpoint,
	        padding_side="left",
	        trust_remote_code=True,
	    )

	    # float32 weights for CPU execution; weights that cannot be placed are
	    # offloaded to the given folder.
	    model_options = {
	        "torch_dtype": torch.float32,
	        "device_map": "auto",
	        "trust_remote_code": True,
	        "offload_folder": "./offload_weights",
	    }
	    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint, **model_options)

	    # Short completions keep latency down.
	    text_generator = pipeline(
	        task="text-generation",
	        model=causal_lm,
	        tokenizer=tok,
	        max_new_tokens=50,
	        return_full_text=False,
	        device_map="auto",
	    )
	    return text_generator

	# Build (or fetch from cache) the generation pipeline and expose it to LangChain.
	generate_text = load_pipeline()
	hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

	# Prompt used when no notes context is available: the question is sent as-is.
	prompt = PromptTemplate(
	    template="{instruction}",
	    input_variables=["instruction"],
	)

	# Prompt used when retrieved notes are available: question followed by the context.
	prompt_with_context = PromptTemplate(
	    template="{instruction}\n\nInput:\n{context}",
	    input_variables=["instruction", "context"],
	)

	# One chain per prompt, both backed by the same LLM wrapper.
	llm_chain = LLMChain(prompt=prompt, llm=hf_pipeline)
	llm_context_chain = LLMChain(prompt=prompt_with_context, llm=hf_pipeline)

	# Extract text from .txt files
	def get_text_files_content(folder):
	    """Concatenate the contents of every ``.txt`` file directly inside *folder*.

	    Files are read as UTF-8 in sorted filename order so the result does not
	    depend on the order in which the filesystem lists entries. Each file's
	    text is followed by a single newline. Entries whose name ends in ``.txt``
	    but which are not regular files (e.g. a directory) are skipped.

	    Args:
	        folder: Path of the directory to scan (not recursive).

	    Returns:
	        The combined text, or ``""`` when the folder holds no ``.txt`` files.

	    Raises:
	        OSError: If *folder* cannot be listed or a file cannot be opened.
	        UnicodeDecodeError: If a file is not valid UTF-8.
	    """
	    parts = []
	    for filename in sorted(os.listdir(folder)):
	        path = os.path.join(folder, filename)
	        if not filename.endswith('.txt') or not os.path.isfile(path):
	            continue
	        with open(path, 'r', encoding='utf-8') as file:
	            parts.append(file.read() + "\n")
	    # Join once at the end instead of growing a string inside the loop.
	    return "".join(parts)

	# Split raw notes text into small overlapping chunks
	def get_chunks(raw_text):
	    """Split *raw_text* on newlines into chunks for embedding.

	    Chunks target at most 512 characters (measured with ``len``) and overlap
	    by 50 characters.

	    Args:
	        raw_text: The full text to split.

	    Returns:
	        The list of chunk strings produced by ``CharacterTextSplitter``.
	    """
	    from langchain.text_splitter import CharacterTextSplitter

	    splitter_options = dict(
	        separator="\n",
	        chunk_size=512,
	        chunk_overlap=50,
	        length_function=len,
	    )
	    return CharacterTextSplitter(**splitter_options).split_text(raw_text)

	# Build a FAISS index over the text chunks
	def get_vectorstore(chunks):
	    """Embed *chunks* and return a FAISS vector store over them.

	    Embeddings come from ``sentence-transformers/all-MiniLM-L6-v2`` and are
	    computed on the CPU.

	    Args:
	        chunks: List of text chunks to index.

	    Returns:
	        The FAISS vector store created by ``FAISS.from_texts``.
	    """
	    embedder = HuggingFaceEmbeddings(
	        model_kwargs={'device': 'cpu'},
	        model_name="sentence-transformers/all-MiniLM-L6-v2",
	    )
	    index = FAISS.from_texts(embedding=embedder, texts=chunks)
	    return index

	# Answer a user question, using retrieved notes when available
	def handle_question(question, vectorstore=None):
	    """Run *question* through the LLM, with retrieved context when possible.

	    When a vector store is supplied, the single most similar chunk is fetched
	    and truncated to 512 characters; if that yields any text, the
	    context-aware chain is used. Otherwise the instruction-only chain runs.

	    Args:
	        question: The user's question.
	        vectorstore: Optional vector store exposing ``similarity_search``.

	    Returns:
	        Whatever the selected chain's ``invoke`` returns.
	    """
	    context = ""
	    if vectorstore:
	        hits = vectorstore.similarity_search(question, k=1)
	        context = "\n".join(doc.page_content for doc in hits)[:512]

	    # No store, or nothing retrieved: fall back to the plain prompt.
	    if not context:
	        return llm_chain.invoke({"instruction": question})

	    return llm_context_chain.invoke({"instruction": question, "context": context})

	def main():
	    """Render the app: pick a subject, preview its notes, and answer questions.

	    The sidebar selects a content type ("Current Affairs" reads every ``.txt``
	    file in ``data/<subject>/``; "Essays" reads ``essays/<subject>.txt``). The
	    chosen notes are previewed, indexed into a vector store kept in
	    ``st.session_state``, and used as retrieval context for the question box.

	    The index is rebuilt whenever the (content type, subject) selection
	    changes, and cleared when the selection has no content, so answers never
	    come from a previously selected subject.
	    """
	    st.title("Chat with Notes :books:")

	    # Initialize session state
	    if "vectorstore" not in st.session_state:
	        st.session_state.vectorstore = None
	    # Remembers which selection the cached vector store was built from.
	    if "vectorstore_source" not in st.session_state:
	        st.session_state.vectorstore_source = None

	    # Define folders for Current Affairs and Essays
	    data_folder = "data"  # Current Affairs folders
	    essay_folder = "essays"  # Essays folder
	    essay_suffix = ".txt"

	    # Sidebar for content selection
	    content_type = st.sidebar.radio("Select Content Type:", ["Current Affairs", "Essays"])

	    # Handle folder-based selection
	    subjects = []
	    if content_type == "Current Affairs":
	        if os.path.exists(data_folder):
	            subjects = [
	                f for f in os.listdir(data_folder)
	                if os.path.isdir(os.path.join(data_folder, f))
	            ]
	    elif content_type == "Essays":
	        if os.path.exists(essay_folder):
	            # Strip only the trailing extension so that appending it again
	            # below always reproduces the real filename.
	            subjects = [
	                f[:-len(essay_suffix)] for f in os.listdir(essay_folder)
	                if f.endswith(essay_suffix)
	            ]

	    selected_subject = st.sidebar.selectbox("Select a Subject:", subjects)

	    # Process the selected subject
	    raw_text = ""
	    if content_type == "Current Affairs" and selected_subject:
	        subject_folder = os.path.join(data_folder, selected_subject)
	        raw_text = get_text_files_content(subject_folder)
	    elif content_type == "Essays" and selected_subject:
	        subject_file = os.path.join(essay_folder, selected_subject + essay_suffix)
	        if os.path.exists(subject_file):
	            with open(subject_file, "r", encoding="utf-8") as file:
	                raw_text = file.read()

	    # Display preview of notes and load vectorstore
	    source_key = (content_type, selected_subject)
	    if raw_text:
	        st.subheader("Preview of Notes")
	        st.text_area("Preview Content:", value=raw_text[:1000], height=300, disabled=True)  # Display shorter preview

	        # (Re)build the index when none is cached or the selection changed.
	        stale = st.session_state.vectorstore_source != source_key
	        if st.session_state.vectorstore is None or stale:
	            text_chunks = get_chunks(raw_text)
	            if text_chunks:
	                st.session_state.vectorstore = get_vectorstore(text_chunks)
	                st.session_state.vectorstore_source = source_key
	            else:
	                # Nothing to index (e.g. whitespace-only notes).
	                st.session_state.vectorstore = None
	                st.session_state.vectorstore_source = None
	    else:
	        # Drop any index left over from a previous selection.
	        st.session_state.vectorstore = None
	        st.session_state.vectorstore_source = None
	        st.warning("No content available for the selected subject.")

	    # Chat interface
	    st.subheader("Ask Your Question")
	    question = st.text_input("Ask a question about your selected subject:")
	    if question:
	        if st.session_state.vectorstore:
	            response = handle_question(question, st.session_state.vectorstore)
	            st.subheader("Answer:")
	            st.write(response.get("text", "No response found."))
	        else:
	            st.warning("Please load the content for the selected subject before asking a question.")

	# Launch the app only when this file is run as the main script.
	if __name__ == '__main__':
	    main()