Update app.py
app.py
CHANGED
@@ -15,46 +15,50 @@ st.set_page_config(page_title="Chat with Notes and AI", page_icon=":books:", lay
 # Load environment variables
 load_dotenv()

-# Optimized
+# Optimized pipeline setup
 @st.cache_resource
 def load_pipeline():
-
-
+    # Use a smaller model for faster performance
+    model_name = "databricks/dolly-v2-1b" # Switch to a lighter model
+
     # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype=torch.float32, # Use float32 for CPU
-        device_map="auto",
-        trust_remote_code=True
-        offload_folder="./offload_weights" # Folder to store weights if needed
+        torch_dtype=torch.float32, # Use float32 for CPU compatibility
+        device_map="auto", # Automatically map devices
+        trust_remote_code=True
     )

-    #
+    # Return text-generation pipeline with full-text output
     return pipeline(
         task="text-generation",
         model=model,
         tokenizer=tokenizer,
-
-
-
+        torch_dtype=torch.float32, # Ensure compatibility with CPU
+        device_map="auto",
+        return_full_text=True,
+        max_new_tokens=100 # Limit response length
     )

-# Initialize
+# Initialize pipeline
 generate_text = load_pipeline()

-#
+# LangChain Integration
 hf_pipeline = HuggingFacePipeline(pipeline=generate_text)

-#
+# Templates for prompts
 prompt = PromptTemplate(input_variables=["instruction"], template="{instruction}")
-prompt_with_context = PromptTemplate(
+prompt_with_context = PromptTemplate(
+    input_variables=["instruction", "context"],
+    template="{instruction}\n\nInput:\n{context}"
+)

-#
+# LangChain LLM chains
 llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt)
 llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)

-# Extract
+# Extract content from .txt files
 def get_text_files_content(folder):
     text = ""
     for filename in os.listdir(folder):
@@ -63,93 +67,92 @@ def get_text_files_content(folder):
                 text += file.read() + "\n"
     return text

-# Convert text into
+# Convert text into chunks for vectorization
 def get_chunks(raw_text):
     from langchain.text_splitter import CharacterTextSplitter
     text_splitter = CharacterTextSplitter(
         separator="\n",
-        chunk_size=
-        chunk_overlap=50
-        length_function=len
+        chunk_size=500, # Smaller chunks for faster processing
+        chunk_overlap=50 # Minimal overlap
     )
     return text_splitter.split_text(raw_text)

-# Create FAISS vectorstore
+# Create FAISS vectorstore for embeddings
 def get_vectorstore(chunks):
     embeddings = HuggingFaceEmbeddings(
-        model_name="sentence-transformers/all-MiniLM-L6-v2",
-        model_kwargs={'device': 'cpu'} #
+        model_name="sentence-transformers/all-MiniLM-L6-v2", # Lightweight embeddings
+        model_kwargs={'device': 'cpu'} # Ensure embeddings run on CPU
     )
     return FAISS.from_texts(texts=chunks, embedding=embeddings)

-#
+# Handle user queries
 def handle_question(question, vectorstore=None):
     if vectorstore:
+        # Retrieve the most relevant chunk
         documents = vectorstore.similarity_search(question, k=1) # Retrieve fewer chunks
-        context = "\n".join([doc.page_content for doc in documents])[:
+        context = "\n".join([doc.page_content for doc in documents])[:500] # Short context for efficiency

         if context:
-
-            return result_with_context
+            return llm_context_chain.predict(instruction=question, context=context).strip()

-    # Fallback to instruction-only chain if no context
-    return llm_chain.
+    # Fallback to instruction-only chain if no context
+    return llm_chain.predict(instruction=question).strip()

 def main():
     st.title("Chat with Notes :books:")

-    #
+    # Session state for vectorstore
     if "vectorstore" not in st.session_state:
         st.session_state.vectorstore = None

-    #
-    data_folder = "data" # Current Affairs
-    essay_folder = "essays" # Essays
+    # Data folders
+    data_folder = "data" # Folder for Current Affairs
+    essay_folder = "essays" # Folder for Essays

-    #
+    # Content type selection
     content_type = st.sidebar.radio("Select Content Type:", ["Current Affairs", "Essays"])

-    #
+    # Subjects based on content type
     if content_type == "Current Affairs":
         subjects = [f for f in os.listdir(data_folder) if os.path.isdir(os.path.join(data_folder, f))] if os.path.exists(data_folder) else []
-
+    else:
         subjects = [f.replace(".txt", "") for f in os.listdir(essay_folder) if f.endswith('.txt')] if os.path.exists(essay_folder) else []

+    # Subject selection
     selected_subject = st.sidebar.selectbox("Select a Subject:", subjects)

-    #
+    # Load content based on selection
     raw_text = ""
     if content_type == "Current Affairs" and selected_subject:
         subject_folder = os.path.join(data_folder, selected_subject)
         raw_text = get_text_files_content(subject_folder)
     elif content_type == "Essays" and selected_subject:
-        subject_file = os.path.join(essay_folder, selected_subject
+        subject_file = os.path.join(essay_folder, f"{selected_subject}.txt")
         if os.path.exists(subject_file):
             with open(subject_file, "r", encoding="utf-8") as file:
                 raw_text = file.read()

-    # Display preview
+    # Display preview and create vectorstore
     if raw_text:
         st.subheader("Preview of Notes")
-        st.text_area("Preview Content:", value=raw_text[:1000], height=300, disabled=True)
+        st.text_area("Preview Content:", value=raw_text[:1000], height=300, disabled=True)

-        # Preload vectorstore if not already cached
         if "vectorstore" not in st.session_state or st.session_state.vectorstore is None:
-
-            st.session_state.vectorstore = get_vectorstore(
+            chunks = get_chunks(raw_text)
+            st.session_state.vectorstore = get_vectorstore(chunks)
     else:
         st.warning("No content available for the selected subject.")

-    #
+    # Question and response
     st.subheader("Ask Your Question")
     question = st.text_input("Ask a question about your selected subject:")
     if question:
         if st.session_state.vectorstore:
             response = handle_question(question, st.session_state.vectorstore)
             st.subheader("Answer:")
-            st.write(response
+            st.write(response or "No response found.")
         else:
             st.warning("Please load the content for the selected subject before asking a question.")

-if __name__ ==
+if __name__ == "__main__":
     main()
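Both hunks begin at line 15, so the import block and page setup at the top of app.py are not shown in this diff. As a reference, here is a minimal sketch of what that header presumably contains, inferred from the names the changed code uses. The LangChain paths assume the older langchain.* layout (matching the langchain.text_splitter import in get_chunks); newer releases move these classes to langchain_community. The layout value in st.set_page_config is cut off in the hunk header, so "wide" is an assumption.

# Presumed top of app.py (not part of the diff); adjust module paths to the installed LangChain version
import os

import streamlit as st
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.chains import LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS

st.set_page_config(page_title="Chat with Notes and AI", page_icon=":books:", layout="wide") # "wide" assumed; value truncated in the hunk header

With the header in place, the Space (or a local checkout) runs the app with: streamlit run app.py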