Update app.py
Browse files
app.py
CHANGED
@@ -35,7 +35,6 @@ if "save_dir" not in st.session_state:
|
|
35 |
if "uploaded_files" not in st.session_state:
|
36 |
st.session_state.uploaded_files = set()
|
37 |
|
38 |
-
# Caching functions
|
39 |
@st.cache_resource
|
40 |
def get_chat_google_model(api_key):
|
41 |
os.environ["GOOGLE_API_KEY"] = api_key
|
@@ -60,27 +59,22 @@ def get_embedding_model():
|
|
60 |
)
|
61 |
return model
|
62 |
|
63 |
-
# Load and process text files
|
64 |
def load_txt(file_path):
    """Load a UTF-8 text file through ``TextLoader``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Whatever ``TextLoader.load()`` returns for the file; callers in this
        file iterate over it and read ``page_content`` from each item.
    """
    # The encoding is pinned so decoding does not depend on the host locale.
    loader = TextLoader(file_path=file_path, encoding="utf-8")
    return loader.load()
|
68 |
|
69 |
-
def format_docs(docs):
    """Join the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values separated by a blank line; an empty
        string when ``docs`` is empty.
    """
    return "\n\n".join(doc.page_content for doc in docs)
|
71 |
-
|
72 |
-
# Compute RAG Chain
|
73 |
@st.cache_resource
|
74 |
def compute_rag_chain(_model, _embd, docs_texts):
|
75 |
if not docs_texts:
|
76 |
raise ValueError("No documents to process. Please upload valid text files.")
|
77 |
|
78 |
combined_text = "\n\n".join(docs_texts)
|
79 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=
|
80 |
texts = text_splitter.split_text(combined_text)
|
81 |
|
82 |
-
if
|
83 |
-
raise ValueError("
|
84 |
|
85 |
vectorstore = Chroma.from_texts(texts=texts, embedding=_embd)
|
86 |
retriever = vectorstore.as_retriever()
|
@@ -132,43 +126,23 @@ if st.session_state.save_dir is None:
|
|
132 |
os.makedirs(save_dir)
|
133 |
st.session_state.save_dir = save_dir
|
134 |
|
135 |
-
# Sidebar to upload files
|
136 |
with st.sidebar:
|
137 |
uploaded_files = st.file_uploader("Chọn file txt", accept_multiple_files=True, type=["txt"])
|
|
|
138 |
if uploaded_files:
|
139 |
documents = []
|
140 |
-
uploaded_file_names = set()
|
141 |
for uploaded_file in uploaded_files:
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
149 |
|
150 |
if documents:
|
151 |
docs_texts = [d.page_content for d in documents]
|
152 |
st.session_state.rag = compute_rag_chain(st.session_state.model, st.session_state.embd, docs_texts)
|
153 |
-
st.session_state.uploaded_files = uploaded_file_names
|
154 |
-
|
155 |
-
# Chat interface: replay the stored conversation, then handle one new prompt.
# NOTE(review): nesting was reconstructed from a flattened diff view.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Streamlit reruns the script on every interaction, so earlier turns are
# redrawn from session state each time.
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.write(message["content"])

prompt = st.chat_input("Bạn muốn hỏi gì?")
if prompt and st.session_state.model:
    st.session_state.chat_history.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.write(prompt)

    with st.chat_message("assistant"):
        # Answer through the RAG chain when one has been built; otherwise
        # query the chat model directly and take its ``content``.
        if st.session_state.rag:
            response = st.session_state.rag.invoke(prompt)
        else:
            response = st.session_state.model.invoke(prompt).content
        st.write(response)

    st.session_state.chat_history.append({"role": "assistant", "content": response})
|
|
|
35 |
if "uploaded_files" not in st.session_state:
|
36 |
st.session_state.uploaded_files = set()
|
37 |
|
|
|
38 |
@st.cache_resource
|
39 |
def get_chat_google_model(api_key):
|
40 |
os.environ["GOOGLE_API_KEY"] = api_key
|
|
|
59 |
)
|
60 |
return model
|
61 |
|
|
|
62 |
def load_txt(file_path):
    """Load a UTF-8 text file through ``TextLoader``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Whatever ``TextLoader.load()`` returns for the file; callers in this
        file iterate over it and read ``page_content`` from each item.
    """
    # The encoding is pinned so decoding does not depend on the host locale.
    loader = TextLoader(file_path=file_path, encoding="utf-8")
    return loader.load()
|
66 |
|
|
|
|
|
|
|
|
|
67 |
@st.cache_resource
|
68 |
def compute_rag_chain(_model, _embd, docs_texts):
|
69 |
if not docs_texts:
|
70 |
raise ValueError("No documents to process. Please upload valid text files.")
|
71 |
|
72 |
combined_text = "\n\n".join(docs_texts)
|
73 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
|
74 |
texts = text_splitter.split_text(combined_text)
|
75 |
|
76 |
+
if len(texts) > 5000:
|
77 |
+
raise ValueError("The document creates too many chunks. Please use smaller documents.")
|
78 |
|
79 |
vectorstore = Chroma.from_texts(texts=texts, embedding=_embd)
|
80 |
retriever = vectorstore.as_retriever()
|
|
|
126 |
os.makedirs(save_dir)
|
127 |
st.session_state.save_dir = save_dir
|
128 |
|
|
|
129 |
# Sidebar: accept .txt uploads, store them under the session's save directory,
# and build the RAG chain from their contents.
# NOTE(review): nesting was reconstructed from a flattened diff view; confirm
# that the processing below is meant to run inside the sidebar context.
with st.sidebar:
    uploaded_files = st.file_uploader("Chọn file txt", accept_multiple_files=True, type=["txt"])
    max_file_size_mb = 5
    if uploaded_files:
        max_file_size_bytes = max_file_size_mb * 1024 * 1024
        documents = []
        for uploaded_file in uploaded_files:
            # Oversized uploads are reported and skipped; the rest still load.
            if uploaded_file.size > max_file_size_bytes:
                st.warning(f"Tệp {uploaded_file.name} vượt quá giới hạn {max_file_size_mb}MB.")
                continue

            # The name comes from the client, so keep only its final path
            # component to stop it from escaping the save directory.
            safe_name = os.path.basename(uploaded_file.name)
            file_path = os.path.join(st.session_state.save_dir, safe_name)
            with open(file_path, mode="wb") as w:
                w.write(uploaded_file.getvalue())

            documents.extend(load_txt(file_path))

        if documents:
            docs_texts = [d.page_content for d in documents]
            try:
                st.session_state.rag = compute_rag_chain(
                    st.session_state.model, st.session_state.embd, docs_texts
                )
            except ValueError as err:
                # compute_rag_chain raises ValueError for empty or oversized
                # input; show the message instead of an uncaught exception.
                st.error(str(err))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|