nkcong206 committed on
Commit
dca18ab
·
verified ·
1 Parent(s): af5ec80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -40
app.py CHANGED
@@ -35,7 +35,6 @@ if "save_dir" not in st.session_state:
35
  if "uploaded_files" not in st.session_state:
36
  st.session_state.uploaded_files = set()
37
 
38
- # Caching functions
39
  @st.cache_resource
40
  def get_chat_google_model(api_key):
41
  os.environ["GOOGLE_API_KEY"] = api_key
@@ -60,27 +59,22 @@ def get_embedding_model():
60
  )
61
  return model
62
 
63
- # Load and process text files
64
  def load_txt(file_path):
65
  loader = TextLoader(file_path=file_path, encoding="utf-8")
66
  doc = loader.load()
67
  return doc
68
 
69
- def format_docs(docs):
70
- return "\n\n".join(doc.page_content for doc in docs)
71
-
72
- # Compute RAG Chain
73
  @st.cache_resource
74
  def compute_rag_chain(_model, _embd, docs_texts):
75
  if not docs_texts:
76
  raise ValueError("No documents to process. Please upload valid text files.")
77
 
78
  combined_text = "\n\n".join(docs_texts)
79
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
80
  texts = text_splitter.split_text(combined_text)
81
 
82
- if not texts:
83
- raise ValueError("Text splitter did not generate any text chunks. Check your input.")
84
 
85
  vectorstore = Chroma.from_texts(texts=texts, embedding=_embd)
86
  retriever = vectorstore.as_retriever()
@@ -132,43 +126,23 @@ if st.session_state.save_dir is None:
132
  os.makedirs(save_dir)
133
  st.session_state.save_dir = save_dir
134
 
135
- # Sidebar to upload files
136
  with st.sidebar:
137
  uploaded_files = st.file_uploader("Chọn file txt", accept_multiple_files=True, type=["txt"])
 
138
  if uploaded_files:
139
  documents = []
140
- uploaded_file_names = set()
141
  for uploaded_file in uploaded_files:
142
- uploaded_file_names.add(uploaded_file.name)
143
- if uploaded_file.name not in st.session_state.uploaded_files:
144
- file_path = os.path.join(st.session_state.save_dir, uploaded_file.name)
145
- with open(file_path, mode='wb') as w:
146
- w.write(uploaded_file.getvalue())
147
- doc = load_txt(file_path)
148
- documents.extend([*doc])
 
 
 
149
 
150
  if documents:
151
  docs_texts = [d.page_content for d in documents]
152
  st.session_state.rag = compute_rag_chain(st.session_state.model, st.session_state.embd, docs_texts)
153
- st.session_state.uploaded_files = uploaded_file_names
154
-
155
- # Chat Interface
156
- if "chat_history" not in st.session_state:
157
- st.session_state.chat_history = []
158
-
159
- for message in st.session_state.chat_history:
160
- with st.chat_message(message["role"]):
161
- st.write(message["content"])
162
-
163
- prompt = st.chat_input("Bạn muốn hỏi gì?")
164
- if prompt and st.session_state.model:
165
- st.session_state.chat_history.append({"role": "user", "content": prompt})
166
- with st.chat_message("user"):
167
- st.write(prompt)
168
- with st.chat_message("assistant"):
169
- if st.session_state.rag:
170
- response = st.session_state.rag.invoke(prompt)
171
- else:
172
- response = st.session_state.model.invoke(prompt).content
173
- st.write(response)
174
- st.session_state.chat_history.append({"role": "assistant", "content": response})
 
35
  if "uploaded_files" not in st.session_state:
36
  st.session_state.uploaded_files = set()
37
 
 
38
  @st.cache_resource
39
  def get_chat_google_model(api_key):
40
  os.environ["GOOGLE_API_KEY"] = api_key
 
59
  )
60
  return model
61
 
 
62
  def load_txt(file_path):
63
  loader = TextLoader(file_path=file_path, encoding="utf-8")
64
  doc = loader.load()
65
  return doc
66
 
 
 
 
 
67
  @st.cache_resource
68
  def compute_rag_chain(_model, _embd, docs_texts):
69
  if not docs_texts:
70
  raise ValueError("No documents to process. Please upload valid text files.")
71
 
72
  combined_text = "\n\n".join(docs_texts)
73
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
74
  texts = text_splitter.split_text(combined_text)
75
 
76
+ if len(texts) > 5000:
77
+ raise ValueError("The document creates too many chunks. Please use smaller documents.")
78
 
79
  vectorstore = Chroma.from_texts(texts=texts, embedding=_embd)
80
  retriever = vectorstore.as_retriever()
 
126
  os.makedirs(save_dir)
127
  st.session_state.save_dir = save_dir
128
 
 
129
  with st.sidebar:
130
  uploaded_files = st.file_uploader("Chọn file txt", accept_multiple_files=True, type=["txt"])
131
+ max_file_size_mb = 5
132
  if uploaded_files:
133
  documents = []
 
134
  for uploaded_file in uploaded_files:
135
+ if uploaded_file.size > max_file_size_mb * 1024 * 1024:
136
+ st.warning(f"Tệp {uploaded_file.name} vượt quá giới hạn {max_file_size_mb}MB.")
137
+ continue
138
+
139
+ file_path = os.path.join(st.session_state.save_dir, uploaded_file.name)
140
+ with open(file_path, mode='wb') as w:
141
+ w.write(uploaded_file.getvalue())
142
+
143
+ doc = load_txt(file_path)
144
+ documents.extend([*doc])
145
 
146
  if documents:
147
  docs_texts = [d.page_content for d in documents]
148
  st.session_state.rag = compute_rag_chain(st.session_state.model, st.session_state.embd, docs_texts)