nkcong206 committed on
Commit
dca18ab
·
verified ·
1 Parent(s): af5ec80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -40
app.py CHANGED
@@ -35,7 +35,6 @@ if "save_dir" not in st.session_state:
35
  if "uploaded_files" not in st.session_state:
36
  st.session_state.uploaded_files = set()
37
 
38
- # Caching functions
39
  @st.cache_resource
40
  def get_chat_google_model(api_key):
41
  os.environ["GOOGLE_API_KEY"] = api_key
@@ -60,27 +59,22 @@ def get_embedding_model():
60
  )
61
  return model
62
 
63
- # Load and process text files
64
  def load_txt(file_path):
65
  loader = TextLoader(file_path=file_path, encoding="utf-8")
66
  doc = loader.load()
67
  return doc
68
 
69
- def format_docs(docs):
70
- return "\n\n".join(doc.page_content for doc in docs)
71
-
72
- # Compute RAG Chain
73
  @st.cache_resource
74
  def compute_rag_chain(_model, _embd, docs_texts):
75
  if not docs_texts:
76
  raise ValueError("No documents to process. Please upload valid text files.")
77
 
78
  combined_text = "\n\n".join(docs_texts)
79
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
80
  texts = text_splitter.split_text(combined_text)
81
 
82
- if not texts:
83
- raise ValueError("Text splitter did not generate any text chunks. Check your input.")
84
 
85
  vectorstore = Chroma.from_texts(texts=texts, embedding=_embd)
86
  retriever = vectorstore.as_retriever()
@@ -132,43 +126,23 @@ if st.session_state.save_dir is None:
132
  os.makedirs(save_dir)
133
  st.session_state.save_dir = save_dir
134
 
135
- # Sidebar to upload files
136
  with st.sidebar:
137
  uploaded_files = st.file_uploader("Chọn file txt", accept_multiple_files=True, type=["txt"])
 
138
  if uploaded_files:
139
  documents = []
140
- uploaded_file_names = set()
141
  for uploaded_file in uploaded_files:
142
- uploaded_file_names.add(uploaded_file.name)
143
- if uploaded_file.name not in st.session_state.uploaded_files:
144
- file_path = os.path.join(st.session_state.save_dir, uploaded_file.name)
145
- with open(file_path, mode='wb') as w:
146
- w.write(uploaded_file.getvalue())
147
- doc = load_txt(file_path)
148
- documents.extend([*doc])
 
 
 
149
 
150
  if documents:
151
  docs_texts = [d.page_content for d in documents]
152
  st.session_state.rag = compute_rag_chain(st.session_state.model, st.session_state.embd, docs_texts)
153
- st.session_state.uploaded_files = uploaded_file_names
154
-
155
- # Chat Interface
156
- if "chat_history" not in st.session_state:
157
- st.session_state.chat_history = []
158
-
159
- for message in st.session_state.chat_history:
160
- with st.chat_message(message["role"]):
161
- st.write(message["content"])
162
-
163
- prompt = st.chat_input("Bạn muốn hỏi gì?")
164
- if prompt and st.session_state.model:
165
- st.session_state.chat_history.append({"role": "user", "content": prompt})
166
- with st.chat_message("user"):
167
- st.write(prompt)
168
- with st.chat_message("assistant"):
169
- if st.session_state.rag:
170
- response = st.session_state.rag.invoke(prompt)
171
- else:
172
- response = st.session_state.model.invoke(prompt).content
173
- st.write(response)
174
- st.session_state.chat_history.append({"role": "assistant", "content": response})
 
35
  if "uploaded_files" not in st.session_state:
36
  st.session_state.uploaded_files = set()
37
 
 
38
  @st.cache_resource
39
  def get_chat_google_model(api_key):
40
  os.environ["GOOGLE_API_KEY"] = api_key
 
59
  )
60
  return model
61
 
 
62
  def load_txt(file_path):
63
  loader = TextLoader(file_path=file_path, encoding="utf-8")
64
  doc = loader.load()
65
  return doc
66
 
 
 
 
 
67
  @st.cache_resource
68
  def compute_rag_chain(_model, _embd, docs_texts):
69
  if not docs_texts:
70
  raise ValueError("No documents to process. Please upload valid text files.")
71
 
72
  combined_text = "\n\n".join(docs_texts)
73
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
74
  texts = text_splitter.split_text(combined_text)
75
 
76
+ if len(texts) > 5000:
77
+ raise ValueError("The document creates too many chunks. Please use smaller documents.")
78
 
79
  vectorstore = Chroma.from_texts(texts=texts, embedding=_embd)
80
  retriever = vectorstore.as_retriever()
 
126
  os.makedirs(save_dir)
127
  st.session_state.save_dir = save_dir
128
 
 
129
  with st.sidebar:
130
  uploaded_files = st.file_uploader("Chọn file txt", accept_multiple_files=True, type=["txt"])
131
+ max_file_size_mb = 5
132
  if uploaded_files:
133
  documents = []
 
134
  for uploaded_file in uploaded_files:
135
+ if uploaded_file.size > max_file_size_mb * 1024 * 1024:
136
+ st.warning(f"Tệp {uploaded_file.name} vượt quá giới hạn {max_file_size_mb}MB.")
137
+ continue
138
+
139
+ file_path = os.path.join(st.session_state.save_dir, uploaded_file.name)
140
+ with open(file_path, mode='wb') as w:
141
+ w.write(uploaded_file.getvalue())
142
+
143
+ doc = load_txt(file_path)
144
+ documents.extend([*doc])
145
 
146
  if documents:
147
  docs_texts = [d.page_content for d in documents]
148
  st.session_state.rag = compute_rag_chain(st.session_state.model, st.session_state.embd, docs_texts)