Rulga committed on
Commit
231b18c
·
1 Parent(s): 199cd7b

feat: initialize dataset structure with chat history and vector store

Browse files
Files changed (2) hide show
  1. app.py +61 -44
  2. init_dataset.py +69 -0
app.py CHANGED
@@ -13,6 +13,19 @@ from langchain_core.runnables import RunnableLambda
13
  import requests
14
  import json
15
  from datetime import datetime
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Define base directory and absolute paths
18
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -217,43 +230,54 @@ def check_directory_permissions(directory):
217
  error_msg = f"Permission error: {str(e)} (Directory permissions: {permissions})"
218
  return False, error_msg
219
 
220
- def force_save_vector_store(vector_store):
221
- """Ensures vector store is properly saved to disk"""
222
  try:
223
- # Check directory permissions
224
- success, error_msg = check_directory_permissions(VECTOR_STORE_PATH)
225
- if not success:
226
- raise Exception(error_msg)
227
 
228
- # Save vector store
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  vector_store.save_local(VECTOR_STORE_PATH)
230
 
231
- # Verify vector store files were created
232
- index_file = os.path.join(VECTOR_STORE_PATH, "index.faiss")
233
- if not os.path.exists(index_file):
234
- raise Exception("Vector store files were not created")
235
-
236
- # Verify file permissions
237
- if not os.access(index_file, os.R_OK | os.W_OK):
238
- raise Exception(f"Insufficient permissions for vector store files")
239
-
240
- #st.caption("✅ Vector store saved successfully")
241
- st.toast("✅ Vector store saved", icon="💾")
242
 
243
  except Exception as e:
244
- error_msg = f"Failed to save vector store: {str(e)}"
245
- st.caption(error_msg)
246
- st.error(error_msg) # Also show as error message
247
  raise Exception(error_msg)
248
 
249
  def force_save_chat_history(chat_entry):
250
- """Ensures chat history is properly saved to disk"""
251
  try:
252
- # Check directory permissions
253
- success, error_msg = check_directory_permissions(CHAT_HISTORY_DIR)
254
- if not success:
255
- raise Exception(error_msg)
256
-
257
  current_date = datetime.now().strftime("%Y-%m-%d")
258
  filename = os.path.join(CHAT_HISTORY_DIR, f"chat_history_{current_date}.json")
259
 
@@ -266,27 +290,20 @@ def force_save_chat_history(chat_entry):
266
  # Add new entry
267
  existing_history.append(chat_entry)
268
 
269
- # Save updated history with fsync to ensure disk write
270
  with open(filename, 'w', encoding='utf-8') as f:
271
  json.dump(existing_history, f, ensure_ascii=False, indent=2)
272
- f.flush()
273
- os.fsync(f.fileno())
274
- os.sync() # Принудительная синхронизация файловой системы
275
-
276
- # Verify file was created and is readable
277
- if not os.path.exists(filename):
278
- raise Exception("Chat history file was not created")
279
-
280
- if not os.access(filename, os.R_OK | os.W_OK):
281
- raise Exception(f"Insufficient permissions for chat history file")
282
-
283
- #st.caption("✅ Chat history saved successfully")
284
- st.toast("✅ Chat history saved", icon="💬")
285
 
286
  except Exception as e:
287
- error_msg = f"Failed to save chat history: {str(e)}"
288
- st.caption(error_msg)
289
- st.error(error_msg) # Also show as error message
290
  raise Exception(error_msg)
291
 
292
  # Main function
 
13
  import requests
14
  import json
15
  from datetime import datetime
16
+ from huggingface_hub import HfApi, upload_file, upload_folder, create_repo, Repository
17
+ from huggingface_hub.utils import RepositoryNotFoundError
18
+ import shutil
19
+
20
+ # Add these to your secrets or environment variables
21
# Coordinates of the Hugging Face dataset used for persistence.
HF_USERNAME = "Rulga"  # Hugging Face account that owns the dataset
DATASET_NAME = "LS_chat"  # Dataset repository name
DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"

# Only the secrets lookup can fail, so it is the only statement guarded.
# On failure the error is shown in the UI and the script run is halted.
try:
    HF_TOKEN = st.secrets["HF_TOKEN"]
except Exception:
    st.error("Error loading HuggingFace credentials. Please check your configuration.")
    st.stop()
29
 
30
  # Define base directory and absolute paths
31
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
230
  error_msg = f"Permission error: {str(e)} (Directory permissions: {permissions})"
231
  return False, error_msg
232
 
233
def sync_with_hf(local_path, repo_path, commit_message):
    """Upload a local directory to the Hugging Face dataset repository.

    Args:
        local_path: Local directory whose contents are uploaded.
        repo_path: Destination folder inside the dataset repository.
        commit_message: Commit message recorded for the upload.

    Raises:
        Exception: If the repository lookup, creation, or upload fails.
            The message is also displayed via ``st.error`` and the
            original error is attached as the cause.
    """
    try:
        # Authenticate the client itself so the existence check below is
        # made with the token. An anonymous lookup reports a private
        # dataset as "not found", and the create call would then fail on
        # a conflict because the repository actually exists.
        api = HfApi(token=HF_TOKEN)

        # Create the repository if it does not exist yet.
        try:
            api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
        except RepositoryNotFoundError:
            # exist_ok tolerates another session creating the repository
            # between the lookup and this call.
            create_repo(
                DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                exist_ok=True,
            )

        # Upload the directory content.
        api.upload_folder(
            folder_path=local_path,
            path_in_repo=repo_path,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            commit_message=commit_message,
            token=HF_TOKEN,
        )
        st.toast(f"✅ Synchronized with Hugging Face: {repo_path}", icon="🤗")

    except Exception as e:
        error_msg = f"Failed to sync with Hugging Face: {str(e)}"
        st.error(error_msg)
        # Chain the cause so the underlying traceback is preserved.
        raise Exception(error_msg) from e
259
+
260
def force_save_vector_store(vector_store):
    """Save the vector store locally, then sync it to Hugging Face.

    Args:
        vector_store: Object exposing ``save_local(path)``; it is written
            to ``VECTOR_STORE_PATH``.

    Raises:
        Exception: If the local save or the upload fails. The message is
            also displayed via ``st.error`` and the original error is
            attached as the cause.
    """
    try:
        # Local save
        vector_store.save_local(VECTOR_STORE_PATH)

        # Sync with HF; the timestamp makes each commit message distinct.
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        sync_with_hf(
            local_path=VECTOR_STORE_PATH,
            repo_path="vector_store",
            commit_message=f"Update vector store: {timestamp}",
        )

    except Exception as e:
        error_msg = f"Failed to save vector store: {str(e)}"
        st.error(error_msg)
        # Chain the cause so the underlying traceback is preserved.
        raise Exception(error_msg) from e
277
 
278
  def force_save_chat_history(chat_entry):
279
+ """Save chat history locally and sync with HF"""
280
  try:
 
 
 
 
 
281
  current_date = datetime.now().strftime("%Y-%m-%d")
282
  filename = os.path.join(CHAT_HISTORY_DIR, f"chat_history_{current_date}.json")
283
 
 
290
  # Add new entry
291
  existing_history.append(chat_entry)
292
 
293
+ # Save locally
294
  with open(filename, 'w', encoding='utf-8') as f:
295
  json.dump(existing_history, f, ensure_ascii=False, indent=2)
296
+
297
+ # Sync with HF
298
+ sync_with_hf(
299
+ local_path=CHAT_HISTORY_DIR,
300
+ repo_path="chat_history",
301
+ commit_message=f"Update chat history: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
302
+ )
 
 
 
 
 
 
303
 
304
  except Exception as e:
305
+ error_msg = f"Failed to save chat history: {str(e)}"
306
+ st.error(error_msg)
 
307
  raise Exception(error_msg)
308
 
309
  # Main function
init_dataset.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""One-off script that creates the initial layout of the Hugging Face dataset.

It uploads empty ``chat_history/`` and ``vector_store/`` folders (kept alive
by ``.gitkeep`` files) together with a README describing the structure.

The access token is read from the ``HF_TOKEN`` environment variable so that
no credential has to be written into this file.
"""
import json
import os
import sys
import tempfile

from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

# Configuration
HF_TOKEN = os.environ.get("HF_TOKEN")  # export HF_TOKEN before running
HF_USERNAME = "Rulga"
DATASET_NAME = "LS_chat"
DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"

# README.md content describing the dataset structure
readme_content = """
# LS Chat Dataset

This dataset contains chat history and vector store for the Status.Law Legal Assistant.

## Structure

- `chat_history/`: Contains daily chat history files
- `vector_store/`: Contains FAISS vector store files

## Usage

This dataset is automatically updated by the Status.Law Legal Assistant application.
"""


def main():
    """Build the skeleton in a temporary directory and upload it.

    Returns:
        int: 0 on success, 1 if the token is missing or the upload failed.
    """
    if not HF_TOKEN:
        print("Error: HF_TOKEN environment variable is not set")
        return 1

    # A unique temporary directory avoids reusing -- and afterwards
    # deleting -- an unrelated "temp_dataset" folder in the working
    # directory. It is removed automatically when the block exits.
    with tempfile.TemporaryDirectory(prefix="temp_dataset_") as temp_dir:
        # Create the folder skeleton with empty .gitkeep files
        for subdir in ("chat_history", "vector_store"):
            folder = os.path.join(temp_dir, subdir)
            os.makedirs(folder, exist_ok=True)
            with open(os.path.join(folder, ".gitkeep"), "w"):
                pass

        with open(os.path.join(temp_dir, "README.md"), "w", encoding="utf-8") as f:
            f.write(readme_content)

        # Initialize the repository and upload to Hugging Face
        try:
            # Authenticated client, so a private repository is found too.
            api = HfApi(token=HF_TOKEN)

            # Create the repository if it does not exist. Only a genuine
            # "not found" triggers creation; other failures (network,
            # authentication) are reported instead of being mistaken for
            # a missing repository.
            try:
                api.repo_info(repo_id=DATASET_REPO, repo_type="dataset")
                print(f"Repository {DATASET_REPO} already exists")
            except RepositoryNotFoundError:
                create_repo(
                    DATASET_REPO,
                    repo_type="dataset",
                    token=HF_TOKEN,
                    exist_ok=True,
                )
                print(f"Created new repository {DATASET_REPO}")

            # Upload the structure
            api.upload_folder(
                folder_path=temp_dir,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                commit_message="Initialize dataset structure",
                token=HF_TOKEN,
            )
            print("Successfully initialized dataset structure!")

        except Exception as e:
            print(f"Error: {str(e)}")
            return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())