Spaces:
Sleeping
Sleeping
feat: initialize dataset structure with chat history and vector store
Browse files- app.py +61 -44
- init_dataset.py +69 -0
app.py
CHANGED
@@ -13,6 +13,19 @@ from langchain_core.runnables import RunnableLambda
|
|
13 |
import requests
|
14 |
import json
|
15 |
from datetime import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
# Define base directory and absolute paths
|
18 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
@@ -217,43 +230,54 @@ def check_directory_permissions(directory):
|
|
217 |
error_msg = f"Permission error: {str(e)} (Directory permissions: {permissions})"
|
218 |
return False, error_msg
|
219 |
|
220 |
-
def
|
221 |
-
"""
|
222 |
try:
|
223 |
-
|
224 |
-
success, error_msg = check_directory_permissions(VECTOR_STORE_PATH)
|
225 |
-
if not success:
|
226 |
-
raise Exception(error_msg)
|
227 |
|
228 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
vector_store.save_local(VECTOR_STORE_PATH)
|
230 |
|
231 |
-
#
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
if not os.access(index_file, os.R_OK | os.W_OK):
|
238 |
-
raise Exception(f"Insufficient permissions for vector store files")
|
239 |
-
|
240 |
-
#st.caption("✅ Vector store saved successfully")
|
241 |
-
st.toast("✅ Vector store saved", icon="💾")
|
242 |
|
243 |
except Exception as e:
|
244 |
-
error_msg = f"
|
245 |
-
st.
|
246 |
-
st.error(error_msg) # Also show as error message
|
247 |
raise Exception(error_msg)
|
248 |
|
249 |
def force_save_chat_history(chat_entry):
|
250 |
-
"""
|
251 |
try:
|
252 |
-
# Check directory permissions
|
253 |
-
success, error_msg = check_directory_permissions(CHAT_HISTORY_DIR)
|
254 |
-
if not success:
|
255 |
-
raise Exception(error_msg)
|
256 |
-
|
257 |
current_date = datetime.now().strftime("%Y-%m-%d")
|
258 |
filename = os.path.join(CHAT_HISTORY_DIR, f"chat_history_{current_date}.json")
|
259 |
|
@@ -266,27 +290,20 @@ def force_save_chat_history(chat_entry):
|
|
266 |
# Add new entry
|
267 |
existing_history.append(chat_entry)
|
268 |
|
269 |
-
# Save
|
270 |
with open(filename, 'w', encoding='utf-8') as f:
|
271 |
json.dump(existing_history, f, ensure_ascii=False, indent=2)
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
if not os.access(filename, os.R_OK | os.W_OK):
|
281 |
-
raise Exception(f"Insufficient permissions for chat history file")
|
282 |
-
|
283 |
-
#st.caption("✅ Chat history saved successfully")
|
284 |
-
st.toast("✅ Chat history saved", icon="💬")
|
285 |
|
286 |
except Exception as e:
|
287 |
-
error_msg = f"
|
288 |
-
st.
|
289 |
-
st.error(error_msg) # Also show as error message
|
290 |
raise Exception(error_msg)
|
291 |
|
292 |
# Main function
|
|
|
13 |
import requests
|
14 |
import json
|
15 |
from datetime import datetime
|
16 |
+
from huggingface_hub import HfApi, upload_file, upload_folder, create_repo, Repository
|
17 |
+
from huggingface_hub.utils import RepositoryNotFoundError
|
18 |
+
import shutil
|
19 |
+
|
20 |
+
# Add these to your secrets or environment variables
|
21 |
+
# Coordinates of the Hugging Face dataset repository used for persistence.
HF_USERNAME = "Rulga"  # Your Hugging Face username
DATASET_NAME = "LS_chat"  # Your dataset name
DATASET_REPO = "/".join((HF_USERNAME, DATASET_NAME))

# Only the secrets lookup can fail; if it does, show the problem in the UI
# and halt the Streamlit script instead of continuing without a token.
try:
    HF_TOKEN = st.secrets["HF_TOKEN"]
except Exception:
    st.error("Error loading HuggingFace credentials. Please check your configuration.")
    st.stop()
|
29 |
|
30 |
# Define base directory and absolute paths
|
31 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
|
230 |
error_msg = f"Permission error: {str(e)} (Directory permissions: {permissions})"
|
231 |
return False, error_msg
|
232 |
|
233 |
+
def sync_with_hf(local_path, repo_path, commit_message):
    """Upload a local folder to the Hugging Face dataset repo ``DATASET_REPO``.

    The dataset repository is created first when the existence check reports
    that it is missing.

    Args:
        local_path: Local directory whose contents are uploaded.
        repo_path: Destination folder inside the dataset repository.
        commit_message: Commit message used for the upload.

    Raises:
        Exception: If the existence check, the repository creation or the
            upload fails. The same message is shown in the UI via
            ``st.error`` and the original error is attached as ``__cause__``.
    """
    try:
        api = HfApi()

        # Create repo if it doesn't exist.
        # The lookup is authenticated with the same token as the upload:
        # an anonymous lookup of a private repo is expected to be reported as
        # "not found", which would send us down the creation path for a repo
        # that already exists.
        try:
            api.repo_info(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        except RepositoryNotFoundError:
            # exist_ok=True keeps creation idempotent if the repo appears
            # between the check and this call (or the check was a false negative).
            create_repo(
                DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN,
                exist_ok=True,
            )

        # Upload directory content
        api.upload_folder(
            folder_path=local_path,
            path_in_repo=repo_path,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            commit_message=commit_message,
            token=HF_TOKEN,
        )
        st.toast(f"✅ Synchronized with Hugging Face: {repo_path}", icon="🤗")

    except Exception as e:
        error_msg = f"Failed to sync with Hugging Face: {str(e)}"
        st.error(error_msg)
        # Callers catch a plain Exception; chain the cause so the original
        # traceback is preserved for debugging.
        raise Exception(error_msg) from e
|
259 |
+
|
260 |
+
def force_save_vector_store(vector_store):
    """Write the vector store to VECTOR_STORE_PATH and push that folder to HF.

    Args:
        vector_store: Object exposing ``save_local(path)``.

    Raises:
        Exception: If saving or syncing raises; the message is prefixed with
            "Failed to save vector store: " and is also shown via ``st.error``.
    """
    try:
        # Step 1: persist to the local directory.
        vector_store.save_local(VECTOR_STORE_PATH)

        # Step 2: upload that directory; the timestamp is taken after the
        # local save so it reflects the moment of the sync.
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        sync_with_hf(
            VECTOR_STORE_PATH,
            "vector_store",
            "Update vector store: " + stamp,
        )
    except Exception as exc:
        failure = "Failed to save vector store: " + str(exc)
        st.error(failure)
        raise Exception(failure)
|
277 |
|
278 |
def force_save_chat_history(chat_entry):
|
279 |
+
"""Save chat history locally and sync with HF"""
|
280 |
try:
|
|
|
|
|
|
|
|
|
|
|
281 |
current_date = datetime.now().strftime("%Y-%m-%d")
|
282 |
filename = os.path.join(CHAT_HISTORY_DIR, f"chat_history_{current_date}.json")
|
283 |
|
|
|
290 |
# Add new entry
|
291 |
existing_history.append(chat_entry)
|
292 |
|
293 |
+
# Save locally
|
294 |
with open(filename, 'w', encoding='utf-8') as f:
|
295 |
json.dump(existing_history, f, ensure_ascii=False, indent=2)
|
296 |
+
|
297 |
+
# Sync with HF
|
298 |
+
sync_with_hf(
|
299 |
+
local_path=CHAT_HISTORY_DIR,
|
300 |
+
repo_path="chat_history",
|
301 |
+
commit_message=f"Update chat history: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
|
302 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
|
304 |
except Exception as e:
|
305 |
+
error_msg = f"Failed to save chat history: {str(e)}"
|
306 |
+
st.error(error_msg)
|
|
|
307 |
raise Exception(error_msg)
|
308 |
|
309 |
# Main function
|
init_dataset.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
import os
import shutil
import sys
import tempfile

from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

# Configuration
# The token is read from the environment so a real credential never has to be
# written into (and committed with) this file. The placeholder is only the
# fallback value and is rejected below.
HF_TOKEN = os.environ.get("HF_TOKEN", "your_token_here")
HF_USERNAME = "Rulga"
DATASET_NAME = "LS_chat"
DATASET_REPO = f"{HF_USERNAME}/{DATASET_NAME}"

if HF_TOKEN == "your_token_here":
    sys.exit("Error: set the HF_TOKEN environment variable to a Hugging Face token")

# README.md describing the dataset layout
readme_content = """
# LS Chat Dataset

This dataset contains chat history and vector store for the Status.Law Legal Assistant.

## Structure

- `chat_history/`: Contains daily chat history files
- `vector_store/`: Contains FAISS vector store files

## Usage

This dataset is automatically updated by the Status.Law Legal Assistant application.
"""

# Build the skeleton in a fresh, uniquely named temporary directory so that
# the cleanup at the end can never delete a pre-existing "temp_dataset"
# folder in the current working directory.
temp_dir = tempfile.mkdtemp(prefix="temp_dataset_")

try:
    # Create both sub-folders, each with an empty .gitkeep file so the
    # otherwise empty folder is included in the upload.
    for subdir in ("chat_history", "vector_store"):
        os.makedirs(os.path.join(temp_dir, subdir), exist_ok=True)
        with open(os.path.join(temp_dir, subdir, ".gitkeep"), "w"):
            pass

    with open(os.path.join(temp_dir, "README.md"), "w", encoding="utf-8") as f:
        f.write(readme_content)

    # Initialize and upload to Hugging Face
    api = HfApi()

    # Create the repository if it does not exist. Only a genuine "not found"
    # answer triggers creation; network or other errors propagate to the
    # handler below instead of being mistaken for a missing repo.
    try:
        api.repo_info(repo_id=DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        print(f"Repository {DATASET_REPO} already exists")
    except RepositoryNotFoundError:
        create_repo(DATASET_REPO, repo_type="dataset", token=HF_TOKEN, exist_ok=True)
        print(f"Created new repository {DATASET_REPO}")

    # Upload the structure
    api.upload_folder(
        folder_path=temp_dir,
        repo_id=DATASET_REPO,
        repo_type="dataset",
        commit_message="Initialize dataset structure",
        token=HF_TOKEN,
    )
    print("Successfully initialized dataset structure!")

except Exception as e:
    print(f"Error: {str(e)}")
    # Signal the failure to the calling shell / CI instead of exiting with 0.
    sys.exit(1)

finally:
    # Clean up temporary files
    shutil.rmtree(temp_dir, ignore_errors=True)
|