leafmoes committed on
Commit
e60431a
·
1 Parent(s): f883aff

Update sync_data.sh

Browse files
Files changed (1) hide show
  1. sync_data.sh +240 -133
sync_data.sh CHANGED
@@ -1,144 +1,251 @@
1
- #!/bin/bash
2
 
3
- # 检查必要的WebDAV环境变量
4
- if [ -z "$WEBDAV_URL" ] || [ -z "$WEBDAV_USERNAME" ] || [ -z "$WEBDAV_PASSWORD" ]; then
5
- echo "缺少必要的WebDAV环境变量: WEBDAV_URL, WEBDAV_USERNAME 或 WEBDAV_PASSWORD"
6
- exit 1
7
- fi
8
-
9
- # 去除WEBDAV_URL末尾的斜杠(如果有的话)
10
- WEBDAV_URL=$(echo "$WEBDAV_URL" | sed 's/\/$//')
11
-
12
- # 创建数据目录
13
- mkdir -p ./data/webdav
14
 
15
- check_webdav_webui_latest_exists() {
16
- http_code=$(curl --user "$WEBDAV_USERNAME:$WEBDAV_PASSWORD" --head --silent --write-out "%{http_code}" "$WEBDAV_URL/OpenWebUI/webui_latest.db" -o /dev/null)
17
- if [ "$http_code" -eq 200 ]; then
18
- return 0 # 文件存在
19
- else
20
- return 1 # 文件不存在
21
- fi
22
- }
23
-
24
- # 从WebDAV拉取最新的webui_latest.db文件
25
- download_latest_db() {
26
- echo "检查云端是否存在webui_latest.db文件..."
27
-
28
- if check_webdav_webui_latest_exists; then
29
- echo "云端存在webui_latest.db文件,开始拉取..."
30
- curl -o ./data/webdav/webui_latest.db \
31
- --user "$WEBDAV_USERNAME:$WEBDAV_PASSWORD" \
32
- "$WEBDAV_URL/OpenWebUI/webui_latest.db" && {
33
- echo "最新的webui_latest.db文件拉取成功"
34
- cp ./data/webdav/webui_latest.db ./data/webui.db
35
- } || {
36
- echo "拉取webui_latest.db文件失败"
37
- }
38
- else
39
- echo "云端不存在webui_latest.db文件,跳过拉取"
40
- fi
41
- }
42
 
43
- # 比较两个文件的hash值
44
- compare_hashes() {
45
- local file1=$1
46
- local file2=$2
47
- local hash1
48
- local hash2
49
-
50
- # 检查文件是否存在
51
- if [ ! -f "$file1" ] || [ ! -f "$file2" ]; then
52
- echo "文件 $file1 或 $file2 不存在,无法比较哈希值"
53
- return 1
54
- fi
55
-
56
- # 计算文件的hash值
57
- hash1=$(sha256sum "$file1" | awk '{print $1}')
58
- hash2=$(sha256sum "$file2" | awk '{print $1}')
59
-
60
- # 比较hash值
61
- if [ "$hash1" == "$hash2" ]; then
62
- return 0 # 哈希值相同
63
- else
64
- return 1 # 哈希值不同
65
- fi
66
- }
67
 
68
- # 上传备份文件到WebDAV
69
- upload_to_webdav() {
70
- echo "检查是否需要上传文件到 WebDAV..."
71
-
72
- # 如果 ./data/webui.db 文件不存在,跳过上传
73
- if [ ! -f ./data/webui.db ]; then
74
- echo "数据库未初始化,跳过上传"
75
- return
76
- fi
77
-
78
- # 检查云端是否存在webui_latest.db文件
79
- if check_webdav_webui_latest_exists; then
80
- # 比较本地和云端的webui.db文件hash值
81
- if compare_hashes ./data/webui.db ./data/webdav/webui_latest.db; then
82
- echo "本地和云端的webui.db文件哈希值一致,跳过上传"
83
- return
84
- fi
85
- echo "本地和云端的webui.db文件哈希值不同,准备上传..."
86
- else
87
- echo "云端不存在webui_latest.db文件,直接上传文件"
88
- fi
89
-
90
- # 上传最新的webui_latest.db到 WebDAV
91
- cp ./data/webui.db ./data/webdav/webui_latest.db
92
- curl -T ./data/webdav/webui_latest.db --user "$WEBDAV_USERNAME:$WEBDAV_PASSWORD" "$WEBDAV_URL/OpenWebUI/webui_latest.db" && {
93
- echo "最新的webui_latest.db上传成功"
94
- } || {
95
- echo "上传webui_latest.db失败"
96
- }
97
-
98
- # 上传以日期命名的备份文件
99
- FILENAME="webui_$(date +'%m_%d').db"
100
- curl -T ./data/webdav/webui_latest.db --user "$WEBDAV_USERNAME:$WEBDAV_PASSWORD" "$WEBDAV_URL/OpenWebUI/$FILENAME" && {
101
- echo "备份文件 $FILENAME 上传成功"
102
- } || {
103
- echo "上传备份文件 $FILENAME 失败"
104
- }
105
-
106
- # 获取当前 WebDAV 目录中的所有备份文件,按时间排序
107
- echo "检查云端备份文件..."
108
- backup_files=$(curl -s --user "$WEBDAV_USERNAME:$WEBDAV_PASSWORD" "$WEBDAV_URL/OpenWebUI/" | grep -o 'webui_.*\.db' | grep -v 'webui_latest.db' | sort)
109
-
110
- # 如果备份文件超过 5 个,则删除最旧的一个
111
- backup_count=$(echo "$backup_files" | wc -l)
112
- if [ "$backup_count" -gt 5 ]; then
113
- oldest_file=$(echo "$backup_files" | head -n 1)
114
- echo "云端备份文件超过5个,删除最旧的备份文件: $oldest_file"
115
-
116
- curl -X DELETE --user "$WEBDAV_USERNAME:$WEBDAV_PASSWORD" "$WEBDAV_URL/OpenWebUI/$oldest_file" && {
117
- echo "删除最旧备份文件成功"
118
- } || {
119
- echo "删除最旧备份文件失败"
120
- }
121
- fi
122
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
- # 定义同步函数
125
  sync_data() {
 
126
  while true; do
127
- echo "正在开始同步"
128
-
129
- # 上传文件到WebDAV
130
- upload_to_webdav
131
-
132
- # 等待时间间隔
133
- SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
134
- echo "当前时间 $(date '+%Y-%m-%d %H:%M:%S')"
135
- echo "等待 ${SYNC_INTERVAL} 秒后进行下一次同步..."
136
- sleep $SYNC_INTERVAL
137
  done
138
  }
139
 
140
- # 初始拉取最新的webui_latest.db文件
141
- download_latest_db
142
 
143
- # 后台启动同步进程
144
- sync_data &
 
1
+ #!/bin/sh
2
 
3
+ # 定义应用名称
4
+ APP_NAME="AiChat"
 
 
 
 
 
 
 
 
 
5
 
6
+ # 黑白名单配置(逗号分隔),先进行白名单过滤,然后在白名单的基础上进行黑名单过滤
7
+ WHITELIST="webui.db"
8
+ BLACKLIST=""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
11
+ echo "缺少必要的环境变量 HF_TOKEN 或 DATASET_ID"
12
+ exit 1
13
+ fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ mkdir -p "./data"
16
+ mkdir -p "/tmp/${APP_NAME}"
17
+
18
+ cat > /tmp/hf_sync.py << 'EOL'
19
+ import os
20
+ import sys
21
+ import hashlib
22
+ import shutil
23
+ from huggingface_hub import HfApi, CommitOperationAdd, CommitOperationDelete
24
+
25
+ def log_print(*args, prefix="[SyncData]", **kwargs):
26
+ print(prefix, *args, **kwargs)
27
+
28
+
29
+ def calculate_file_hash(file_path):
30
+ """
31
+ 计算文件的 MD5 哈希值。
32
+ """
33
+ try:
34
+ with open(file_path, 'rb') as f:
35
+ return hashlib.md5(f.read()).hexdigest()
36
+ except Exception as e:
37
+ log_print(f"Error calculating hash for {file_path}: {e}")
38
+ return None
39
+
40
+
41
+ def compare_and_sync_directories(source_dir, target_dir, whitelist=None, blacklist=None):
42
+ """
43
+ 比较 source_dir 和 target_dir 的文件哈希值。
44
+ 如果不一致,将 source_dir 的文件同步到 target_dir。
45
+ 返回需要上传的文件列表。
46
+ """
47
+ files_to_upload = []
48
+
49
+ def should_include_path(path, whitelist, blacklist):
50
+ """
51
+ 检查路径是否应包含在内。
52
+ """
53
+ if whitelist:
54
+ if not any(path.startswith(item.rstrip("/")) for item in whitelist):
55
+ return False
56
+ if blacklist:
57
+ if any(path.startswith(item.rstrip("/")) for item in blacklist):
58
+ return False
59
+ return True
60
+
61
+ def walk_and_filter(root_dir, rel_path="", whitelist=None, blacklist=None):
62
+ """
63
+ 遍历目录并根据黑白名单过滤文件。
64
+ """
65
+ full_path = os.path.join(root_dir, rel_path)
66
+ if not os.path.exists(full_path):
67
+ return []
68
+
69
+ filtered_files = []
70
+ try:
71
+ entries = os.listdir(full_path)
72
+ for entry in entries:
73
+ entry_rel_path = os.path.join(rel_path, entry)
74
+ entry_full_path = os.path.join(full_path, entry)
75
+
76
+ if not should_include_path(entry_rel_path, whitelist, blacklist):
77
+ continue
78
+
79
+ if os.path.isdir(entry_full_path):
80
+ filtered_files.extend(walk_and_filter(root_dir, entry_rel_path, whitelist, blacklist))
81
+ else:
82
+ filtered_files.append(entry_rel_path)
83
+ except Exception as e:
84
+ log_print(f"Error processing directory {full_path}: {e}")
85
+ return filtered_files
86
+
87
+ source_files = {}
88
+ if os.path.exists(source_dir):
89
+ filtered_source_files = walk_and_filter(source_dir, whitelist=whitelist, blacklist=blacklist)
90
+ for relative_path in filtered_source_files:
91
+ file_path = os.path.join(source_dir, relative_path)
92
+ file_hash = calculate_file_hash(file_path)
93
+ if file_hash is not None:
94
+ source_files[relative_path] = file_hash
95
+
96
+ target_files = {}
97
+ if os.path.exists(target_dir):
98
+ for root, _, files in os.walk(target_dir):
99
+ for file in files:
100
+ file_path = os.path.join(root, file)
101
+ relative_path = os.path.relpath(file_path, target_dir)
102
+ file_hash = calculate_file_hash(file_path)
103
+ if file_hash is not None:
104
+ target_files[relative_path] = file_hash
105
+
106
+ for relative_path in source_files.keys():
107
+ source_file_path = os.path.join(source_dir, relative_path)
108
+ target_file_path = os.path.join(target_dir, relative_path)
109
+ if relative_path not in target_files or target_files[relative_path] != source_files[relative_path]:
110
+ os.makedirs(os.path.dirname(target_file_path), exist_ok=True)
111
+ shutil.copy2(source_file_path, target_file_path)
112
+ files_to_upload.append(relative_path)
113
+
114
+ for relative_path in target_files.keys():
115
+ if relative_path not in source_files:
116
+ target_file_path = os.path.join(target_dir, relative_path)
117
+ os.remove(target_file_path)
118
+
119
+ return files_to_upload
120
+
121
+
122
+ def upload_files(api, repo_id, local_dir, remote_dir, files_to_upload, operations):
123
+ """
124
+ 上传本地文件到远程仓库。
125
+ """
126
+ for relative_path in files_to_upload:
127
+ local_file_path = os.path.join(local_dir, relative_path)
128
+ remote_file_path = os.path.join(remote_dir, relative_path)
129
+ operations.append(CommitOperationAdd(path_in_repo=remote_file_path, path_or_fileobj=local_file_path))
130
+
131
+
132
+ def delete_files(api, repo_id, remote_dir, files_to_delete, operations):
133
+ """
134
+ 删除远程仓库中的文件。
135
+ """
136
+ for relative_path in files_to_delete:
137
+ remote_file_path = os.path.join(remote_dir, relative_path)
138
+ operations.append(CommitOperationDelete(path_in_repo=remote_file_path))
139
+
140
+
141
+ def commit_operations(api, repo_id, operations, commit_message):
142
+ """
143
+ 统一处理 Hugging Face 的 Commit 操作。
144
+ """
145
+ try:
146
+ if operations:
147
+ api.create_commit(
148
+ repo_id=repo_id,
149
+ repo_type="dataset",
150
+ operations=operations,
151
+ commit_message=commit_message,
152
+ )
153
+ log_print("已成功更新云端版本!")
154
+ else:
155
+ log_print("当前版本已为最新,无需更新!")
156
+ except Exception as e:
157
+ log_print(f"更新提交失败: {str(e)}")
158
+
159
+
160
+ def download_files(api, repo_id, remote_dir, local_dir):
161
+ """
162
+ 从远程仓库下载文件到本地目录。
163
+ """
164
+ try:
165
+ remote_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
166
+ filtered_files = [file for file in remote_files if file.startswith(remote_dir)]
167
+
168
+ for remote_file in filtered_files:
169
+ relative_path = os.path.relpath(remote_file, remote_dir)
170
+ local_file_path = os.path.join(local_dir, relative_path)
171
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
172
+ tmp_filepath = api.hf_hub_download(
173
+ repo_id=repo_id,
174
+ filename=remote_file,
175
+ repo_type="dataset",
176
+ )
177
+ if tmp_filepath and os.path.exists(tmp_filepath):
178
+ shutil.copy2(tmp_filepath, local_file_path)
179
+ shutil.copy2(tmp_filepath, f"/tmp/{remote_file}")
180
+
181
+ log_print(f"已下载{remote_file} -> {local_file_path}")
182
+
183
+ except Exception as e:
184
+ log_print(f"下载失败: {str(e)}")
185
+
186
+
187
+ def sync_repository(api, repo_id, remote_dir, whitelist=None, blacklist=None):
188
+ """
189
+ 同步本地与远程仓库(单次执行)。
190
+ """
191
+ log_print(f"开始数据同步进程...")
192
+ source_dir = "./data"
193
+ target_dir = f"/tmp/{remote_dir}"
194
+
195
+ files_to_upload = compare_and_sync_directories(source_dir, target_dir, whitelist, blacklist)
196
+
197
+ remote_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
198
+ local_files = []
199
+ if os.path.exists(target_dir):
200
+ for root, _, files in os.walk(target_dir):
201
+ for file in files:
202
+ file_path = os.path.join(root, file)
203
+ relative_path = os.path.relpath(file_path, target_dir)
204
+ local_files.append(relative_path)
205
+ local_files_set = set(local_files)
206
+
207
+ files_to_delete = [
208
+ os.path.relpath(remote_file, remote_dir)
209
+ for remote_file in remote_files
210
+ if remote_file.startswith(remote_dir) and os.path.relpath(remote_file, remote_dir) not in local_files_set
211
+ ]
212
+
213
+ operations = []
214
+ upload_files(api, repo_id, source_dir, remote_dir, files_to_upload, operations)
215
+ delete_files(api, repo_id, remote_dir, files_to_delete, operations)
216
+
217
+ commit_operations(api, repo_id, operations, f"Sync repository: {remote_dir}")
218
+
219
+ if files_to_upload:
220
+ log_print(f"文件已上传: {files_to_upload}")
221
+ if files_to_delete:
222
+ log_print(f"文件已删除: {files_to_delete}")
223
+
224
+
225
+ if __name__ == "__main__":
226
+ action = sys.argv[1]
227
+ token = sys.argv[2]
228
+ repo_id = sys.argv[3]
229
+ remote_dir = sys.argv[4]
230
+ api = HfApi(token=token)
231
+ source_dir = "./data"
232
+
233
+ if action == "sync":
234
+ whitelist = sys.argv[5].split(",") if len(sys.argv) > 5 and sys.argv[5] not in ["", "None"] else None
235
+ blacklist = sys.argv[6].split(",") if len(sys.argv) > 6 and sys.argv[6] not in ["", "None"] else None
236
+ sync_repository(api, repo_id, remote_dir, whitelist, blacklist)
237
+ elif action == "download":
238
+ download_files(api, repo_id, remote_dir, source_dir)
239
+ EOL
240
 
 
241
  sync_data() {
242
+ SYNC_INTERVAL=${SYNC_INTERVAL:-7200} # 默认同步间隔为 7200 秒(2 小时)
243
  while true; do
244
+ python3 /tmp/hf_sync.py sync "${HF_TOKEN}" "${DATASET_ID}" "${APP_NAME}" "${WHITELIST}" "${BLACKLIST}"
245
+ sleep "${SYNC_INTERVAL}"
 
 
 
 
 
 
 
 
246
  done
247
  }
248
 
249
+ python3 /tmp/hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}" "${APP_NAME}"
 
250
 
251
+ sync_data &