Chenhao committed on
Commit
5a97508
·
1 Parent(s): 81f570c
Files changed (6) hide show
  1. .gitignore +17 -0
  2. Dockerfile +24 -0
  3. build.sh +6 -0
  4. requirements.txt +6 -0
  5. start.sh +10 -0
  6. sync.py +100 -0
.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # Ignore Python Caches
4
+ __pycache__
5
+ *.pyc
6
+
7
+ # Ignore Environment Settings
8
+ .env
9
+ .venv
10
+
11
+ # Ignore Local Databases
12
+ *.db
13
+
14
+
15
+
16
+
17
+
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# NOTE: ARGs declared before FROM are only visible to FROM instructions, and
# none of them is referenced anywhere in this file.
ARG HF_TOKEN
ARG HF_REPO_ID
ARG SYNC_INTERVAL

# Based on Alpine Linux; the available shell is /bin/ash
FROM justsong/one-api

# Port is configured through the environment
ENV PORT=7860

# /data holds the application data; /sync is where sync.py stages the archive.
# Both are created here and made world-writable: sync.py creates /sync at
# runtime, which fails if the container runs as a non-root user (the 777 modes
# below suggest that is the expected setup).
# -p keeps the step from failing when a directory already exists.
RUN mkdir -p /data/logs /sync && \
    chmod -R 777 /data /sync

WORKDIR /app
RUN chmod -R 777 /app

# Install dependencies before copying the sources so this layer is reused
# when only the scripts change. --no-cache already fetches a fresh package
# index, so a separate `apk update` is not needed.
COPY requirements.txt .
RUN apk add --no-cache python3 py3-pip && \
    pip install --no-cache-dir -r requirements.txt --break-system-packages
COPY . .

WORKDIR /data
ENTRYPOINT ["/bin/sh", "/app/start.sh"]
build.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
# Abort on the first failing command and echo each command as it runs.
set -ex

# Build the image from the current directory and tag it "one-api".
docker build --tag one-api .

# Run the image interactively, removing the container on exit and exposing
# the service on local port 7860.
docker run --rm --interactive --tty --env PORT=7860 --publish 7860:7860 one-api
6
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+
3
+ fastapi
4
+ python-dotenv
5
+ huggingface_hub
6
+ uvicorn[standard]
start.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
# Abort on the first failing command and echo each command as it runs.
set -ex

# Start the sync process in the background.
python3 /app/sync.py &

# Fixed pause before starting the server -- presumably to give the initial
# restore a head start. NOTE(review): the wait is not tied to the restore
# actually finishing.
sleep 3

# Replace the shell with one-api instead of running it as a child, so that
# signals sent to the container's main process reach one-api directly.
exec /one-api
10
+
sync.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import tarfile
4
+ from dotenv import load_dotenv
5
+ from huggingface_hub import CommitScheduler, HfApi
6
+ import logging
7
+ from pathlib import Path
8
+
9
# Load environment variables (including any found in a .env file)
load_dotenv()

# Global configuration
REPO_ID = os.getenv('HF_REPO_ID')
# Sync interval in minutes. `or 5` also covers a variable that is set but
# empty, which would otherwise make int('') raise at import time.
SYNC_INTERVAL = int(os.getenv('SYNC_INTERVAL') or 5)
DATA_PATH = "/data"
ARCHIVE_NAME = "data.tar.gz"
SYNC_PATH = "/sync"  # directory watched by CommitScheduler
ARCHIVE_PATH = f"{SYNC_PATH}/{ARCHIVE_NAME}"

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Fail fast when the target repository is not configured
if not REPO_ID:
    raise ValueError("HF_REPO_ID must be set in environment variables")
30
+
31
def tar_filter(tarinfo, path=None):
    """Strip the leading ``data/`` directory from an archive member's name.

    Intended as the ``filter`` callable for ``TarFile.extractall``, which
    invokes it as ``filter(member, path)``. The previous one-argument
    signature made every such call raise ``TypeError``.

    Args:
        tarinfo: The ``tarfile.TarInfo`` being processed. Its ``name`` is
            rewritten in place when it starts with ``data/``.
        path: Destination directory supplied by ``extractall``; unused.
            Optional so existing one-argument calls keep working.

    Returns:
        The same ``TarInfo`` object.
    """
    prefix = 'data/'
    if tarinfo.name.startswith(prefix):
        tarinfo.name = tarinfo.name[len(prefix):]
    return tarinfo
36
+
37
def download_and_extract():
    """Restore DATA_PATH from the archive stored in the dataset repo.

    Downloads ARCHIVE_NAME from REPO_ID into SYNC_PATH and unpacks it into
    DATA_PATH. The operation is best-effort: neither a failed download nor a
    failed extraction is raised to the caller. DATA_PATH exists on return in
    every case.
    """
    # Make sure the data directory exists whatever happens below.
    Path(DATA_PATH).mkdir(parents=True, exist_ok=True)

    api = HfApi()
    try:
        # Download the archive
        logger.info("Downloading data archive...")
        api.hf_hub_download(
            repo_id=REPO_ID,
            filename=ARCHIVE_NAME,
            repo_type="dataset",
            local_dir=SYNC_PATH
        )
    except Exception as e:
        # Expected on a first run, when no archive has been uploaded yet.
        logger.info(f"No existing archive found or download failed: {e}")
        return

    try:
        # Unpack into the data directory
        logger.info("Extracting archive...")
        with tarfile.open(ARCHIVE_PATH, "r:gz") as tar:
            tar.extractall(path=DATA_PATH, filter=tar_filter)
    except Exception:
        # The archive was downloaded but could not be restored. Report it as
        # an error with a traceback instead of as "no archive found".
        logger.exception(
            "Failed to extract %s into %s", ARCHIVE_PATH, DATA_PATH
        )
59
+
60
def create_archive():
    """Pack DATA_PATH into a gzip-compressed tarball at ARCHIVE_PATH.

    Any existing archive is overwritten. Inside the tarball the directory is
    stored under the top-level name ``data``.
    """
    logger.info("Creating new archive...")
    with tarfile.open(name=ARCHIVE_PATH, mode="w:gz") as archive:
        archive.add(name=DATA_PATH, arcname="data")
    logger.info("Archive created")
66
+
67
def main():
    """Restore the data directory, then archive it periodically for upload.

    Creates SYNC_PATH, restores DATA_PATH from the dataset repo, and starts a
    CommitScheduler that pushes SYNC_PATH to REPO_ID every SYNC_INTERVAL
    minutes. It then loops forever, rebuilding the archive once per interval.
    The scheduler is stopped whenever the loop exits.
    """
    logger.info(f"Starting sync process for repo: {REPO_ID}")
    logger.info(f"Sync interval: {SYNC_INTERVAL} minutes")

    # Create the sync directory
    Path(SYNC_PATH).mkdir(parents=True, exist_ok=True)

    # Initial download and extraction
    download_and_extract()

    # Create the scheduler
    scheduler = CommitScheduler(
        repo_id=REPO_ID,
        repo_type="dataset",
        folder_path=SYNC_PATH,
        path_in_repo="",  # put the archive at the repo root
        every=SYNC_INTERVAL,
        squash_history=True,
        private=True
    )

    # Main loop: periodically rebuild the archive
    try:
        while True:
            # The scheduler pushes from a background thread. Holding its lock
            # while the archive is rewritten keeps a scheduled push from
            # snapshotting the folder while the file is half-written.
            with scheduler.lock:
                create_archive()
            # Wait for the next cycle
            logger.info(f"Waiting {SYNC_INTERVAL} minutes until next sync...")
            time.sleep(SYNC_INTERVAL * 60)
    except KeyboardInterrupt:
        logger.info("Stopping sync process...")
    finally:
        # Stop the scheduler on every exit path, not only on Ctrl-C.
        scheduler.stop()


if __name__ == "__main__":
    main()