Chenhao
committed on
Commit
·
5a97508
1
Parent(s):
81f570c
init
Browse files- .gitignore +17 -0
- Dockerfile +24 -0
- build.sh +6 -0
- requirements.txt +6 -0
- start.sh +10 -0
- sync.py +100 -0
.gitignore
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
# Ignore Python Caches
|
4 |
+
__pycache__
|
5 |
+
*.pyc
|
6 |
+
|
7 |
+
# Ignore Environment Settings
|
8 |
+
.env
|
9 |
+
.venv
|
10 |
+
|
11 |
+
# Ignore Local Databases
|
12 |
+
*.db
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
|
Dockerfile
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Based on Alpine Linux, so the shell is /bin/ash (POSIX sh compatible).
FROM justsong/one-api

# NOTE: an ARG declared before FROM is only visible in the FROM line itself;
# the original declarations there were silently unusable. Declare the build
# args after FROM so they are actually in scope.
ARG HF_TOKEN
ARG HF_REPO_ID
ARG SYNC_INTERVAL

# Expose the build args to the running container (sync.py reads them via
# os.getenv). NOTE(review): this bakes the values into image layers — fine
# for a private HF Space (which injects secrets at runtime anyway), but do
# not publish this image with a real HF_TOKEN.
ENV HF_TOKEN=$HF_TOKEN \
    HF_REPO_ID=$HF_REPO_ID \
    SYNC_INTERVAL=$SYNC_INTERVAL

# Port the one-api server listens on.
ENV PORT=7860

RUN mkdir /data/logs
RUN chmod -R 777 /data

WORKDIR /app
RUN chmod -R 777 /app
COPY . .
RUN apk update && \
    apk add --no-cache python3 py3-pip && \
    pip install -r requirements.txt --break-system-packages

WORKDIR /data
ENTRYPOINT ["/bin/sh", "/app/start.sh"]
|
build.sh
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
#!/bin/sh
# Build the one-api image and run it locally for a quick smoke test.
set -ex

docker build -t one-api .
docker run --rm -it -e PORT=7860 -p 7860:7860 one-api
|
6 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
fastapi
|
4 |
+
python-dotenv
|
5 |
+
huggingface_hub
|
6 |
+
uvicorn[standard]
|
start.sh
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
#!/bin/sh
# Container entrypoint: start the HF dataset sync loop, then the server.
set -ex

# Run the sync/restore loop in the background.
python3 /app/sync.py &

# Give sync.py a moment to restore /data before the server opens it.
sleep 3

# Run the one-api server in the foreground (keeps the container alive).
/one-api
|
10 |
+
|
sync.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import tarfile
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
from huggingface_hub import CommitScheduler, HfApi
|
6 |
+
import logging
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
# Load variables from a local .env file, if one exists.
load_dotenv()

# Global configuration.
REPO_ID = os.getenv('HF_REPO_ID')
SYNC_INTERVAL = int(os.getenv('SYNC_INTERVAL', 5))  # minutes
DATA_PATH = "/data"
ARCHIVE_NAME = "data.tar.gz"
SYNC_PATH = "/sync"  # directory watched by CommitScheduler
ARCHIVE_PATH = f"{SYNC_PATH}/{ARCHIVE_NAME}"

# Logging setup.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Fail fast when the target repo is not configured.
if not REPO_ID:
    raise ValueError("HF_REPO_ID must be set in environment variables")
|
30 |
+
|
31 |
+
def tar_filter(tarinfo, path=None):
    """Strip the leading ``data/`` prefix from archive member names.

    Used as the ``filter=`` callback of ``TarFile.extractall`` so an archive
    created with ``arcname="data"`` unpacks directly into the destination
    directory instead of into a nested ``data/`` folder.

    Args:
        tarinfo: The ``tarfile.TarInfo`` member being extracted.
        path: Destination path. On Python versions with PEP 706 extraction
            filters (3.12+, and security backports) the filter is invoked as
            ``filter(member, path)`` — the original one-argument signature
            raised ``TypeError`` there. Accepted for compatibility; unused.

    Returns:
        The (possibly renamed) ``TarInfo``, so the member is extracted.
    """
    if tarinfo.name.startswith('data/'):
        # 'data/foo' -> 'foo'
        tarinfo.name = tarinfo.name[5:]
    elif tarinfo.name == 'data':
        # The archive root itself: map to '.' so extraction does not
        # create a stray empty 'data/' directory inside the destination.
        tarinfo.name = '.'
    return tarinfo
|
36 |
+
|
37 |
+
def download_and_extract():
    """Fetch the latest data archive from the HF dataset repo and unpack it.

    Best-effort: on a first run (no archive in the repo yet) or any
    download/extract failure, log a warning and fall back to an empty
    DATA_PATH so the server can still start fresh.
    """
    api = HfApi()
    try:
        logger.info("Downloading data archive...")
        api.hf_hub_download(
            repo_id=REPO_ID,
            filename=ARCHIVE_NAME,
            repo_type="dataset",
            local_dir=SYNC_PATH
        )

        logger.info("Extracting archive...")
        # tar_filter strips the leading 'data/' prefix so the archive
        # contents land directly in DATA_PATH.
        with tarfile.open(ARCHIVE_PATH, "r:gz") as tar:
            tar.extractall(path=DATA_PATH, filter=tar_filter)

    except Exception as e:
        # Deliberately broad: any failure here just means "start fresh".
        # Logged at WARNING (not INFO as originally) since it is a fallback.
        logger.warning(f"No existing archive found or download failed: {e}")
        # Make sure the data directory exists for the server.
        Path(DATA_PATH).mkdir(parents=True, exist_ok=True)
|
59 |
+
|
60 |
+
def create_archive():
    """Pack the whole DATA_PATH tree into ARCHIVE_PATH as a gzip tarball.

    Members are rooted under 'data/' (arcname), matching what
    download_and_extract() expects to strip when restoring.
    """
    logger.info("Creating new archive...")
    archive = tarfile.open(ARCHIVE_PATH, "w:gz")
    try:
        archive.add(DATA_PATH, arcname="data")
    finally:
        archive.close()
    logger.info("Archive created")
|
66 |
+
|
67 |
+
def main():
    """Entry point: restore data from HF, then periodically re-archive it.

    A CommitScheduler watches SYNC_PATH and pushes its contents to the
    dataset repo every SYNC_INTERVAL minutes; this loop refreshes the
    archive in that folder on the same cadence.
    """
    logger.info(f"Starting sync process for repo: {REPO_ID}")
    logger.info(f"Sync interval: {SYNC_INTERVAL} minutes")

    # Directory the scheduler uploads from.
    Path(SYNC_PATH).mkdir(parents=True, exist_ok=True)

    # Restore previous state (best-effort) before the server touches /data.
    download_and_extract()

    scheduler = CommitScheduler(
        repo_id=REPO_ID,
        repo_type="dataset",
        folder_path=SYNC_PATH,
        path_in_repo="",  # put the archive at the repo root
        every=SYNC_INTERVAL,
        squash_history=True,  # keep the dataset repo small
        private=True
    )

    # Main loop: periodically rebuild the archive for the scheduler to push.
    try:
        while True:
            # Hold the scheduler's lock while rewriting the archive so a
            # scheduled push never uploads a half-written file (per the
            # huggingface_hub CommitScheduler docs). The original wrote
            # without the lock, racing the background upload.
            with scheduler.lock:
                create_archive()
            logger.info(f"Waiting {SYNC_INTERVAL} minutes until next sync...")
            time.sleep(SYNC_INTERVAL * 60)
    except KeyboardInterrupt:
        logger.info("Stopping sync process...")
    finally:
        # Flush any pending commit and stop the background thread on ANY
        # exit path (originally only on KeyboardInterrupt).
        scheduler.stop()
|
98 |
+
|
99 |
+
if __name__ == "__main__":
    # Run the sync loop when executed as a script.
    main()
|