# oneapi/sync.py
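"""Synchronize a local data directory with a Hugging Face dataset repo.

The service restores /data from an archive stored in the dataset repo
("init"), then watches /data for changes and re-packs it into /sync as
data.tar.gz, which huggingface_hub's CommitScheduler pushes to the Hub
on a fixed interval ("sync"). See --mode for the run modes.

Environment: HF_DATASET_REPO_ID (required), SYNC_INTERVAL in minutes
(optional, default 5).
"""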
import os
import time
import tarfile
import hashlib
import shutil
import argparse
import sys
from enum import Enum, auto
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
from contextlib import contextmanager
import logging
from dotenv import load_dotenv
from huggingface_hub import CommitScheduler, HfApi
class SyncMode(Enum):
    INIT_ONLY = auto()  # run initialization only
    SYNC_ONLY = auto()  # run sync only
    BOTH = auto()       # run initialization, then sync
@dataclass
class Config:
repo_id: str
sync_interval: int
data_path: Path
sync_path: Path
tmp_path: Path
archive_name: str
@classmethod
def from_env(cls):
load_dotenv()
repo_id = os.getenv('HF_DATASET_REPO_ID')
if not repo_id:
raise ValueError("HF_DATASET_REPO_ID must be set")
return cls(
repo_id=repo_id,
sync_interval=int(os.getenv('SYNC_INTERVAL', '5')),
data_path=Path("/data"),
sync_path=Path("/sync"),
tmp_path=Path("/tmp/sync"),
archive_name="data.tar.gz"
)
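# Illustrative .env consumed by Config.from_env (the repo id is a placeholder):
#   HF_DATASET_REPO_ID=your-username/your-dataset
#   SYNC_INTERVAL=5   # minutes between change checks and commits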
class Logger:
def __init__(self):
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
self.logger = logging.getLogger(__name__)
class DirectoryMonitor:
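    """Detect changes in a directory tree by hashing file paths and contents."""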
def __init__(self, path: Path):
self.path = path
self.last_hash: Optional[str] = None
def get_directory_hash(self) -> str:
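        """Fold every file's relative path and contents into one SHA-256 digest."""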
sha256_hash = hashlib.sha256()
all_files = sorted(
str(p) for p in self.path.rglob('*') if p.is_file()
)
for file_path in all_files:
rel_path = os.path.relpath(file_path, self.path)
sha256_hash.update(rel_path.encode())
with open(file_path, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
def has_changes(self) -> bool:
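        """Report whether the tree hash changed since the last check (always True on the first call)."""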
current_hash = self.get_directory_hash()
if current_hash != self.last_hash:
self.last_hash = current_hash
return True
return False
class ArchiveManager:
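    """Build the data archive in sync_path and restore it from the Hub."""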
def __init__(self, config: Config, logger: Logger):
self.config = config
self.logger = logger.logger
@contextmanager
def safe_archive(self):
"""安全地创建归档文件的上下文管理器"""
self.config.tmp_path.mkdir(parents=True, exist_ok=True)
tmp_archive = self.config.tmp_path / self.config.archive_name
try:
with tarfile.open(tmp_archive, "w:gz") as tar:
yield tar
            # Move the finished archive into place only after it was written successfully
self.config.sync_path.mkdir(parents=True, exist_ok=True)
shutil.move(tmp_archive, self.config.sync_path / self.config.archive_name)
finally:
            # Remove the temp file if it is still there (i.e. the move never happened)
if tmp_archive.exists():
tmp_archive.unlink()
    def create_archive(self):
        """Pack the data directory into a gzipped tarball under sync_path."""
        self.logger.info("Creating new archive...")
        with self.safe_archive() as tar:
            # arcname="data" prefixes every member with 'data/'; _tar_filter
            # strips that prefix again when the archive is extracted.
            tar.add(self.config.data_path, arcname="data")
        self.logger.info("Archive created")
def extract_archive(self):
"""解压现有数据"""
api = HfApi()
try:
self.logger.info("Downloading data archive...")
api.hf_hub_download(
repo_id=self.config.repo_id,
filename=self.config.archive_name,
repo_type="dataset",
local_dir=self.config.sync_path
)
self.logger.info("Extracting archive...")
archive_path = self.config.sync_path / self.config.archive_name
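            # Passing a callable as `filter` needs a Python with tarfile
            # extraction filters (PEP 706: 3.12+, or a backported patch release).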
with tarfile.open(archive_path, "r:gz") as tar:
tar.extractall(
path=self.config.data_path,
filter=self._tar_filter
)
return True
        except Exception as e:
            self.logger.error(f"No existing archive found or download failed: {e}")
            # Fall back to an empty data directory when nothing could be restored
            self.config.data_path.mkdir(parents=True, exist_ok=True)
            return False
    @staticmethod
    def _tar_filter(tarinfo, path):
        """Extraction filter: remap 'data/<name>' members to '<name>', drop everything else."""
        if tarinfo.name.startswith('data/'):
            tarinfo.name = tarinfo.name[5:]
            return tarinfo
        # Returning None skips the member (including the top-level 'data' dir entry)
        return None
class SyncService:
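    """Orchestrate initialization and the periodic archive-and-commit loop."""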
def __init__(self, config: Config, logger: Logger):
self.config = config
self.logger = logger.logger
self.monitor = DirectoryMonitor(config.data_path)
self.archive_manager = ArchiveManager(config, logger)
def init(self) -> bool:
"""
执行初始化操作
返回: 是否成功初始化
"""
try:
self.logger.info("Starting initialization...")
self.config.sync_path.mkdir(parents=True, exist_ok=True)
success = self.archive_manager.extract_archive()
if success:
self.logger.info("Initialization completed successfully")
else:
self.logger.warning("Initialization completed with warnings")
return success
except Exception as e:
self.logger.error(f"Initialization failed: {e}")
return False
def sync(self):
"""执行持续同步操作"""
self.logger.info(f"Starting sync process for repo: {self.config.repo_id}")
self.logger.info(f"Sync interval: {self.config.sync_interval} minutes")
scheduler = CommitScheduler(
repo_id=self.config.repo_id,
repo_type="dataset",
folder_path=str(self.config.sync_path),
path_in_repo="",
every=self.config.sync_interval,
squash_history=True,
private=True
)
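        # CommitScheduler pushes the contents of sync_path to the Hub from a
        # background thread every `sync_interval` minutes; squash_history=True
        # periodically squashes the repo's git history to keep it small.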
try:
while True:
if self.monitor.has_changes():
self.logger.info("Directory changes detected, creating new archive...")
self.archive_manager.create_archive()
else:
self.logger.info("No changes detected")
self.logger.info(f"Waiting {self.config.sync_interval} minutes until next check...")
time.sleep(self.config.sync_interval * 60)
except KeyboardInterrupt:
self.logger.info("Stopping sync process...")
scheduler.stop()
def parse_args():
parser = argparse.ArgumentParser(description='Data synchronization service')
parser.add_argument(
'--mode',
type=str,
choices=['init', 'sync', 'both'],
default='both',
help='Operation mode: init (initialization only), sync (synchronization only), both (default)'
)
return parser.parse_args()
def main():
args = parse_args()
config = Config.from_env()
logger = Logger()
service = SyncService(config, logger)
mode = {
'init': SyncMode.INIT_ONLY,
'sync': SyncMode.SYNC_ONLY,
'both': SyncMode.BOTH
}[args.mode]
if mode in (SyncMode.INIT_ONLY, SyncMode.BOTH):
success = service.init()
if not success:
sys.exit(1)
if mode == SyncMode.INIT_ONLY:
return
if mode in (SyncMode.SYNC_ONLY, SyncMode.BOTH):
service.sync()
if __name__ == "__main__":
main()