# oneapi/sync.py  (author: Chenhao, commit fca1eb6)
# A version that syncs correctly in normal operation; NOTE: sync may
# misbehave when the network is unreachable (see extract_archive).
import os
import time
import tarfile
import hashlib
import shutil
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
from contextlib import contextmanager
import logging
from dotenv import load_dotenv
from huggingface_hub import CommitScheduler, HfApi
@dataclass
class Config:
    """Runtime settings for the sync service.

    The paths mirror the container layout: /data holds the live files,
    /sync is the folder uploaded by CommitScheduler, and /tmp/sync is
    scratch space for building archives.
    """

    repo_id: str        # Hugging Face dataset repo, e.g. "user/name"
    sync_interval: int  # minutes between change checks / scheduled commits
    data_path: Path     # live data directory
    sync_path: Path     # directory the CommitScheduler uploads
    tmp_path: Path      # scratch dir for in-progress archives
    archive_name: str   # tarball file name inside sync_path

    @classmethod
    def from_env(cls):
        """Build a Config from environment variables (a .env file is honored).

        Raises:
            ValueError: if HF_REPO_ID is unset or empty.
        """
        load_dotenv()
        repo = os.getenv('HF_REPO_ID')
        if not repo:
            raise ValueError("HF_REPO_ID must be set")
        interval = int(os.getenv('SYNC_INTERVAL', '5'))
        return cls(
            repo_id=repo,
            sync_interval=interval,
            data_path=Path("/data"),
            sync_path=Path("/sync"),
            tmp_path=Path("/tmp/sync"),
            archive_name="data.tar.gz",
        )
class Logger:
    """Configures process-wide logging and exposes a named logger instance."""

    # Shared log line layout for every handler installed by basicConfig.
    _FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    def __init__(self):
        # basicConfig is a no-op once the root logger has handlers, so
        # constructing several Logger objects is harmless.
        logging.basicConfig(level=logging.INFO, format=self._FORMAT)
        self.logger = logging.getLogger(__name__)
class DirectoryMonitor:
    """Detects content changes in a directory tree via a SHA-256 digest."""

    def __init__(self, path: Path):
        self.path = path
        # Digest observed on the previous check; None until the first call.
        self.last_hash: Optional[str] = None

    def get_directory_hash(self) -> str:
        """Return a SHA-256 digest over every file's relative path and bytes.

        Files are visited in sorted order so identical trees always yield
        identical digests.
        """
        digest = hashlib.sha256()
        ordered = sorted(str(p) for p in self.path.rglob('*') if p.is_file())
        for entry in ordered:
            digest.update(os.path.relpath(entry, self.path).encode())
            with open(entry, 'rb') as handle:
                while chunk := handle.read(4096):
                    digest.update(chunk)
        return digest.hexdigest()

    def has_changes(self) -> bool:
        """Return True and remember the new state when the tree changed."""
        snapshot = self.get_directory_hash()
        if snapshot == self.last_hash:
            return False
        self.last_hash = snapshot
        return True
class ArchiveManager:
    """Creates and restores the tar.gz archive mirroring the data directory."""

    def __init__(self, config: "Config", logger: "Logger"):
        self.config = config
        self.logger = logger.logger

    @contextmanager
    def safe_archive(self):
        """Yield a tarfile opened on a temp path; publish it only on success.

        The archive is written under tmp_path and moved into sync_path after
        it is complete, so the CommitScheduler never uploads a half-written
        file. On failure the partial temp file is removed.
        """
        self.config.tmp_path.mkdir(parents=True, exist_ok=True)
        tmp_archive = self.config.tmp_path / self.config.archive_name
        try:
            with tarfile.open(tmp_archive, "w:gz") as tar:
                yield tar
            # Publish only after the tar handle is closed and fully flushed.
            self.config.sync_path.mkdir(parents=True, exist_ok=True)
            shutil.move(tmp_archive, self.config.sync_path / self.config.archive_name)
        finally:
            # Clean up a failed build (no-op after a successful move).
            if tmp_archive.exists():
                tmp_archive.unlink()

    def create_archive(self):
        """Pack the whole data directory into sync_path/<archive_name>."""
        self.logger.info("Creating new archive...")
        with self.safe_archive() as tar:
            tar.add(self.config.data_path, arcname="data")
        self.logger.info("Archive created")

    def extract_archive(self):
        """Download the remote archive (if any) and unpack it into data_path."""
        api = HfApi()
        try:
            self.logger.info("Downloading data archive...")
            api.hf_hub_download(
                repo_id=self.config.repo_id,
                filename=self.config.archive_name,
                repo_type="dataset",
                local_dir=self.config.sync_path
            )
            self.logger.info("Extracting archive...")
            archive_path = self.config.sync_path / self.config.archive_name
            with tarfile.open(archive_path, "r:gz") as tar:
                tar.extractall(
                    path=self.config.data_path,
                    filter=self._tar_filter
                )
        except Exception as e:
            # BUGFIX: was logged at INFO, which made a transient network
            # failure indistinguishable from "no archive yet". NOTE(review):
            # the service still continues with an empty data_path in that
            # case and may later push an empty archive over real remote
            # data — consider distinguishing EntryNotFoundError from
            # network errors upstream.
            self.logger.warning(f"No existing archive found or download failed: {e}")
            self.config.data_path.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _tar_filter(tarinfo, path):
        """Keep only members under data/, stripping that prefix; drop the rest."""
        if tarinfo.name.startswith('data/'):
            tarinfo.name = tarinfo.name[5:]
            return tarinfo
        else:
            return None
class SyncService:
    """Watches the data directory and keeps a HF dataset repo in sync."""

    def __init__(self, config: "Config", logger: "Logger"):
        self.config = config
        self.logger = logger.logger
        self.monitor = DirectoryMonitor(config.data_path)
        self.archive_manager = ArchiveManager(config, logger)

    def run(self):
        """Run forever: restore remote data, then re-archive on local change.

        CommitScheduler uploads sync_path every sync_interval minutes from a
        background thread; this loop only refreshes the archive it uploads.
        """
        self.logger.info(f"Starting sync process for repo: {self.config.repo_id}")
        self.logger.info(f"Sync interval: {self.config.sync_interval} minutes")
        # Initialize directories and pull any existing remote data first.
        self.config.sync_path.mkdir(parents=True, exist_ok=True)
        self.archive_manager.extract_archive()
        scheduler = CommitScheduler(
            repo_id=self.config.repo_id,
            repo_type="dataset",
            folder_path=str(self.config.sync_path),
            path_in_repo="",
            every=self.config.sync_interval,
            squash_history=True,
            private=True
        )
        try:
            while True:
                if self.monitor.has_changes():
                    self.logger.info("Directory changes detected, creating new archive...")
                    self.archive_manager.create_archive()
                else:
                    self.logger.info("No changes detected")
                self.logger.info(f"Waiting {self.config.sync_interval} minutes until next check...")
                time.sleep(self.config.sync_interval * 60)
        except KeyboardInterrupt:
            self.logger.info("Stopping sync process...")
        finally:
            # BUGFIX: previously stop() ran only on KeyboardInterrupt, so any
            # other exception leaked the scheduler's background thread.
            scheduler.stop()
def main():
    """Entry point: build configuration and logging, then run the sync loop."""
    service = SyncService(Config.from_env(), Logger())
    service.run()
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()