import asyncio
import base64
import datetime
import json
import logging
import os
import re
import shutil
import subprocess
import time
from typing import Optional

import requests
from fake_useragent import UserAgent
from fastapi import FastAPI
from huggingface_hub import HfApi, hf_hub_download, login

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class Config:
    """Configuration holder."""
    HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
    CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
    LOG_FILE = "civitai_backup.log"
    LIST_FILE = "model_list.log"
    REPO_IDS = {
        "log": "ttttdiva/CivitAI_log_test",
        "model_list": "ttttdiva/CivitAI_model_info_test",
        "current": ""
    }
    URLS = {
        "latest": "https://civitai.com/api/v1/models?sort=Newest",
        "modelPage": "https://civitai.com/models/",
        "modelId": "https://civitai.com/api/v1/models/",
        "modelVersionId": "https://civitai.com/api/v1/model-versions/",
        "hash": "https://civitai.com/api/v1/model-versions/by-hash/"
    }
    JST = datetime.timezone(datetime.timedelta(hours=9))
    UA = UserAgent()
    HEADERS = {
        'Authorization': f'Bearer {CIVITAI_API_TOKEN}',
        'User-Agent': 'civitai-crawler/1.0',
        "Content-Type": "application/json"
    }

    # Additional settings for rclone
    RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
    ENCRYPTED_DIR = "/home/user/app/encrypted"


class CivitAICrawler:
    """Downloads models from CivitAI and uploads them to Hugging Face."""

    def __init__(self, config: Config):
        self.config = config
        self.api = HfApi()
        self.app = FastAPI()
        self.repo_ids = self.config.REPO_IDS.copy()
        self.jst = self.config.JST

        # Load the rclone configuration
        self.setup_rclone_conf()

        self.setup_routes()

    def setup_routes(self):
        @self.app.get("/")
        def read_root():
            now = str(datetime.datetime.now(self.jst))
            description = (
                f"This Space periodically crawls CivitAI and backs up new models to {self.repo_ids['current']}.\n"
                f"model_list.log and civitai_backup.log are uploaded without encryption.\n"
                f"Model folders and files are encrypted before upload.\n"
                f"Status: {now} + currently running :D\n"
            )
            return description

        @self.app.on_event("startup")
        async def startup_event():
            asyncio.create_task(self.crawl())

    # ============================================================================
    # rclone configuration & encrypted upload handling
    # ============================================================================
    def setup_rclone_conf(self):
        if not self.config.RCLONE_CONF_BASE64:
            logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
            return
        os.makedirs(".rclone_config", exist_ok=True)
        conf_path = os.path.join(".rclone_config", "rclone.conf")
        with open(conf_path, "wb") as f:
            f.write(base64.b64decode(self.config.RCLONE_CONF_BASE64))
        os.environ["RCLONE_CONFIG"] = conf_path
        logger.info(f"[INFO] rclone.conf created at: {conf_path}")
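    # Illustrative sketch of the rclone.conf this class assumes (before base64
    # encoding). The exact layout is an assumption; the only hard requirement
    # in this code is a crypt remote named "cryptLocal" whose encrypted output
    # lands in Config.ENCRYPTED_DIR:
    #
    #   [cryptLocal]
    #   type = crypt
    #   remote = /home/user/app/encrypted
    #   password = <output of `rclone obscure <your password>`>
    #   filename_encryption = standard
    #   directory_name_encryption = true
    #
    # Encode it (e.g. `base64 -w0 rclone.conf`) and set RCLONE_CONF_BASE64.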
    def encrypt_with_rclone(self, local_path: str):
        """Copy a folder or file to cryptLocal:, encrypting folder and file names."""
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")

        # Clean out the encryption target directory beforehand
        if os.path.isdir(self.config.ENCRYPTED_DIR):
            shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)

        top_level_name = os.path.basename(local_path.rstrip("/")) or "unnamed"
        cmd = ["rclone", "copy", local_path, f"cryptLocal:{top_level_name}", "-v"]
        logger.info(f"[INFO] Running: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        logger.info(f"[OK] rclone copy => cryptLocal:{top_level_name}")

        if not os.path.isdir(self.config.ENCRYPTED_DIR):
            raise FileNotFoundError(
                f"[ERROR] {self.config.ENCRYPTED_DIR} not found. Check your rclone config."
            )

    def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
        """Recursively upload the encrypted files under self.config.ENCRYPTED_DIR."""
        max_retries = 5
        for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
            for fn in files:
                encrypted_file_path = os.path.join(root, fn)
                if not os.path.isfile(encrypted_file_path):
                    continue
                relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
                upload_path_in_repo = os.path.join(base_path_in_repo, relative_path)

                attempt = 0
                while attempt < max_retries:
                    try:
                        self.api.upload_file(
                            path_or_fileobj=encrypted_file_path,
                            repo_id=repo_id,
                            path_in_repo=upload_path_in_repo
                        )
                        logger.info(f"[OK] Uploaded => {repo_id}/{upload_path_in_repo}")
                        break
                    except Exception as e:
                        attempt += 1
                        error_message = str(e)

                        # 429 rate limit with "in XX minutes"
                        if "rate-limited" in error_message and "minutes" in error_message:
                            match = re.search(r"in (\d+) minutes?", error_message)
                            if match:
                                minutes = int(match.group(1)) + 1
                                logger.warning(f"Rate-limited. Waiting {minutes} minutes...")
                                time.sleep(minutes * 60)
                                attempt -= 1  # do not count the wait as a failed attempt
                                continue

                        # Wait one hour
                        if "you can retry this action in about 1 hour" in error_message:
                            logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
                            time.sleep(3600)
                            attempt -= 1
                            continue

                        # Repository file limit
                        if "over the limit of 100000 files" in error_message:
                            logger.warning("Repository file limit exceeded. Creating a new repository...")
                            self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                            self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                            attempt = 0
                            repo_id = self.repo_ids['current']
                            continue

                        if attempt < max_retries:
                            logger.warning(
                                f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
                            )
                        else:
                            logger.error(f"Failed to upload after {max_retries} attempts: {encrypted_file_path}")
                            raise

    def upload_folder_encrypted(self, folder_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
        """Encrypt and upload an entire folder (the folder name is encrypted as well)."""
        if not repo_id:
            repo_id = self.repo_ids['current']

        self.encrypt_with_rclone(folder_path)
        self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)

        # Remove the encrypted folder
        if os.path.isdir(self.config.ENCRYPTED_DIR):
            shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)

    # ============================================================================
    # Encrypt-upload a single file, then delete it locally (for old_versions)
    # ============================================================================
    def upload_file_encrypted_one_by_one(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
        """
        Encrypt-upload a single file, then delete the local file once the upload finishes.
        """
        if not repo_id:
            repo_id = self.repo_ids['current']

        self.encrypt_with_rclone(file_path)
        self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)

        # Remove the encrypted directory
        if os.path.isdir(self.config.ENCRYPTED_DIR):
            shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)

        # Remove the actual local file
        if os.path.exists(file_path):
            os.remove(file_path)

    @staticmethod
    def increment_repo_name(repo_id: str) -> str:
        match = re.search(r'(\d+)$', repo_id)
        if match:
            number = int(match.group(1)) + 1
            return re.sub(r'\d+$', str(number), repo_id)
        else:
            return f"{repo_id}1"
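    # Examples of increment_repo_name (derived from the regex above;
    # repo IDs are hypothetical):
    #   "ttttdiva/CivitAI_backup_2" -> "ttttdiva/CivitAI_backup_3"
    #   "ttttdiva/CivitAI_backup"   -> "ttttdiva/CivitAI_backup1"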
    # ============================================================================
    # Logs and model_list.log are uploaded unencrypted
    # ============================================================================
    def upload_file_raw(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
        if repo_id is None:
            repo_id = self.repo_ids['current']
        if path_in_repo is None:
            path_in_repo = os.path.basename(file_path)

        max_retries = 5
        attempt = 0
        while attempt < max_retries:
            try:
                self.api.upload_file(
                    path_or_fileobj=file_path,
                    repo_id=repo_id,
                    path_in_repo=path_in_repo
                )
                logger.info(f"[OK] Uploaded {file_path} => {repo_id}/{path_in_repo}")
                return
            except Exception as e:
                attempt += 1
                error_message = str(e)
                if "over the limit of 100000 files" in error_message:
                    logger.warning("Repository file limit exceeded, creating a new repository.")
                    self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                    self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                    attempt = 0
                    repo_id = self.repo_ids['current']
                    continue
                elif "you can retry this action in about 1 hour" in error_message:
                    logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
                    time.sleep(3600)
                    attempt -= 1
                else:
                    if attempt < max_retries:
                        logger.warning(f"Failed to upload raw file {file_path}, retry {attempt}/{max_retries}...")
                    else:
                        logger.error(f"Failed to upload raw file after {max_retries} attempts: {file_path}")
                        raise

    # ============================================================================
    # Download handling
    # ============================================================================
    @staticmethod
    def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
        if content_disposition:
            parts = content_disposition.split(';')
            for part in parts:
                if "filename=" in part:
                    # split on the first '=' only, in case the name itself contains one
                    return part.split("=", 1)[1].strip().strip('"')
        return default_name

    def download_file(self, url: str, destination_folder: str, default_name: str):
        try:
            response = requests.get(url, headers=self.config.HEADERS, stream=True)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Failed to download file from {url}: {e}")
            return None

        filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
        file_path = os.path.join(destination_folder, filename)

        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        logger.info(f"Download completed: {file_path}")
        return file_path

    # ============================================================================
    # Upload old versions only, one file at a time
    # ============================================================================
    def download_old_versions_one_by_one(self, version_list: list, folder: str):
        """For version_list[1:], repeat: download one model file -> upload -> delete."""
        if len(version_list) <= 1:
            return

        old_versions_folder = os.path.join(folder, "old_versions")
        os.makedirs(old_versions_folder, exist_ok=True)

        for version in version_list[1:]:
            for file_info in version.get("files", []):
                download_url = file_info["downloadUrl"]
                file_name = file_info["name"]
                local_path = self.download_file(download_url, old_versions_folder, file_name)
                if not local_path or not os.path.exists(local_path):
                    logger.error(f"Failed to download or file not found: {file_name}")
                    continue

                # Upload one file, then delete it.
                # An empty path_in_repo means the folder name is encrypted too (top level);
                # to group files in a subfolder, pass e.g. "old_versions" instead.
                self.upload_file_encrypted_one_by_one(local_path, path_in_repo="")

        # The old_versions folder should be empty by now, so remove it
        if os.path.exists(old_versions_folder):
            shutil.rmtree(old_versions_folder, ignore_errors=True)
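    # Illustrative example for get_filename_from_cd above (hypothetical header):
    #   Content-Disposition: attachment; filename="some_model.safetensors"
    #   get_filename_from_cd(...) -> "some_model.safetensors"
    # If the header is absent or has no filename= part, default_name is returned.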
    # ============================================================================
    # As before: download the latest version's files + "images" folder in bulk
    # ============================================================================
    def download_model(self, model_versions: list, folder: str):
        """Download the latest version in bulk (collected into one folder)."""
        latest_version = model_versions[0]
        latest_files = latest_version["files"]
        for file_info in latest_files:
            download_url = file_info["downloadUrl"]
            file_name = file_info["name"]
            local_path = self.download_file(download_url, folder, file_name)
            if local_path and os.path.exists(local_path):
                logger.info(f"Downloaded {file_name}")
            else:
                logger.warning(f"Could not download {file_name}")

    def download_images(self, model_versions: list, folder: str):
        images_folder = os.path.join(folder, "images")
        os.makedirs(images_folder, exist_ok=True)

        images = []
        for version in model_versions:
            for img in version.get("images", []):
                images.append(img["url"])

        for image_url in images:
            image_name = os.path.basename(image_url) + ".png"
            local_path = os.path.join(images_folder, image_name)
            try:
                resp = requests.get(image_url, stream=True)
                resp.raise_for_status()
                with open(local_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
                logger.info(f"Downloaded image: {local_path}")
            except Exception as e:
                logger.error(f"Error downloading image {image_url}: {e}")

    def save_html_content(self, url: str, folder: str):
        try:
            response = requests.get(url)
            response.raise_for_status()
            html_path = os.path.join(folder, os.path.basename(folder) + ".html")
            with open(html_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
        except Exception as e:
            logger.error(f"Error saving HTML content for URL {url}: {e}")

    @staticmethod
    def save_model_info(model_info: dict, folder: str):
        with open(os.path.join(folder, "model_info.json"), "w", encoding="utf-8") as file:
            json.dump(model_info, file, indent=2)

    # ============================================================================
    # model_list.log
    # ============================================================================
    def read_model_list(self):
        model_list = {}
        try:
            with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    parts = line.split(": ", 1)
                    if len(parts) == 2:
                        stored_name, stored_url = parts
                        model_list[stored_name] = stored_url
        except Exception as e:
            logger.error(f"Failed to read model list: {e}")
        return model_list

    # ============================================================================
    # Fetch model info
    # ============================================================================
    def get_model_info(self, model_id: str) -> dict:
        try:
            url = self.config.URLS["modelId"] + str(model_id)
            resp = requests.get(url, headers=self.config.HEADERS)
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException as e:
            logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
            return {}
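    # model_list.log stores one entry per line in the form "name: url",
    # parsed back into {name: url} by read_model_list above. Hypothetical line:
    #   Some Model Name: https://huggingface.co/ttttdiva/CivitAI_backup_1/tree/main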
    # ============================================================================
    # Main processing: latest files + images uploaded as a folder; old_versions one file at a time
    # ============================================================================
    def process_model(self, model_url: str):
        try:
            model_id = model_url.rstrip("/").split("/")[-1]
            model_info = self.get_model_info(model_id)
            if not model_info:
                logger.error(f"No model_info returned for {model_id}")
                return

            model_versions = model_info.get("modelVersions", [])
            if not model_versions:
                logger.error(f"No modelVersions in model info {model_id}")
                return

            latest_version = model_versions[0]
            model_file = next(
                (file for file in latest_version["files"] if file.get('type') == 'Model'),
                None
            )
            if model_file:
                latest_filename = model_file['name']
                folder = os.path.splitext(latest_filename)[0]
            else:
                first_file = latest_version["files"][0]
                latest_filename = first_file['name']
                folder = os.path.splitext(latest_filename)[0]
                logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")

            os.makedirs(folder, exist_ok=True)

            # Check model_list.log whether this model was already uploaded (keyed by model name)
            model_list = self.read_model_list()
            modelpage_name = model_info.get("name", f"Model_{model_id}")
            if modelpage_name in model_list:
                logger.info(f"Model '{modelpage_name}' already in model_list. Skipping.")
                # return here if needed

            # Latest version (downloaded in bulk)
            self.download_model(model_versions, folder)

            # Images (download the whole images folder)
            self.download_images(model_versions, folder)

            # HTML & model_info.json
            self.save_html_content(self.config.URLS["modelPage"] + str(model_id), folder)
            self.save_model_info(model_info, folder)

            # Old versions only: upload one at a time & delete
            self.download_old_versions_one_by_one(model_versions, folder)

            # old_versions is now empty, so all that remains in the folder is
            # the latest version's files, the images folder, model_info.json, the HTML, etc.

            # Encrypt-upload "folder" itself (= including the images folder).
            # An empty path_in_repo means the folder name is encrypted as well.
            self.upload_folder_encrypted(folder, path_in_repo="")

            # Delete the local folder
            if os.path.exists(folder):
                shutil.rmtree(folder)

            # Append to model_list.log.
            # On HF the folder name is encrypted, so record the original
            # "modelpage_name" and a URL pointing at the repo's top folder.
            model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main"
            with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
                f.write(f"{modelpage_name}: {model_hf_url}\n")

        except Exception as e:
            logger.error(f"Unexpected error processing model ({model_url}): {e}")
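    # Illustrative (abridged) shape of the CivitAI model payload consumed by
    # process_model above; field names match the lookups in this class, values
    # are made up:
    #   {
    #     "name": "Some Model",
    #     "modelVersions": [
    #       {"files": [{"name": "model_v2.safetensors", "downloadUrl": "...", "type": "Model"}],
    #        "images": [{"url": "https://image.civitai.com/.../abc.jpeg"}]},
    #       {"files": [{"name": "model_v1.safetensors", "downloadUrl": "..."}]}
    #     ]
    #   }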
    # ============================================================================
    # crawl
    # ============================================================================
    async def crawl(self):
        while True:
            try:
                login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)

                # Download the latest model_list.log & civitai_backup.log
                model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
                shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")

                local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
                shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")

                # Read the log
                with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
                    lines = file.read().splitlines()
                    old_models = json.loads(lines[0]) if len(lines) > 0 else []
                    self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

                # Check for new models
                response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                response.raise_for_status()
                latest_models = response.json().get("items", [])
                latest_model_ids = [m["id"] for m in latest_models if "id" in m]

                # Set difference
                new_models = list(set(latest_model_ids) - set(old_models))
                if new_models:
                    logger.info(f"New models found: {new_models}")
                    model_id = new_models[0]

                    for attempt in range(1, 6):
                        try:
                            self.process_model(f"{self.config.URLS['modelId']}{model_id}")
                            break
                        except Exception as e:
                            logger.error(f"Failed to process model ID {model_id} (Attempt {attempt}/5): {e}")
                            if attempt == 5:
                                logger.error(f"Skipping model ID {model_id} after 5 failed attempts.")
                            else:
                                await asyncio.sleep(2)
                else:
                    # No new models
                    with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                        f.write(json.dumps(latest_model_ids) + "\n")
                        f.write(f"{self.repo_ids['current']}\n")
                    logger.info(f"Updated log file: {self.config.LOG_FILE}")

                    self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                    logger.info("Uploaded log file to repository (unencrypted).")

                    logger.info("No new models found.")
                    await asyncio.sleep(60)
                    continue

                # After one successful upload, add the model ID to old_models
                old_models.append(model_id)
                with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                    f.write(json.dumps(old_models) + "\n")
                    f.write(f"{self.repo_ids['current']}\n")
                logger.info(f"Updated log file with new model ID: {model_id}")

                # Upload the log and model_list.log
                self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                self.upload_file_raw(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)

            except Exception as e:
                logger.error(f"Error during crawling: {e}")
                await asyncio.sleep(300)


# Run
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app
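# To run locally (illustrative; assumes this file is saved as app.py and that
# the environment variables read by Config are set):
#   HUGGINGFACE_API_KEY=... CIVITAI_API_TOKEN=... RCLONE_CONF_BASE64=... \
#   uvicorn app:app --host 0.0.0.0 --port 7860
# The FastAPI startup event then launches crawl() as a background task.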