import asyncio
import base64
import datetime
import json
import logging
import os
import re
import shutil
import subprocess
import time
import uuid
from typing import Optional

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from fastapi import FastAPI
from huggingface_hub import HfApi, hf_hub_download, login

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class Config:
    HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
    CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
    LOG_FILE = "civitai_backup.log"
    LIST_FILE = "model_list.log"
    REPO_IDS = {
        "log": "ttttdiva/CivitAI_log_test",
        "model_list": "ttttdiva/CivitAI_model_info_test",
        "current": ""
    }
    URLS = {
        "latest": "https://civitai.com/api/v1/models?sort=Newest",
        "modelPage": "https://civitai.com/models/",
        "modelId": "https://civitai.com/api/v1/models/",
        "modelVersionId": "https://civitai.com/api/v1/model-versions/",
        "hash": "https://civitai.com/api/v1/model-versions/by-hash/"
    }
    JST = datetime.timezone(datetime.timedelta(hours=9))
    UA = UserAgent()
    HEADERS = {
        'Authorization': f'Bearer {CIVITAI_API_TOKEN}',
        'User-Agent': UA.random,
        "Content-Type": "application/json"
    }


class CivitAICrawler:
    def __init__(self, config: Config):
        self.config = config
        self.api = HfApi()
        self.app = FastAPI()
        self.repo_ids = self.config.REPO_IDS.copy()
        self.jst = self.config.JST
        self.setup_rclone_conf()
        self.setup_routes()

    def setup_rclone_conf(self):
        # Restore rclone.conf from the RCLONE_CONF_BASE64 environment variable
        rclone_b64 = os.environ.get("RCLONE_CONF_BASE64", "")
        if not rclone_b64:
            logger.warning("[WARN] RCLONE_CONF_BASE64 is empty, rclone may fail.")
            return
        conf_dir = ".rclone_config"
        os.makedirs(conf_dir, exist_ok=True)
        conf_path = os.path.join(conf_dir, "rclone.conf")
        with open(conf_path, "wb") as f:
            f.write(base64.b64decode(rclone_b64))
        os.environ["RCLONE_CONFIG"] = conf_path
        logger.info(f"[OK] rclone.conf => {conf_path}")

    def setup_routes(self):
        @self.app.get("/")
        def read_root():
            now = str(datetime.datetime.now(self.jst))
            return {
                "description": f"CivitAI crawler. Current time: {now}",
                "repo_current": self.repo_ids["current"],
                "note": "Startup event => self.crawl() loop"
            }

        @self.app.on_event("startup")
        async def startup_event():
            asyncio.create_task(self.crawl())
Current time: {now}", "repo_current": self.repo_ids["current"], "note": "Startup event => self.crawl() loop" } @self.app.on_event("startup") async def startup_event(): asyncio.create_task(self.crawl()) def download_file(self, url: str, destination_folder: str, default_name: str) -> Optional[str]: os.makedirs(destination_folder, exist_ok=True) try: resp = requests.get(url, headers=self.config.HEADERS, stream=True) resp.raise_for_status() except requests.RequestException as e: logger.error(f"[ERR] download_file fail: {e}") return None file_path = os.path.join(destination_folder, default_name) with open(file_path, 'wb') as f: for chunk in resp.iter_content(chunk_size=8192): f.write(chunk) logger.info(f"[OK] Downloaded: {file_path}") return file_path def get_filename_from_cd(self, cd: Optional[str], default_name: str) -> str: if cd: parts = cd.split(';') for p in parts: if "filename=" in p: return p.split("=")[1].strip().strip('"') return default_name def get_model_info(self, model_id: str) -> dict: try: r = requests.get(f"{self.config.URLS['modelId']}{model_id}", headers=self.config.HEADERS) r.raise_for_status() return r.json() except Exception as e: logger.error(f"[ERR] get_model_info({model_id}): {e}") return {} def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]: """ rclone copy local_folder => cryptLocal: => diff => upload_folder => cleanup """ if not os.path.isdir(local_folder): logger.error(f"[ERR] {local_folder} is not a directory.") return None encrypted_dir = os.path.join(os.getcwd(), "encrypted") os.makedirs(encrypted_dir, exist_ok=True) before = set(os.listdir(encrypted_dir)) try: subprocess.run(["rclone", "copy", local_folder, "cryptLocal:", "--create-empty-src-dirs"], check=True) logger.info(f"[OK] rclone copy {local_folder} => cryptLocal:") except subprocess.CalledProcessError as e: logger.error(f"[ERR] rclone copy failed: {e}") return None after = set(os.listdir(encrypted_dir)) diff = after - before if not diff: logger.error("[ERR] No new dir in ./encrypted after rclone copy.") return None if len(diff) > 1: logger.warning(f"[WARN] multiple new dirs => {diff}") enc_folder_name = diff.pop() enc_folder_path = os.path.join(encrypted_dir, enc_folder_name) if not os.path.isdir(enc_folder_path): logger.error(f"[ERR] {enc_folder_path} is not a directory.") return None # upload_folder to HF try: self.api.upload_folder( folder_path=enc_folder_path, repo_id=self.repo_ids["current"], path_in_repo=enc_folder_name ) logger.info(f"[OK] uploaded {enc_folder_path} => {self.repo_ids['current']}:{enc_folder_name}") except Exception as e: logger.error(f"[ERR] HF upload_folder fail: {e}") # cleanup local shutil.rmtree(local_folder, ignore_errors=True) shutil.rmtree(enc_folder_path, ignore_errors=True) logger.info(f"[CLEANUP] removed {local_folder} & {enc_folder_path}") return enc_folder_name def process_model(self, model_url: str): try: model_id = model_url.rstrip("/").split("/")[-1] model_info = self.get_model_info(model_id) if not model_info or "modelVersions" not in model_info: logger.error(f"No valid model info for ID {model_id}. 
Skipping.") return versions = model_info["modelVersions"] if not versions: logger.warning(f"No modelVersions found for ID {model_id}.") return # 1) 単にモデル名をサニタイズしたディレクトリにまとめる folder_name = re.sub(r'[\\/*?:"<>|]', '_', model_info.get("name", "UnknownModel")) # フォルダが既にあれば削除し、新規に作り直す (上書き) if os.path.exists(folder_name): shutil.rmtree(folder_name) os.makedirs(folder_name, exist_ok=True) # ダウンロード (最新/old_versions), 画像, HTML, model_info.json など self.download_and_process_versions(versions, folder_name) self.download_images(versions, folder_name) self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name) self.save_model_info_json(model_info, folder_name) # 暗号化アップロード enc_subfolder = self.encrypt_and_upload_folder(folder_name) if enc_subfolder is None: enc_subfolder = "[ENCRYPT_FAILED]" hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}" with open(self.config.LIST_FILE, "a", encoding="utf-8") as f: f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n") except Exception as e: logger.error(f"Error in process_model({model_url}): {e}") def download_versions(self, model_versions: list, folder: str): # すべて folder/ にまとめる or old_versions subfolder # 例: 最新 => folder, old => folder/old_versions if not model_versions: return latest = model_versions[0] for f_info in latest.get("files", []): url = f_info["downloadUrl"] fname = f_info["name"] self.download_file(url, folder, fname) if len(model_versions) > 1: ov_folder = os.path.join(folder, "old_versions") os.makedirs(ov_folder, exist_ok=True) for v in model_versions[1:]: for f_info in v.get("files", []): url = f_info["downloadUrl"] fname = f_info["name"] self.download_file(url, ov_folder, fname) def download_images(self, model_versions: list, folder: str): images_folder = os.path.join(folder, "images") os.makedirs(images_folder, exist_ok=True) for v in model_versions: for im in v.get("images", []): iurl = im["url"] iname = os.path.basename(iurl) self.download_file(iurl, images_folder, iname) async def crawl(self): while True: try: # HF Login login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True) # model_list.log mlist_path = hf_hub_download(self.repo_ids["model_list"], self.config.LIST_FILE) shutil.copyfile(mlist_path, f"./{self.config.LIST_FILE}") # log_file log_path = hf_hub_download(self.repo_ids["log"], self.config.LOG_FILE) shutil.copyfile(log_path, f"./{self.config.LOG_FILE}") # read logs with open(self.config.LOG_FILE, "r", encoding="utf-8") as f: lines = f.read().splitlines() old_models = json.loads(lines[0]) if len(lines)>0 else [] self.repo_ids["current"] = lines[1] if len(lines)>1 else "" # get newest r = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS) r.raise_for_status() items = r.json().get("items", []) ids = [it["id"] for it in items if "id" in it] new_ids = list(set(ids)-set(old_models)) if new_ids: mid = new_ids[0] for attempt in range(1,6): try: self.process_model(str(mid)) break except Exception as e: logger.error(f"[ERR] process_model {mid}, attempt {attempt}: {e}") if attempt==5: logger.error("Skipping model after 5 fails") else: await asyncio.sleep(2) # update logs old_models.append(mid) with open(self.config.LOG_FILE, "w", encoding="utf-8") as f: f.write(json.dumps(old_models)+"\n") f.write(self.repo_ids["current"]+"\n") self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE) self.upload_file(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE) else: # no new 
with open(self.config.LOG_FILE,"w",encoding="utf-8") as f: f.write(json.dumps(ids)+"\n") f.write(self.repo_ids["current"]+"\n") self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE) logger.info("No new models => wait 60s") await asyncio.sleep(60) continue except Exception as e: logger.error(f"[ERR] crawl loop => {e}") await asyncio.sleep(300) # === FastAPI config = Config() crawler = CivitAICrawler(config) app = crawler.app