import asyncio
import datetime
import json
import logging
import os
import re
import shutil
import subprocess
import time
import uuid
from typing import Optional

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from fastapi import FastAPI
from huggingface_hub import HfApi, hf_hub_download, login

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Config:
    HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
    CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
    LOG_FILE = "civitai_backup.log"
    LIST_FILE = "model_list.log"
    REPO_IDS = {
        "log": "ttttdiva/CivitAI_log_test",
        "model_list": "ttttdiva/CivitAI_model_info_test",
        "current": ""
    }
    URLS = {
        "latest": "https://civitai.com/api/v1/models?sort=Newest",
        "modelPage": "https://civitai.com/models/",
        "modelId": "https://civitai.com/api/v1/models/",
        "modelVersionId": "https://civitai.com/api/v1/model-versions/",
        "hash": "https://civitai.com/api/v1/model-versions/by-hash/"
    }
    JST = datetime.timezone(datetime.timedelta(hours=9))
    UA = UserAgent()
    HEADERS = {
        "Authorization": f"Bearer {CIVITAI_API_TOKEN}",
        "User-Agent": UA.random,
        "Content-Type": "application/json"
    }

class CivitAICrawler:
    """Crawls CivitAI for newly published models, encrypts each download with
    rclone, and mirrors the result to Hugging Face repositories."""

    def __init__(self, config: Config):
        self.config = config
        self.api = HfApi()
        self.app = FastAPI()
        self.repo_ids = self.config.REPO_IDS.copy()
        self.jst = self.config.JST
        self.setup_rclone_conf()
        self.setup_routes()

    def setup_rclone_conf(self):
        """Write rclone.conf from the RCLONE_CONF_BASE64 environment variable
        and point RCLONE_CONFIG at it."""
        import base64

        rclone_b64 = os.environ.get("RCLONE_CONF_BASE64", "")
        if not rclone_b64:
            logger.warning("[WARN] RCLONE_CONF_BASE64 is empty, rclone may fail.")
            return
        conf_dir = ".rclone_config"
        os.makedirs(conf_dir, exist_ok=True)
        conf_path = os.path.join(conf_dir, "rclone.conf")
        with open(conf_path, "wb") as f:
            f.write(base64.b64decode(rclone_b64))
        os.environ["RCLONE_CONFIG"] = conf_path
        logger.info(f"[OK] rclone.conf => {conf_path}")

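    # Assumption (not shown in this file): the rclone config decoded above is
    # expected to define a crypt remote named "cryptLocal:" whose target is the
    # local ./encrypted directory, along the lines of:
    #
    #   [cryptLocal]
    #   type = crypt
    #   remote = ./encrypted
    #   password = <rclone-obscured password>
    #
    # encrypt_and_upload_folder() depends on this: it diffs ./encrypted before
    # and after "rclone copy <folder> cryptLocal:" to locate the encrypted copy.
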
    def setup_routes(self):
        @self.app.get("/")
        def read_root():
            now = str(datetime.datetime.now(self.jst))
            return {
                "description": f"CivitAI crawler. Current time: {now}",
                "repo_current": self.repo_ids["current"],
                "note": "Startup event => self.crawl() loop"
            }

        @self.app.on_event("startup")
        async def startup_event():
            asyncio.create_task(self.crawl())

    def download_file(self, url: str, destination_folder: str, default_name: str) -> Optional[str]:
        os.makedirs(destination_folder, exist_ok=True)
        try:
            resp = requests.get(url, headers=self.config.HEADERS, stream=True)
            resp.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"[ERR] download_file fail: {e}")
            return None

        file_path = os.path.join(destination_folder, default_name)
        with open(file_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        logger.info(f"[OK] Downloaded: {file_path}")
        return file_path

    def get_filename_from_cd(self, cd: Optional[str], default_name: str) -> str:
        """Extract a filename from a Content-Disposition header, falling back to default_name."""
        if cd:
            parts = cd.split(";")
            for p in parts:
                if "filename=" in p:
                    # Split only on the first "=" so filenames containing "=" survive.
                    return p.split("=", 1)[1].strip().strip('"')
        return default_name

    def get_model_info(self, model_id: str) -> dict:
        try:
            r = requests.get(f"{self.config.URLS['modelId']}{model_id}", headers=self.config.HEADERS)
            r.raise_for_status()
            return r.json()
        except Exception as e:
            logger.error(f"[ERR] get_model_info({model_id}): {e}")
            return {}

    def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
        """Encrypt local_folder via the "cryptLocal:" rclone remote, upload the
        encrypted copy to the current Hugging Face repo, then clean up both copies.

        Returns the encrypted folder's name inside ./encrypted, or None on failure.
        """
        if not os.path.isdir(local_folder):
            logger.error(f"[ERR] {local_folder} is not a directory.")
            return None
        encrypted_dir = os.path.join(os.getcwd(), "encrypted")
        os.makedirs(encrypted_dir, exist_ok=True)

        # Snapshot ./encrypted so the folder created by rclone can be identified.
        before = set(os.listdir(encrypted_dir))

        try:
            subprocess.run(["rclone", "copy", local_folder, "cryptLocal:", "--create-empty-src-dirs"], check=True)
            logger.info(f"[OK] rclone copy {local_folder} => cryptLocal:")
        except subprocess.CalledProcessError as e:
            logger.error(f"[ERR] rclone copy failed: {e}")
            return None

        after = set(os.listdir(encrypted_dir))
        diff = after - before
        if not diff:
            logger.error("[ERR] No new dir in ./encrypted after rclone copy.")
            return None
        if len(diff) > 1:
            logger.warning(f"[WARN] multiple new dirs => {diff}")
        enc_folder_name = diff.pop()
        enc_folder_path = os.path.join(encrypted_dir, enc_folder_name)

        if not os.path.isdir(enc_folder_path):
            logger.error(f"[ERR] {enc_folder_path} is not a directory.")
            return None

        try:
            self.api.upload_folder(
                folder_path=enc_folder_path,
                repo_id=self.repo_ids["current"],
                path_in_repo=enc_folder_name
            )
            logger.info(f"[OK] uploaded {enc_folder_path} => {self.repo_ids['current']}:{enc_folder_name}")
        except Exception as e:
            logger.error(f"[ERR] HF upload_folder fail: {e}")

        shutil.rmtree(local_folder, ignore_errors=True)
        shutil.rmtree(enc_folder_path, ignore_errors=True)
        logger.info(f"[CLEANUP] removed {local_folder} & {enc_folder_path}")

        return enc_folder_name

    def process_model(self, model_url: str):
        try:
            model_id = model_url.rstrip("/").split("/")[-1]
            model_info = self.get_model_info(model_id)
            if not model_info or "modelVersions" not in model_info:
                logger.error(f"No valid model info for ID {model_id}. Skipping.")
                return

            versions = model_info["modelVersions"]
            if not versions:
                logger.warning(f"No modelVersions found for ID {model_id}.")
                return

            # Sanitize the model name for use as a local folder name.
            folder_name = re.sub(r'[\\/*?:"<>|]', '_', model_info.get("name", "UnknownModel"))

            if os.path.exists(folder_name):
                shutil.rmtree(folder_name)
            os.makedirs(folder_name, exist_ok=True)

            self.download_versions(versions, folder_name)
            self.download_images(versions, folder_name)
            self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
            self.save_model_info_json(model_info, folder_name)

            enc_subfolder = self.encrypt_and_upload_folder(folder_name)
            if enc_subfolder is None:
                enc_subfolder = "[ENCRYPT_FAILED]"

            hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
            with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
                f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")

        except Exception as e:
            logger.error(f"Error in process_model({model_url}): {e}")

    def download_versions(self, model_versions: list, folder: str):
        """Download the newest version's files into `folder` and every older
        version's files into `folder`/old_versions."""
        if not model_versions:
            return
        latest = model_versions[0]
        for f_info in latest.get("files", []):
            url = f_info["downloadUrl"]
            fname = f_info["name"]
            self.download_file(url, folder, fname)

        if len(model_versions) > 1:
            ov_folder = os.path.join(folder, "old_versions")
            os.makedirs(ov_folder, exist_ok=True)
            for v in model_versions[1:]:
                for f_info in v.get("files", []):
                    url = f_info["downloadUrl"]
                    fname = f_info["name"]
                    self.download_file(url, ov_folder, fname)

    def download_images(self, model_versions: list, folder: str):
        images_folder = os.path.join(folder, "images")
        os.makedirs(images_folder, exist_ok=True)
        for v in model_versions:
            for im in v.get("images", []):
                iurl = im["url"]
                iname = os.path.basename(iurl)
                self.download_file(iurl, images_folder, iname)

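    # process_model() also calls save_html_content() and save_model_info_json(),
    # which are not defined in this file. The two methods below are minimal,
    # hedged sketches of what they appear to do (store the model page's HTML and
    # the raw API metadata next to the downloaded files); the file names
    # "model_page.html" and "model_info.json" are assumptions.
    def save_html_content(self, page_url: str, folder: str):
        """Sketch: fetch the CivitAI model page and save its HTML into `folder`."""
        try:
            resp = requests.get(page_url, headers=self.config.HEADERS)
            resp.raise_for_status()
            html_path = os.path.join(folder, "model_page.html")
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(resp.text)
            logger.info(f"[OK] Saved HTML: {html_path}")
        except Exception as e:
            logger.error(f"[ERR] save_html_content({page_url}): {e}")

    def save_model_info_json(self, model_info: dict, folder: str):
        """Sketch: dump the model's API metadata as JSON into `folder`."""
        info_path = os.path.join(folder, "model_info.json")
        try:
            with open(info_path, "w", encoding="utf-8") as f:
                json.dump(model_info, f, indent=2, ensure_ascii=False)
            logger.info(f"[OK] Saved model info: {info_path}")
        except Exception as e:
            logger.error(f"[ERR] save_model_info_json: {e}")
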
    async def crawl(self):
        """Main loop: sync state files from Hugging Face, fetch the newest models
        from CivitAI, process at most one unseen model per iteration, and push the
        updated log and model list back."""
        while True:
            try:
                login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)

                # Pull the current model list and crawl log from Hugging Face.
                mlist_path = hf_hub_download(self.repo_ids["model_list"], self.config.LIST_FILE)
                shutil.copyfile(mlist_path, f"./{self.config.LIST_FILE}")

                log_path = hf_hub_download(self.repo_ids["log"], self.config.LOG_FILE)
                shutil.copyfile(log_path, f"./{self.config.LOG_FILE}")

                # Line 1 of the log holds the processed model IDs (JSON list),
                # line 2 the name of the current upload repo.
                with open(self.config.LOG_FILE, "r", encoding="utf-8") as f:
                    lines = f.read().splitlines()
                old_models = json.loads(lines[0]) if len(lines) > 0 else []
                self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

                # Ask CivitAI for the newest models.
                r = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                r.raise_for_status()
                items = r.json().get("items", [])
                ids = [it["id"] for it in items if "id" in it]

                new_ids = list(set(ids) - set(old_models))
                if new_ids:
                    mid = new_ids[0]
                    for attempt in range(1, 6):
                        try:
                            self.process_model(str(mid))
                            break
                        except Exception as e:
                            logger.error(f"[ERR] process_model {mid}, attempt {attempt}: {e}")
                            if attempt == 5:
                                logger.error("Skipping model after 5 fails")
                            else:
                                await asyncio.sleep(2)

                    old_models.append(mid)
                    with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                        f.write(json.dumps(old_models) + "\n")
                        f.write(self.repo_ids["current"] + "\n")

                    self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                    self.upload_file(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
                else:
                    # Nothing new: record the latest IDs as seen and wait.
                    with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                        f.write(json.dumps(ids) + "\n")
                        f.write(self.repo_ids["current"] + "\n")
                    self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                    logger.info("No new models => wait 60s")
                    await asyncio.sleep(60)
                    continue
            except Exception as e:
                logger.error(f"[ERR] crawl loop => {e}")
                await asyncio.sleep(300)

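    # crawl() calls upload_file(), which is not defined in this file. Below is a
    # minimal, hedged sketch assuming it simply pushes a single local file to the
    # given Hugging Face repo via HfApi.upload_file.
    def upload_file(self, local_path: str, repo_id: str, path_in_repo: str):
        """Sketch: upload one local file to a Hugging Face repo."""
        try:
            self.api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=repo_id
            )
            logger.info(f"[OK] uploaded {local_path} => {repo_id}:{path_in_repo}")
        except Exception as e:
            logger.error(f"[ERR] HF upload_file fail: {e}")
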
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app
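# Local run (assuming this module is saved as main.py and uvicorn is installed):
#   uvicorn main:app --host 0.0.0.0 --port 7860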