import asyncio
import base64
import datetime
import json
import logging
import os
import re
import shutil
import subprocess
import time
import uuid
from typing import Optional

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from fastapi import FastAPI
from huggingface_hub import HfApi, hf_hub_download, login

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class Config:
    """Static configuration. HUGGINGFACE_API_KEY and CIVITAI_API_TOKEN must be set
    in the environment; the "current" upload repo is read from the log file at runtime."""
    HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
    CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
    LOG_FILE = "civitai_backup.log"
    LIST_FILE = "model_list.log"
    REPO_IDS = {
        "log": "ttttdiva/CivitAI_log_test",
        "model_list": "ttttdiva/CivitAI_model_info_test",
        "current": ""
    }
    URLS = {
        "latest": "https://civitai.com/api/v1/models?sort=Newest",
        "modelPage": "https://civitai.com/models/",
        "modelId": "https://civitai.com/api/v1/models/",
        "modelVersionId": "https://civitai.com/api/v1/model-versions/",
        "hash": "https://civitai.com/api/v1/model-versions/by-hash/"
    }
    JST = datetime.timezone(datetime.timedelta(hours=9))
    UA = UserAgent()
    HEADERS = {
        "Authorization": f"Bearer {CIVITAI_API_TOKEN}",
        "User-Agent": UA.random,
        "Content-Type": "application/json"
    }


class CivitAICrawler:
    def __init__(self, config: Config):
        self.config = config
        self.api = HfApi()
        self.app = FastAPI()
        self.repo_ids = self.config.REPO_IDS.copy()
        self.jst = self.config.JST
        self.setup_rclone_conf()
        self.setup_routes()

    def setup_rclone_conf(self):
        """Decode RCLONE_CONF_BASE64 into a local rclone.conf and point rclone at it."""
        rclone_b64 = os.environ.get("RCLONE_CONF_BASE64", "")
        if rclone_b64:
            conf_dir = ".rclone_config"
            os.makedirs(conf_dir, exist_ok=True)
            conf_path = os.path.join(conf_dir, "rclone.conf")
            with open(conf_path, "wb") as f:
                f.write(base64.b64decode(rclone_b64))
            os.environ["RCLONE_CONFIG"] = conf_path
            logger.info(f"[OK] Created rclone.conf => {conf_path}")
        else:
            logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
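
    # Note: encrypt_and_upload_folder expects the decoded rclone.conf to define a
    # crypt remote named "cryptLocal" whose encrypted output lands in ./encrypted
    # (that is how the newly created directory is detected). A minimal example,
    # with illustrative values only:
    #
    #   [cryptLocal]
    #   type = crypt
    #   remote = ./encrypted
    #   filename_encryption = standard
    #   directory_name_encryption = true
    #   password = <rclone-obscured password>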

    def setup_routes(self):
        """Register the FastAPI routes and the startup task that launches the crawler."""
        @self.app.get("/")
        def read_root():
            now = str(datetime.datetime.now(self.jst))
            return {
                "description": f"CivitAI crawler. Time: {now}",
                "repo_current": self.repo_ids["current"]
            }

        @self.app.on_event("startup")
        async def startup_event():
            asyncio.create_task(self.crawl())

    def download_file(self, url: str, dest_folder: str, filename: str) -> Optional[str]:
        """Download a file and save it as dest_folder/filename."""
        os.makedirs(dest_folder, exist_ok=True)
        try:
            r = requests.get(url, headers=self.config.HEADERS, stream=True)
            r.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"[ERR] download_file: {e}")
            return None

        file_path = os.path.join(dest_folder, filename)
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        logger.info(f"[OK] Downloaded => {file_path}")
        return file_path

    def upload_file(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
        """Upload a single file to the Hugging Face Hub."""
        if repo_id is None:
            repo_id = self.repo_ids["current"]
        if path_in_repo is None:
            path_in_repo = os.path.basename(file_path)

        try:
            self.api.upload_file(
                path_or_fileobj=file_path,
                repo_id=repo_id,
                path_in_repo=path_in_repo
            )
            logger.info(f"[OK] Uploaded file => {repo_id}:{path_in_repo}")
        except Exception as e:
            logger.error(f"[ERR] upload_file: {e}")

    def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
        """Upload an entire folder to the current Hugging Face repo."""
        if path_in_repo is None:
            path_in_repo = os.path.basename(folder_path)

        try:
            self.api.upload_folder(
                folder_path=folder_path,
                repo_id=self.repo_ids["current"],
                path_in_repo=path_in_repo
            )
            logger.info(f"[OK] uploaded folder => {folder_path} => {self.repo_ids['current']}:{path_in_repo}")
        except Exception as e:
            logger.error(f"[ERR] upload_folder: {e}")

    def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
        """Encrypt local_folder with rclone into cryptLocal:, upload the resulting
        directory that appears under ./encrypted, then remove both local copies.
        Returns the encrypted directory name, or None on failure."""
        if not os.path.isdir(local_folder):
            logger.error(f"[ERR] {local_folder} is not a directory.")
            return None

        encrypted_dir = os.path.join(os.getcwd(), "encrypted")
        os.makedirs(encrypted_dir, exist_ok=True)

        # Snapshot ./encrypted so the directory created by rclone can be identified.
        before = set(os.listdir(encrypted_dir))

        cmd = ["rclone", "copy", local_folder, "cryptLocal:", "--create-empty-src-dirs"]
        logger.info(f"[CMD] {' '.join(cmd)}")
        try:
            subprocess.run(cmd, check=True)
            logger.info("[OK] rclone copy => cryptLocal:")
        except subprocess.CalledProcessError as e:
            logger.error(f"[ERR] rclone copy failed: {e}")
            return None

        after = set(os.listdir(encrypted_dir))
        diff = after - before
        if not diff:
            logger.error("[ERR] no new directory in ./encrypted after copy")
            return None
        if len(diff) > 1:
            logger.warning(f"[WARN] multiple new dirs => {diff}")
        enc_name = diff.pop()
        enc_path = os.path.join(encrypted_dir, enc_name)
        if not os.path.isdir(enc_path):
            logger.error(f"[ERR] {enc_path} is not a directory.")
            return None

        try:
            self.upload_folder(enc_path, path_in_repo=enc_name)
        except Exception as e:
            logger.error(f"[ERR] encrypt_and_upload_folder => upload_folder: {e}")

        shutil.rmtree(local_folder, ignore_errors=True)
        shutil.rmtree(enc_path, ignore_errors=True)
        logger.info(f"[CLEANUP] removed {local_folder} & {enc_path}")
        return enc_name

    def download_and_process_versions(self, model_versions: list, folder: str):
        """Download the latest version into folder and older versions into folder/old_versions."""
        if not model_versions:
            return

        # Latest version => folder
        latest = model_versions[0]
        for f_info in latest.get("files", []):
            url = f_info["downloadUrl"]
            fname = f_info["name"]
            self.download_file(url, folder, fname)

        # Older versions => folder/old_versions
        if len(model_versions) > 1:
            ov_folder = os.path.join(folder, "old_versions")
            os.makedirs(ov_folder, exist_ok=True)
            for v in model_versions[1:]:
                for f_info in v.get("files", []):
                    url = f_info["downloadUrl"]
                    fname = f_info["name"]
                    self.download_file(url, ov_folder, fname)

    def get_model_info(self, model_id: str) -> dict:
        """Fetch model metadata from the CivitAI API by model ID."""
        try:
            url = f"{self.config.URLS['modelId']}{model_id}"
            resp = requests.get(url, headers=self.config.HEADERS)
            resp.raise_for_status()
            return resp.json()
        except Exception as e:
            logger.error(f"[ERR] get_model_info({model_id}): {e}")
            return {}
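
    # process_model below calls self.download_images, which is not defined in the
    # original source. This is a minimal sketch, assuming each entry in
    # modelVersions carries an "images" list whose items expose a direct "url";
    # adjust the field names if the actual CivitAI response differs.
    def download_images(self, model_versions: list, folder: str):
        images_folder = os.path.join(folder, "images")
        os.makedirs(images_folder, exist_ok=True)
        for v in model_versions:
            for img in v.get("images", []):
                img_url = img.get("url")
                if not img_url:
                    continue
                # Derive a filename from the URL; fall back to a random name.
                img_name = os.path.basename(img_url.split("?")[0]) or f"{uuid.uuid4()}.png"
                self.download_file(img_url, images_folder, img_name)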

    def process_model(self, model_id: str):
        """Download all versions and images of a model, then encrypt and upload them."""
        info = self.get_model_info(model_id)
        if not info or "modelVersions" not in info:
            logger.error(f"[ERR] No modelVersions for {model_id}")
            return

        versions = info["modelVersions"]

        base_dir = "local_models"
        os.makedirs(base_dir, exist_ok=True)

        folder_name = str(model_id)
        folder_path = os.path.join(base_dir, folder_name)

        # Start from a clean folder for this model.
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
        os.makedirs(folder_path, exist_ok=True)
        logger.info(f"[OK] created local folder => {folder_path}")

        # Download every file of every version.
        for v in versions:
            for f_info in v.get("files", []):
                dl_url = f_info["downloadUrl"]
                fname = f_info["name"]
                self.download_file(dl_url, folder_path, fname)

        # Download preview images, then encrypt and upload the whole folder.
        self.download_images(versions, folder_path)
        self.encrypt_and_upload_folder(folder_path)

    async def crawl(self):
        """Periodically check CivitAI for new models and back them up."""
        while True:
            try:
                login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)

                # Pull the current model list and log files from the Hub.
                mlist_path = hf_hub_download(repo_id=self.repo_ids["model_list"], filename=self.config.LIST_FILE)
                shutil.copyfile(mlist_path, f"./{self.config.LIST_FILE}")

                lfile_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
                shutil.copyfile(lfile_path, f"./{self.config.LOG_FILE}")

                # Log file format: line 1 = JSON list of processed model IDs,
                # line 2 = current upload repo ID.
                with open(self.config.LOG_FILE, 'r', encoding='utf-8') as f:
                    lines = f.read().splitlines()
                old_models = json.loads(lines[0]) if len(lines) > 0 else []
                self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

                # Fetch the newest models from CivitAI.
                r = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                r.raise_for_status()
                items = r.json().get("items", [])
                new_ids = [it["id"] for it in items if "id" in it]

                diff_ids = list(set(new_ids) - set(old_models))
                if diff_ids:
                    # Process one new model per iteration, retrying up to 5 times.
                    mid = diff_ids[0]
                    for attempt in range(1, 6):
                        try:
                            self.process_model(str(mid))
                            break
                        except Exception as e:
                            logger.error(f"[ERR] process_model {mid} (attempt {attempt}): {e}")
                            if attempt == 5:
                                logger.error(f"[SKIP] model {mid} after 5 fails")
                            else:
                                await asyncio.sleep(2)

                    old_models.append(mid)
                    with open(self.config.LOG_FILE, 'w', encoding='utf-8') as f:
                        f.write(json.dumps(old_models) + "\n")
                        f.write(self.repo_ids["current"] + "\n")

                    self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                    self.upload_file(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
                else:
                    # Nothing new: refresh the log with the latest ID list and wait.
                    with open(self.config.LOG_FILE, 'w', encoding='utf-8') as f:
                        f.write(json.dumps(new_ids) + "\n")
                        f.write(self.repo_ids["current"] + "\n")
                    self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
                    logger.info("[INFO] no new models => sleep(60)")
                    await asyncio.sleep(60)
                    continue
            except Exception as e:
                logger.error(f"[ERR] crawl => {e}")
                await asyncio.sleep(300)


config = Config()
crawler = CivitAICrawler(config)
app = crawler.app
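

# Optional local entry point (a sketch: the original file only exposes "app",
# presumably for an external ASGI server such as uvicorn; host and port below
# are assumptions).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)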