# cv_test / main.py
import asyncio
import base64
import datetime
import json
import logging
import os
import re
import shutil
import subprocess
from typing import Optional

import requests
from fake_useragent import UserAgent
from fastapi import FastAPI
from huggingface_hub import HfApi, hf_hub_download, login
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Config:
HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
LOG_FILE = "civitai_backup.log"
LIST_FILE = "model_list.log"
REPO_IDS = {
"log": "ttttdiva/CivitAI_log_test",
"model_list": "ttttdiva/CivitAI_model_info_test",
"current": ""
}
URLS = {
"latest": "https://civitai.com/api/v1/models?sort=Newest",
"modelPage": "https://civitai.com/models/",
"modelId": "https://civitai.com/api/v1/models/",
"modelVersionId": "https://civitai.com/api/v1/model-versions/",
"hash": "https://civitai.com/api/v1/model-versions/by-hash/"
}
JST = datetime.timezone(datetime.timedelta(hours=9))
UA = UserAgent()
HEADERS = {
'Authorization': f'Bearer {CIVITAI_API_TOKEN}',
'User-Agent': UA.random,
"Content-Type": "application/json"
}
class CivitAICrawler:
def __init__(self, config: Config):
self.config = config
self.api = HfApi()
self.app = FastAPI()
self.repo_ids = self.config.REPO_IDS.copy()
self.jst = self.config.JST
self.setup_rclone_conf()
self.setup_routes()
def setup_rclone_conf(self):
        # Restore rclone.conf from the RCLONE_CONF_BASE64 environment variable.
rclone_b64 = os.environ.get("RCLONE_CONF_BASE64", "")
if not rclone_b64:
logger.warning("[WARN] RCLONE_CONF_BASE64 is empty, rclone may fail.")
return
conf_dir = ".rclone_config"
os.makedirs(conf_dir, exist_ok=True)
conf_path = os.path.join(conf_dir, "rclone.conf")
with open(conf_path, "wb") as f:
f.write(base64.b64decode(rclone_b64))
os.environ["RCLONE_CONFIG"] = conf_path
logger.info(f"[OK] rclone.conf => {conf_path}")
def setup_routes(self):
@self.app.get("/")
def read_root():
now = str(datetime.datetime.now(self.jst))
return {
"description": f"CivitAI crawler. Current time: {now}",
"repo_current": self.repo_ids["current"],
"note": "Startup event => self.crawl() loop"
}
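
        # NOTE: newer FastAPI versions deprecate on_event("startup") in favor
        # of lifespan handlers; it still works and is kept as-is here.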
@self.app.on_event("startup")
async def startup_event():
asyncio.create_task(self.crawl())
def download_file(self, url: str, destination_folder: str, default_name: str) -> Optional[str]:
os.makedirs(destination_folder, exist_ok=True)
try:
            resp = requests.get(url, headers=self.config.HEADERS, stream=True, timeout=60)
resp.raise_for_status()
except requests.RequestException as e:
logger.error(f"[ERR] download_file fail: {e}")
return None
        # Prefer the server-supplied filename (Content-Disposition) over the default.
        filename = self.get_filename_from_cd(resp.headers.get("Content-Disposition"), default_name)
        file_path = os.path.join(destination_folder, filename)
with open(file_path, 'wb') as f:
for chunk in resp.iter_content(chunk_size=8192):
f.write(chunk)
logger.info(f"[OK] Downloaded: {file_path}")
return file_path
def get_filename_from_cd(self, cd: Optional[str], default_name: str) -> str:
if cd:
parts = cd.split(';')
for p in parts:
if "filename=" in p:
                    return p.split("=", 1)[1].strip().strip('"')
return default_name
def get_model_info(self, model_id: str) -> dict:
try:
r = requests.get(f"{self.config.URLS['modelId']}{model_id}", headers=self.config.HEADERS)
r.raise_for_status()
return r.json()
except Exception as e:
logger.error(f"[ERR] get_model_info({model_id}): {e}")
return {}
def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
"""
rclone copy local_folder => cryptLocal:
=> diff => upload_folder => cleanup
"""
if not os.path.isdir(local_folder):
logger.error(f"[ERR] {local_folder} is not a directory.")
return None
encrypted_dir = os.path.join(os.getcwd(), "encrypted")
os.makedirs(encrypted_dir, exist_ok=True)
before = set(os.listdir(encrypted_dir))
try:
subprocess.run(["rclone", "copy", local_folder, "cryptLocal:", "--create-empty-src-dirs"], check=True)
logger.info(f"[OK] rclone copy {local_folder} => cryptLocal:")
except subprocess.CalledProcessError as e:
logger.error(f"[ERR] rclone copy failed: {e}")
return None
after = set(os.listdir(encrypted_dir))
diff = after - before
if not diff:
logger.error("[ERR] No new dir in ./encrypted after rclone copy.")
return None
if len(diff) > 1:
logger.warning(f"[WARN] multiple new dirs => {diff}")
enc_folder_name = diff.pop()
enc_folder_path = os.path.join(encrypted_dir, enc_folder_name)
if not os.path.isdir(enc_folder_path):
logger.error(f"[ERR] {enc_folder_path} is not a directory.")
return None
# upload_folder to HF
try:
self.api.upload_folder(
folder_path=enc_folder_path,
repo_id=self.repo_ids["current"],
path_in_repo=enc_folder_name
)
logger.info(f"[OK] uploaded {enc_folder_path} => {self.repo_ids['current']}:{enc_folder_name}")
        except Exception as e:
            logger.error(f"[ERR] HF upload_folder fail: {e}")
            return None
# cleanup local
shutil.rmtree(local_folder, ignore_errors=True)
shutil.rmtree(enc_folder_path, ignore_errors=True)
logger.info(f"[CLEANUP] removed {local_folder} & {enc_folder_path}")
return enc_folder_name
def process_model(self, model_url: str):
try:
model_id = model_url.rstrip("/").split("/")[-1]
model_info = self.get_model_info(model_id)
if not model_info or "modelVersions" not in model_info:
logger.error(f"No valid model info for ID {model_id}. Skipping.")
return
versions = model_info["modelVersions"]
if not versions:
logger.warning(f"No modelVersions found for ID {model_id}.")
return
            # 1) Put everything into one directory named after the sanitized model name.
folder_name = re.sub(r'[\\/*?:"<>|]', '_', model_info.get("name", "UnknownModel"))
            # If the folder already exists, delete it and recreate it (overwrite).
if os.path.exists(folder_name):
shutil.rmtree(folder_name)
os.makedirs(folder_name, exist_ok=True)
            # Download model files (latest + old versions), images, HTML, and model_info.json.
            self.download_versions(versions, folder_name)
self.download_images(versions, folder_name)
self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
self.save_model_info_json(model_info, folder_name)
# ζš—ε·εŒ–γ‚’γƒƒγƒ—γƒ­γƒΌγƒ‰
enc_subfolder = self.encrypt_and_upload_folder(folder_name)
if enc_subfolder is None:
enc_subfolder = "[ENCRYPT_FAILED]"
hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")
except Exception as e:
logger.error(f"Error in process_model({model_url}): {e}")
def download_versions(self, model_versions: list, folder: str):
        # Files of the latest version go directly into folder/;
        # older versions go into folder/old_versions/.
if not model_versions:
return
latest = model_versions[0]
for f_info in latest.get("files", []):
url = f_info["downloadUrl"]
fname = f_info["name"]
self.download_file(url, folder, fname)
if len(model_versions) > 1:
ov_folder = os.path.join(folder, "old_versions")
os.makedirs(ov_folder, exist_ok=True)
for v in model_versions[1:]:
for f_info in v.get("files", []):
url = f_info["downloadUrl"]
fname = f_info["name"]
self.download_file(url, ov_folder, fname)
def download_images(self, model_versions: list, folder: str):
images_folder = os.path.join(folder, "images")
os.makedirs(images_folder, exist_ok=True)
for v in model_versions:
for im in v.get("images", []):
iurl = im["url"]
iname = os.path.basename(iurl)
self.download_file(iurl, images_folder, iname)
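
    # --- The three helpers below are referenced by process_model() / crawl()
    # --- but were missing from this file. These are minimal sketches (assumed
    # --- implementations, including the "model_page.html" filename) so the
    # --- module runs end-to-end.
    def save_html_content(self, url: str, folder: str):
        # Fetch the model page and save the raw HTML alongside the files.
        try:
            resp = requests.get(url, headers=self.config.HEADERS, timeout=60)
            resp.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"[ERR] save_html_content({url}): {e}")
            return
        html_path = os.path.join(folder, "model_page.html")
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(resp.text)
        logger.info(f"[OK] Saved HTML: {html_path}")

    def save_model_info_json(self, model_info: dict, folder: str):
        # Persist the raw API response next to the downloaded files.
        info_path = os.path.join(folder, "model_info.json")
        with open(info_path, "w", encoding="utf-8") as f:
            json.dump(model_info, f, indent=2, ensure_ascii=False)
        logger.info(f"[OK] Saved model info: {info_path}")

    def upload_file(self, local_path: str, repo_id: str, path_in_repo: str):
        # Thin wrapper around HfApi.upload_file so crawl() can push logs back.
        try:
            self.api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
            )
            logger.info(f"[OK] uploaded {local_path} => {repo_id}:{path_in_repo}")
        except Exception as e:
            logger.error(f"[ERR] upload_file({local_path}): {e}")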
async def crawl(self):
while True:
try:
# HF Login
login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
# model_list.log
mlist_path = hf_hub_download(self.repo_ids["model_list"], self.config.LIST_FILE)
shutil.copyfile(mlist_path, f"./{self.config.LIST_FILE}")
# log_file
log_path = hf_hub_download(self.repo_ids["log"], self.config.LOG_FILE)
shutil.copyfile(log_path, f"./{self.config.LOG_FILE}")
                # Read state: line 1 = JSON list of processed model IDs,
                # line 2 = current backup repo ID.
                with open(self.config.LOG_FILE, "r", encoding="utf-8") as f:
                    lines = f.read().splitlines()
                old_models = json.loads(lines[0]) if len(lines) > 0 else []
                self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
# get newest
r = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
r.raise_for_status()
items = r.json().get("items", [])
ids = [it["id"] for it in items if "id" in it]
                new_ids = list(set(ids) - set(old_models))
if new_ids:
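                    # Process one new model per loop iteration; the rest are
                    # picked up on subsequent passes.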
mid = new_ids[0]
                    for attempt in range(1, 6):
                        try:
                            self.process_model(str(mid))
                            break
                        except Exception as e:
                            logger.error(f"[ERR] process_model {mid}, attempt {attempt}: {e}")
                            if attempt == 5:
                                logger.error("Skipping model after 5 fails")
                            else:
                                await asyncio.sleep(2)
# update logs
old_models.append(mid)
with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
f.write(json.dumps(old_models)+"\n")
f.write(self.repo_ids["current"]+"\n")
self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
self.upload_file(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
else:
                    # No new models: persist the latest ID list and current repo.
                    with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
f.write(json.dumps(ids)+"\n")
f.write(self.repo_ids["current"]+"\n")
self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
logger.info("No new models => wait 60s")
await asyncio.sleep(60)
continue
except Exception as e:
logger.error(f"[ERR] crawl loop => {e}")
await asyncio.sleep(300)
# === FastAPI app (module level) ===
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app
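
# Local run sketch (assumed entry point; on HF Spaces this module is usually
# served externally as `uvicorn main:app`, and 7860 is the conventional port):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)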