# cv_test / main.py
import asyncio
import base64
import datetime
import json
import logging
import os
import re
import shutil
import subprocess
import time
from typing import Optional
import requests
from fake_useragent import UserAgent
from fastapi import FastAPI
from huggingface_hub import HfApi, hf_hub_download, login
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class Config:
"""設定用のクラス"""
HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
LOG_FILE = "civitai_backup.log"
LIST_FILE = "model_list.log"
REPO_IDS = {
"log": "ttttdiva/CivitAI_log_test",
"model_list": "ttttdiva/CivitAI_model_info_test",
"current": ""
}
URLS = {
"latest": "https://civitai.com/api/v1/models?sort=Newest",
"modelPage": "https://civitai.com/models/",
"modelId": "https://civitai.com/api/v1/models/",
"modelVersionId": "https://civitai.com/api/v1/model-versions/",
"hash": "https://civitai.com/api/v1/model-versions/by-hash/"
}
JST = datetime.timezone(datetime.timedelta(hours=9))
UA = UserAgent()
HEADERS = {
'Authorization': f'Bearer {CIVITAI_API_TOKEN}',
'User-Agent': 'civitai-crawler/1.0',
"Content-Type": "application/json"
}
    # Additional settings for rclone
RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
ENCRYPTED_DIR = "/home/user/app/encrypted"
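    # A minimal sketch of the decoded rclone.conf this code expects: a crypt
    # remote named "cryptLocal" whose wrapped remote points at ENCRYPTED_DIR.
    # All values below are hypothetical; real passwords are rclone-obscured.
    #
    #   [cryptLocal]
    #   type = crypt
    #   remote = /home/user/app/encrypted
    #   password = <obscured password>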
class CivitAICrawler:
"""CivitAIからモデルをダウンロードし、Hugging Faceにアップロードするクラス"""
def __init__(self, config: Config):
self.config = config
self.api = HfApi()
self.app = FastAPI()
self.repo_ids = self.config.REPO_IDS.copy()
self.jst = self.config.JST
        # Load the rclone configuration
self.setup_rclone_conf()
self.setup_routes()
def setup_routes(self):
@self.app.get("/")
def read_root():
now = str(datetime.datetime.now(self.jst))
description = (
f"CivitAIを定期的に周回し新規モデルを {self.repo_ids['current']} にバックアップするSpaceです。\n"
f"model_list.log や civitai_backup.log は暗号化しないでアップロードします。\n"
f"モデルのフォルダやファイルは暗号化してアップロードします。\n"
f"Status: {now} + currently running :D\n"
)
return description
@self.app.on_event("startup")
async def startup_event():
asyncio.create_task(self.crawl())
# ============================================================================
    # rclone setup & encrypted upload handling
# ============================================================================
def setup_rclone_conf(self):
if not self.config.RCLONE_CONF_BASE64:
logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
return
os.makedirs(".rclone_config", exist_ok=True)
conf_path = os.path.join(".rclone_config", "rclone.conf")
with open(conf_path, "wb") as f:
f.write(base64.b64decode(self.config.RCLONE_CONF_BASE64))
os.environ["RCLONE_CONFIG"] = conf_path
logger.info(f"[INFO] rclone.conf created at: {conf_path}")
def encrypt_with_rclone(self, local_path: str):
"""フォルダ or ファイルを cryptLocal: にコピーし、フォルダ名・ファイル名を暗号化"""
if not os.path.exists(local_path):
raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")
        # Clean out the encryption target directory beforehand
if os.path.isdir(self.config.ENCRYPTED_DIR):
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
top_level_name = os.path.basename(local_path.rstrip("/")) or "unnamed"
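        # "cryptLocal" is assumed to be a crypt remote defined in rclone.conf;
        # the copy below writes name- and content-encrypted files under ENCRYPTED_DIR.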
cmd = ["rclone", "copy", local_path, f"cryptLocal:{top_level_name}", "-v"]
logger.info(f"[INFO] Running: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
logger.info(f"[OK] rclone copy => cryptLocal:{top_level_name}")
if not os.path.isdir(self.config.ENCRYPTED_DIR):
raise FileNotFoundError(
f"[ERROR] {self.config.ENCRYPTED_DIR} not found. Check your rclone config."
)
def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
"""self.config.ENCRYPTED_DIR 以下の暗号化済ファイルを再帰的にアップロード"""
max_retries = 5
for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
for fn in files:
encrypted_file_path = os.path.join(root, fn)
if not os.path.isfile(encrypted_file_path):
continue
relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
upload_path_in_repo = os.path.join(base_path_in_repo, relative_path)
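                # Mirror the encrypted directory layout inside the repo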
attempt = 0
while attempt < max_retries:
try:
self.api.upload_file(
path_or_fileobj=encrypted_file_path,
repo_id=repo_id,
path_in_repo=upload_path_in_repo
)
logger.info(f"[OK] Uploaded => {repo_id}/{upload_path_in_repo}")
break
except Exception as e:
attempt += 1
error_message = str(e)
# 429 Rate-limit with "in XX minutes"
if "rate-limited" in error_message and "minutes" in error_message:
                            match = re.search(r"in (\d+) minutes?", error_message)
if match:
minutes = int(match.group(1)) + 1
logger.warning(f"Rate-limited. Waiting {minutes} minutes...")
time.sleep(minutes * 60)
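                                # Don't let the rate-limit wait consume a retry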
attempt -= 1
continue
                        # Wait one hour
if "you can retry this action in about 1 hour" in error_message:
logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
time.sleep(3600)
attempt -= 1
continue
                        # Repository file limit reached
if "over the limit of 100000 files" in error_message:
logger.warning("Repository file limit exceeded. Creating a new repository...")
self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
attempt = 0
repo_id = self.repo_ids['current']
continue
if attempt < max_retries:
logger.warning(
f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
)
else:
logger.error(f"Failed to upload after {max_retries} attempts: {encrypted_file_path}")
raise
def upload_folder_encrypted(self, folder_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
"""フォルダを丸ごと暗号化してアップロード (=フォルダ名も暗号化)"""
if not repo_id:
repo_id = self.repo_ids['current']
self.encrypt_with_rclone(folder_path)
self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)
        # Remove the local encrypted folder
if os.path.isdir(self.config.ENCRYPTED_DIR):
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
# ============================================================================
    # Encrypt-upload a single file, then delete it locally (for old_versions)
# ============================================================================
def upload_file_encrypted_one_by_one(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
"""
        Encrypt and upload a single file, then delete the local file afterwards.
"""
if not repo_id:
repo_id = self.repo_ids['current']
self.encrypt_with_rclone(file_path)
self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)
        # Remove the encrypted directory
if os.path.isdir(self.config.ENCRYPTED_DIR):
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
        # Delete the actual local file
if os.path.exists(file_path):
os.remove(file_path)
@staticmethod
def increment_repo_name(repo_id: str) -> str:
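        """Increment a trailing number in repo_id (e.g. 'repo1' -> 'repo2'); append '1' if none."""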
match = re.search(r'(\d+)$', repo_id)
if match:
number = int(match.group(1)) + 1
return re.sub(r'\d+$', str(number), repo_id)
else:
return f"{repo_id}1"
# ============================================================================
    # Logs and model_list.log are uploaded unencrypted
# ============================================================================
def upload_file_raw(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
if repo_id is None:
repo_id = self.repo_ids['current']
if path_in_repo is None:
path_in_repo = os.path.basename(file_path)
max_retries = 5
attempt = 0
while attempt < max_retries:
try:
self.api.upload_file(
path_or_fileobj=file_path,
repo_id=repo_id,
path_in_repo=path_in_repo
)
logger.info(f"[OK] Uploaded {file_path} => {repo_id}/{path_in_repo}")
return
except Exception as e:
attempt += 1
error_message = str(e)
if "over the limit of 100000 files" in error_message:
logger.warning("Repository file limit exceeded, creating a new repository.")
self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
attempt = 0
repo_id = self.repo_ids['current']
continue
elif "you can retry this action in about 1 hour" in error_message:
logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
time.sleep(3600)
attempt -= 1
else:
if attempt < max_retries:
logger.warning(f"Failed to upload raw file {file_path}, retry {attempt}/{max_retries}...")
else:
logger.error(f"Failed to upload raw file after {max_retries} attempts: {file_path}")
raise
# ============================================================================
    # Download handling
# ============================================================================
@staticmethod
def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
if content_disposition:
parts = content_disposition.split(';')
for part in parts:
if "filename=" in part:
                    return part.split("=", 1)[1].strip().strip('"')
return default_name
def download_file(self, url: str, destination_folder: str, default_name: str):
try:
response = requests.get(url, headers=self.config.HEADERS, stream=True)
response.raise_for_status()
except requests.RequestException as e:
logger.error(f"Failed to download file from {url}: {e}")
return None
filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
file_path = os.path.join(destination_folder, filename)
with open(file_path, 'wb') as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
logger.info(f"Download completed: {file_path}")
return file_path
# ============================================================================
    # Upload only old versions, one file at a time
# ============================================================================
def download_old_versions_one_by_one(self, version_list: list, folder: str):
"""version_list[1:] を対象に、モデルファイルを 1ファイルDL→upload→削除 を繰り返す"""
if len(version_list) <= 1:
return
old_versions_folder = os.path.join(folder, "old_versions")
os.makedirs(old_versions_folder, exist_ok=True)
for version in version_list[1:]:
for file_info in version.get("files", []):
download_url = file_info["downloadUrl"]
file_name = file_info["name"]
local_path = self.download_file(download_url, old_versions_folder, file_name)
if not local_path or not os.path.exists(local_path):
logger.error(f"Failed to download or file not found: {file_name}")
continue
                # Upload one file, then delete it.
                # An empty path_in_repo means the folder name is encrypted too (top level);
                # pass e.g. "old_versions" instead to group them in a subfolder.
self.upload_file_encrypted_one_by_one(local_path, path_in_repo="")
        # The old_versions folder should be empty now, so remove it
if os.path.exists(old_versions_folder):
shutil.rmtree(old_versions_folder, ignore_errors=True)
# ============================================================================
    # As before: bulk-download the "latest version files + images" folder, then upload it
# ============================================================================
def download_model(self, model_versions: list, folder: str):
"""最新バージョンを一括ダウンロード (フォルダにまとめる)"""
latest_version = model_versions[0]
latest_files = latest_version["files"]
for file_info in latest_files:
download_url = file_info["downloadUrl"]
file_name = file_info["name"]
local_path = self.download_file(download_url, folder, file_name)
if local_path and os.path.exists(local_path):
logger.info(f"Downloaded {file_name}")
else:
logger.warning(f"Could not download {file_name}")
def download_images(self, model_versions: list, folder: str):
images_folder = os.path.join(folder, "images")
os.makedirs(images_folder, exist_ok=True)
images = []
for version in model_versions:
for img in version.get("images", []):
images.append(img["url"])
for image_url in images:
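            # The URL basename is assumed to carry no extension, so ".png" is appended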
image_name = os.path.basename(image_url) + ".png"
local_path = os.path.join(images_folder, image_name)
try:
resp = requests.get(image_url, stream=True)
resp.raise_for_status()
with open(local_path, "wb") as f:
for chunk in resp.iter_content(chunk_size=8192):
f.write(chunk)
logger.info(f"Downloaded image: {local_path}")
except Exception as e:
logger.error(f"Error downloading image {image_url}: {e}")
def save_html_content(self, url: str, folder: str):
try:
response = requests.get(url)
response.raise_for_status()
html_path = os.path.join(folder, os.path.basename(folder) + ".html")
with open(html_path, 'w', encoding='utf-8') as file:
file.write(response.text)
except Exception as e:
logger.error(f"Error saving HTML content for URL {url}: {e}")
@staticmethod
def save_model_info(model_info: dict, folder: str):
with open(os.path.join(folder, "model_info.json"), "w", encoding="utf-8") as file:
json.dump(model_info, file, indent=2)
# ============================================================================
# model_list.log
# ============================================================================
def read_model_list(self):
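        """Parse model_list.log lines of the form '<model name>: <HF URL>' into a dict."""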
model_list = {}
try:
with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
parts = line.split(": ", 1)
if len(parts) == 2:
stored_id, stored_url = parts
model_list[stored_id] = stored_url
except Exception as e:
logger.error(f"Failed to read model list: {e}")
return model_list
# ============================================================================
    # Fetch model info
# ============================================================================
def get_model_info(self, model_id: str) -> dict:
try:
url = self.config.URLS["modelId"] + str(model_id)
resp = requests.get(url, headers=self.config.HEADERS)
resp.raise_for_status()
return resp.json()
except requests.RequestException as e:
logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
return {}
# ============================================================================
    # Main flow: upload latest files + images as one folder; old versions one file at a time.
# ============================================================================
def process_model(self, model_url: str):
try:
model_id = model_url.rstrip("/").split("/")[-1]
model_info = self.get_model_info(model_id)
if not model_info:
logger.error(f"No model_info returned for {model_id}")
return
model_versions = model_info.get("modelVersions", [])
if not model_versions:
logger.error(f"No modelVersions in model info {model_id}")
return
latest_version = model_versions[0]
model_file = next((file for file in latest_version["files"] if file.get('type') == 'Model'), None)
if model_file:
latest_filename = model_file['name']
folder = os.path.splitext(latest_filename)[0]
else:
first_file = latest_version["files"][0]
latest_filename = first_file['name']
folder = os.path.splitext(latest_filename)[0]
logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")
os.makedirs(folder, exist_ok=True)
            # Check model_list.log (by model name) to see whether this model was already uploaded
model_list = self.read_model_list()
modelpage_name = model_info.get("name", f"Model_{model_id}")
if modelpage_name in model_list.values():
logger.info(f"Model '{modelpage_name}' already in model_list. Skipping.")
                # return here if the model should actually be skipped
            # Latest version (downloaded as a batch)
self.download_model(model_versions, folder)
            # Images (download the whole images folder)
self.download_images(model_versions, folder)
# HTML & model_info.json
self.save_html_content(self.config.URLS["modelPage"] + str(model_id), folder)
self.save_model_info(model_info, folder)
            # Old versions only: upload & delete them one at a time
self.download_old_versions_one_by_one(model_versions, folder)
            # old_versions is empty now; what remains in the folder is the
            # latest version's files, the images folder, model_info.json, the HTML, etc.
            # Encrypt-upload "folder" itself (the images folder goes with it);
            # an empty path_in_repo means the folder name is encrypted as well
self.upload_folder_encrypted(folder, path_in_repo="")
            # Delete the local folder
if os.path.exists(folder):
shutil.rmtree(folder)
            # Append to model_list.log.
            # On HF the folder name is encrypted, so record the original
            # "modelpage_name" together with a link to the repo's top-level tree
model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main"
with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
f.write(f"{modelpage_name}: {model_hf_url}\n")
except Exception as e:
logger.error(f"Unexpected error processing model ({model_url}): {e}")
# ============================================================================
# crawl
# ============================================================================
async def crawl(self):
while True:
try:
login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
                # Download the latest model_list.log & civitai_backup.log
model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
                # Read the log file
with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
lines = file.read().splitlines()
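                # Line 1: JSON list of processed model IDs; line 2: current backup repo ID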
old_models = json.loads(lines[0]) if len(lines) > 0 else []
self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
                # Check for newly posted models
response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
response.raise_for_status()
latest_models = response.json().get("items", [])
latest_model_ids = [m["id"] for m in latest_models if "id" in m]
                # Set difference: IDs we have not processed yet
new_models = list(set(latest_model_ids) - set(old_models))
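                # Note: set order is arbitrary, so new_models[0] is not necessarily the newest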
if new_models:
logger.info(f"New models found: {new_models}")
model_id = new_models[0]
for attempt in range(1, 6):
try:
self.process_model(f"{self.config.URLS['modelId']}{model_id}")
break
except Exception as e:
logger.error(f"Failed to process model ID {model_id} (Attempt {attempt}/5): {e}")
if attempt == 5:
logger.error(f"Skipping model ID {model_id} after 5 failed attempts.")
else:
await asyncio.sleep(2)
else:
                    # No new models
with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
f.write(json.dumps(latest_model_ids) + "\n")
f.write(f"{self.repo_ids['current']}\n")
logger.info(f"Updated log file: {self.config.LOG_FILE}")
self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
logger.info("Uploaded log file to repository (unencrypted).")
logger.info("No new models found.")
await asyncio.sleep(60)
continue
                # Record the processed model ID in old_models
old_models.append(model_id)
with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
f.write(json.dumps(old_models) + "\n")
f.write(f"{self.repo_ids['current']}\n")
logger.info(f"Updated log file with new model ID: {model_id}")
                # Upload the log and model_list.log
self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
self.upload_file_raw(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
except Exception as e:
logger.error(f"Error during crawling: {e}")
await asyncio.sleep(300)
# Entry point
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app
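
# To serve the app locally one would typically use uvicorn
# (a sketch; Hugging Face Spaces conventionally expose port 7860):
#
#   uvicorn main:app --host 0.0.0.0 --port 7860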