Upload main.py
Browse files
main.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import asyncio
|
|
|
2 |
import datetime
|
3 |
import json
|
4 |
import logging
|
@@ -13,20 +14,23 @@ import requests
|
|
13 |
from bs4 import BeautifulSoup
|
14 |
from fake_useragent import UserAgent
|
15 |
from fastapi import FastAPI
|
16 |
-
from huggingface_hub import HfApi, hf_hub_download, login
|
17 |
|
|
|
18 |
logging.basicConfig(level=logging.INFO)
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
|
|
21 |
class Config:
|
|
|
22 |
HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
|
23 |
CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
|
24 |
LOG_FILE = "civitai_backup.log"
|
25 |
LIST_FILE = "model_list.log"
|
26 |
REPO_IDS = {
|
27 |
-
"log": "ttttdiva/CivitAI_log_test",
|
28 |
"model_list": "ttttdiva/CivitAI_model_info_test",
|
29 |
-
"current": ""
|
30 |
}
|
31 |
URLS = {
|
32 |
"latest": "https://civitai.com/api/v1/models?sort=Newest",
|
@@ -43,281 +47,535 @@ class Config:
|
|
43 |
"Content-Type": "application/json"
|
44 |
}
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
class CivitAICrawler:
|
|
|
|
|
47 |
def __init__(self, config: Config):
|
48 |
self.config = config
|
49 |
self.api = HfApi()
|
50 |
self.app = FastAPI()
|
51 |
self.repo_ids = self.config.REPO_IDS.copy()
|
52 |
self.jst = self.config.JST
|
|
|
|
|
53 |
self.setup_rclone_conf()
|
54 |
-
self.setup_routes()
|
55 |
|
56 |
-
|
57 |
-
import base64
|
58 |
-
rclone_b64 = os.environ.get("RCLONE_CONF_BASE64", "")
|
59 |
-
if rclone_b64:
|
60 |
-
conf_dir = ".rclone_config"
|
61 |
-
os.makedirs(conf_dir, exist_ok=True)
|
62 |
-
conf_path = os.path.join(conf_dir, "rclone.conf")
|
63 |
-
with open(conf_path, "wb") as f:
|
64 |
-
f.write(base64.b64decode(rclone_b64))
|
65 |
-
os.environ["RCLONE_CONFIG"] = conf_path
|
66 |
-
logger.info(f"[OK] Created rclone.conf => {conf_path}")
|
67 |
-
else:
|
68 |
-
logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
|
69 |
|
70 |
def setup_routes(self):
|
|
|
|
|
71 |
@self.app.get("/")
|
72 |
def read_root():
|
73 |
now = str(datetime.datetime.now(self.jst))
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
|
|
78 |
|
79 |
@self.app.on_event("startup")
|
80 |
async def startup_event():
|
81 |
asyncio.create_task(self.crawl())
|
82 |
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
try:
|
86 |
-
|
87 |
-
|
88 |
except requests.RequestException as e:
|
89 |
-
logger.error(f"
|
90 |
-
return
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
for chunk in r.iter_content(chunk_size=8192):
|
95 |
-
f.write(chunk)
|
96 |
-
logger.info(f"[OK] Downloaded => {file_path}")
|
97 |
-
return file_path
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
path_in_repo = os.path.basename(file_path)
|
104 |
|
|
|
|
|
105 |
try:
|
106 |
-
self.
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
)
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
|
|
128 |
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
return None
|
134 |
|
135 |
-
|
136 |
-
|
|
|
|
|
137 |
|
138 |
-
|
139 |
-
|
140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
try:
|
142 |
-
|
143 |
-
|
|
|
|
|
144 |
except subprocess.CalledProcessError as e:
|
145 |
-
logger.error(f"
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
logger.warning(f"[WARN] multiple new dirs => {diff}")
|
155 |
-
enc_name = diff.pop()
|
156 |
-
enc_path = os.path.join(encrypted_dir, enc_name)
|
157 |
-
if not os.path.isdir(enc_path):
|
158 |
-
logger.error(f"[ERR] {enc_path} is not a directory.")
|
159 |
-
return None
|
160 |
-
|
161 |
-
# HF upload folder
|
162 |
try:
|
163 |
-
|
|
|
|
|
|
|
|
|
164 |
except Exception as e:
|
165 |
-
logger.error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
-
#
|
173 |
-
|
174 |
|
175 |
-
|
176 |
-
|
177 |
-
return
|
178 |
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
fname = f_info["name"]
|
183 |
-
self.download_file(url, folder, fname)
|
184 |
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
self.download_file(url, ov_folder, fname)
|
193 |
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
try:
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
|
|
|
|
|
|
|
|
|
|
200 |
except Exception as e:
|
201 |
-
logger.error(f"
|
202 |
return {}
|
203 |
|
204 |
-
def
|
205 |
-
"""
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
for img_info in version.get("images", []):
|
214 |
-
img_url = img_info["url"]
|
215 |
-
filename = os.path.basename(img_url)
|
216 |
-
self.download_file(img_url, images_folder, filename)
|
217 |
-
|
218 |
-
def process_model(self, model_id: str):
|
219 |
-
info = self.get_model_info(model_id)
|
220 |
-
if not info or "modelVersions" not in info:
|
221 |
-
logger.error(f"[ERR] No modelVersions for {model_id}")
|
222 |
-
return
|
223 |
-
|
224 |
-
versions = info["modelVersions"]
|
225 |
-
base_dir = "local_models"
|
226 |
-
os.makedirs(base_dir, exist_ok=True)
|
227 |
-
|
228 |
-
# モデル名
|
229 |
-
model_name = info.get("name", f"ID_{model_id}")
|
230 |
-
safe_name = re.sub(r'[\\/*?:"<>|]', '_', model_name) # OSで使えない文字を _
|
231 |
-
folder_path = os.path.join(base_dir, safe_name)
|
232 |
-
if os.path.exists(folder_path):
|
233 |
-
shutil.rmtree(folder_path)
|
234 |
-
os.makedirs(folder_path, exist_ok=True)
|
235 |
-
logger.info(f"[OK] Created local folder => {folder_path}")
|
236 |
-
|
237 |
-
# ダウンロード
|
238 |
-
self.download_and_process_versions(versions, folder_path)
|
239 |
-
self.download_images(versions, folder_path)
|
240 |
-
|
241 |
-
# === 暗号化&アップロード ===
|
242 |
-
logger.info(f"[DEBUG] encrypt_and_upload_folder => {folder_path}")
|
243 |
-
enc_subfolder = self.encrypt_and_upload_folder(folder_path)
|
244 |
-
if enc_subfolder is None:
|
245 |
-
# 失敗
|
246 |
-
enc_subfolder = "[ENCRYPT_FAILED]"
|
247 |
-
else:
|
248 |
-
logger.info(f"[OK] Encrypted & uploaded => {enc_subfolder}")
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
|
253 |
-
model_list_line = f"{model_name} (ID:{model_id}): {hf_url}\n"
|
254 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
|
256 |
-
f.write(
|
257 |
-
logger.info(f"[OK] Wrote to model_list.log => {model_list_line.strip()}")
|
258 |
-
except Exception as e:
|
259 |
-
logger.error(f"[ERR] writing model_list.log => {e}")
|
260 |
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
|
|
263 |
|
264 |
async def crawl(self):
|
|
|
265 |
while True:
|
266 |
try:
|
267 |
login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
|
268 |
|
269 |
-
|
|
|
270 |
shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
|
271 |
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
|
|
|
|
|
|
|
|
|
|
290 |
try:
|
291 |
-
self.process_model(
|
292 |
break
|
293 |
except Exception as e:
|
294 |
-
logger.error(f"
|
295 |
-
if attempt==5:
|
296 |
-
logger.error(f"
|
297 |
else:
|
298 |
await asyncio.sleep(2)
|
299 |
-
|
300 |
-
old_models.append(mid)
|
301 |
-
with open(self.config.LOG_FILE,'w',encoding='utf-8') as f:
|
302 |
-
f.write(json.dumps(old_models)+"\n")
|
303 |
-
f.write(self.repo_ids["current"]+"\n")
|
304 |
-
|
305 |
-
# アップロードログ
|
306 |
-
self.upload_file(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
|
307 |
else:
|
308 |
-
|
309 |
-
|
310 |
-
f.write(
|
311 |
-
|
312 |
-
logger.info("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
313 |
await asyncio.sleep(60)
|
314 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
except Exception as e:
|
316 |
-
logger.error(f"
|
317 |
await asyncio.sleep(300)
|
318 |
|
319 |
|
320 |
-
# FastAPI
|
321 |
config = Config()
|
322 |
crawler = CivitAICrawler(config)
|
323 |
app = crawler.app
|
|
|
1 |
import asyncio
|
2 |
+
import base64
|
3 |
import datetime
|
4 |
import json
|
5 |
import logging
|
|
|
14 |
from bs4 import BeautifulSoup
|
15 |
from fake_useragent import UserAgent
|
16 |
from fastapi import FastAPI
|
17 |
+
from huggingface_hub import HfApi, create_repo, hf_hub_download, login
|
18 |
|
19 |
+
# ロギングの設定
|
20 |
logging.basicConfig(level=logging.INFO)
|
21 |
logger = logging.getLogger(__name__)
|
22 |
|
23 |
+
|
24 |
class Config:
|
25 |
+
"""設定用のクラス"""
|
26 |
HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
|
27 |
CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]
|
28 |
LOG_FILE = "civitai_backup.log"
|
29 |
LIST_FILE = "model_list.log"
|
30 |
REPO_IDS = {
|
31 |
+
"log": "ttttdiva/CivitAI_log_test",
|
32 |
"model_list": "ttttdiva/CivitAI_model_info_test",
|
33 |
+
"current": ""
|
34 |
}
|
35 |
URLS = {
|
36 |
"latest": "https://civitai.com/api/v1/models?sort=Newest",
|
|
|
47 |
"Content-Type": "application/json"
|
48 |
}
|
49 |
|
50 |
+
# ===== rclone 用の追加設定 =====
|
51 |
+
# (環境変数 RCLONE_CONF_BASE64 に rclone.conf をbase64エンコードした文字列を設定しておく想定)
|
52 |
+
RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
|
53 |
+
# 暗号化されたファイルが出力されるローカルディレクトリ(cryptLocal: の実体)
|
54 |
+
ENCRYPTED_DIR = "/home/user/app/encrypted"
|
55 |
+
|
56 |
+
|
57 |
class CivitAICrawler:
|
58 |
+
"""CivitAIからモデルをダウンロードし、Hugging Faceにアップロードするクラス(rcloneで暗号化対応版)"""
|
59 |
+
|
60 |
    def __init__(self, config: Config):
        """Wire up the HF API client, the FastAPI app and rclone, then register routes."""
        self.config = config
        self.api = HfApi()
        self.app = FastAPI()
        # Work on a copy so Config.REPO_IDS itself is never mutated
        # (repo_ids['current'] is rewritten at runtime elsewhere).
        self.repo_ids = self.config.REPO_IDS.copy()
        self.jst = self.config.JST

        # Set up rclone (materializes rclone.conf from the base64 env blob).
        self.setup_rclone_conf()

        self.setup_routes()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
    def setup_routes(self):
        """Register the FastAPI routes (status page and startup hook)."""

        @self.app.get("/")
        def read_root():
            # Human-readable status page (text is user-facing, in Japanese).
            now = str(datetime.datetime.now(self.jst))
            description = f"""
CivitAIを定期的に周回し新規モデルを {self.repo_ids['current']} にバックアップするspaceです。
モデルページ名とバックアップURLの紐づきはhttps://huggingface.co/{self.repo_ids['model_list']}/blob/main/model_list.logからどうぞ
たまに覗いてもらえると動き続けると思います。
再起動が必要になっている場合はRestartボタンを押してもらえると助かります。
Status: {now} + currently running :D
"""
            return description

        @self.app.on_event("startup")
        async def startup_event():
            # Kick off the crawl loop in the background as soon as the app starts.
            asyncio.create_task(self.crawl())
|
90 |
|
91 |
+
# =============================
|
92 |
+
# rclone 周りのヘルパー関数
|
93 |
+
# =============================
|
94 |
+
def setup_rclone_conf(self):
|
95 |
+
"""環境変数 RCLONE_CONF_BASE64 から rclone.conf を生成し、RCLONE_CONFIG 環境変数を設定"""
|
96 |
+
if not self.config.RCLONE_CONF_BASE64:
|
97 |
+
logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
|
98 |
+
return
|
99 |
+
|
100 |
+
os.makedirs(".rclone_config", exist_ok=True)
|
101 |
+
conf_path = os.path.join(".rclone_config", "rclone.conf")
|
102 |
+
with open(conf_path, "wb") as f:
|
103 |
+
f.write(base64.b64decode(self.config.RCLONE_CONF_BASE64))
|
104 |
+
|
105 |
+
os.environ["RCLONE_CONFIG"] = conf_path
|
106 |
+
logger.info(f"[INFO] rclone.conf created at: {conf_path}")
|
107 |
+
|
108 |
+
def encrypt_with_rclone(self, local_path: str, is_file: bool = True):
|
109 |
+
"""
|
110 |
+
指定ファイル or フォルダを cryptLocal: へコピーし、暗号化ファイルを self.config.ENCRYPTED_DIR に生成する。
|
111 |
+
rclone copy の引数にフォルダパスやファイルパスを指定して利用可能。
|
112 |
+
"""
|
113 |
+
if not os.path.exists(local_path):
|
114 |
+
raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")
|
115 |
+
|
116 |
+
# 事前に暗号先ディレクトリをクリーンアップ(不要なら削除する)
|
117 |
+
if os.path.isdir(self.config.ENCRYPTED_DIR):
|
118 |
+
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
|
119 |
+
|
120 |
+
# rclone コマンドの実行
|
121 |
+
cmd = ["rclone", "copy", local_path, "cryptLocal:", "-v"]
|
122 |
+
logger.info(f"[INFO] Running: {' '.join(cmd)}")
|
123 |
+
subprocess.run(cmd, check=True)
|
124 |
+
logger.info(f"[OK] rclone copy {local_path} => cryptLocal:")
|
125 |
+
|
126 |
+
if not os.path.isdir(self.config.ENCRYPTED_DIR):
|
127 |
+
raise FileNotFoundError(
|
128 |
+
f"[ERROR] {self.config.ENCRYPTED_DIR} not found. Check your rclone config."
|
129 |
+
)
|
130 |
+
|
131 |
+
    def upload_encrypted_files(self, repo_id: str, path_in_repo: str = None):
        """
        Upload every encrypted file under self.config.ENCRYPTED_DIR to Hugging Face.

        Walks the directory recursively and mirrors the relative layout under
        *path_in_repo*. Each file is retried up to 5 times; two specific error
        messages get special handling: a full repository triggers a rollover to
        a freshly created, incremented repo, and the HF rate-limit message
        triggers a one-hour sleep that does not consume the retry budget.
        """
        if not path_in_repo:
            path_in_repo = ""

        # Per-file retry budget for generic upload failures.
        max_retries = 5

        # Recursive walk so sub-directories produced by rclone are preserved.
        for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
            for fn in files:
                encrypted_file_path = os.path.join(root, fn)
                if not os.path.isfile(encrypted_file_path):
                    continue

                # Path relative to the encrypted dir ...
                relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
                # ... becomes the destination path inside the repo.
                upload_path_in_repo = os.path.join(path_in_repo, relative_path)

                # Attempt the upload, retrying on exceptions ("over the limit" etc.).
                attempt = 0
                while attempt < max_retries:
                    try:
                        self.api.upload_file(
                            path_or_fileobj=encrypted_file_path,
                            repo_id=repo_id,
                            path_in_repo=upload_path_in_repo
                        )
                        logger.info(f"[OK] Uploaded {encrypted_file_path} to {repo_id}/{upload_path_in_repo}")
                        break  # success: stop retrying this file
                    except Exception as e:
                        attempt += 1
                        error_message = str(e)
                        if "over the limit of 100000 files" in error_message:
                            # Repo is full: roll over to a new numbered repo.
                            # NOTE(review): resetting attempt restarts the retry
                            # budget, so repeated rollovers could loop indefinitely.
                            logger.warning("Repository file limit exceeded, creating a new repository.")
                            self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                            self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                            # Repo changed, so reset the attempt counter.
                            attempt = 0
                            repo_id = self.repo_ids['current']
                            continue
                        elif "you can retry this action in about 1 hour" in error_message:
                            # Rate limited: wait an hour; net attempt change is 0.
                            logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
                            time.sleep(3600)
                            attempt -= 1  # do not count this round as a retry
                        else:
                            if attempt < max_retries:
                                logger.warning(f"Failed to upload file {encrypted_file_path}, retrying... {attempt}/{max_retries}")
                            else:
                                logger.error(f"Failed to upload file after {max_retries} attempts: {encrypted_file_path}")
                                raise
|
185 |
+
|
186 |
+
# =============================
|
187 |
+
# ここから既存処理
|
188 |
+
# =============================
|
189 |
+
|
190 |
+
@staticmethod
|
191 |
+
def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
|
192 |
+
"""Content-Dispositionヘッダーからファイル名を取得する。"""
|
193 |
+
if content_disposition:
|
194 |
+
parts = content_disposition.split(';')
|
195 |
+
for part in parts:
|
196 |
+
if "filename=" in part:
|
197 |
+
return part.split("=")[1].strip().strip('"')
|
198 |
+
return default_name
|
199 |
+
|
200 |
+
def download_file(self, url: str, destination_folder: str, default_name: str):
|
201 |
+
"""指定されたURLからファイルをダウンロードし、指定されたフォルダに保存する。"""
|
202 |
try:
|
203 |
+
response = requests.get(url, headers=self.config.HEADERS, stream=True)
|
204 |
+
response.raise_for_status()
|
205 |
except requests.RequestException as e:
|
206 |
+
logger.error(f"Failed to download file from {url}: {e}")
|
207 |
+
return
|
208 |
|
209 |
+
filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
|
210 |
+
file_path = os.path.join(destination_folder, filename)
|
|
|
|
|
|
|
|
|
211 |
|
212 |
+
with open(file_path, 'wb') as file:
|
213 |
+
for chunk in response.iter_content(chunk_size=8192):
|
214 |
+
file.write(chunk)
|
215 |
+
logger.info(f"Download completed: {file_path}")
|
|
|
216 |
|
217 |
+
def get_model_info(self, model_id: str) -> dict:
|
218 |
+
"""モデルの情報を取得する。"""
|
219 |
try:
|
220 |
+
response = requests.get(self.config.URLS["modelId"] + str(model_id), headers=self.config.HEADERS)
|
221 |
+
response.raise_for_status()
|
222 |
+
return response.json()
|
223 |
+
except requests.RequestException as e:
|
224 |
+
logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
|
225 |
+
|
226 |
+
def download_model(self, model_versions: list, folder: str, existing_old_version_files: list = []):
|
227 |
+
"""モデルのバージョンをダウンロードする。"""
|
228 |
+
latest_version = model_versions[0]
|
229 |
+
latest_files = latest_version["files"]
|
230 |
+
for file_info in latest_files:
|
231 |
+
download_url = file_info["downloadUrl"]
|
232 |
+
file_name = file_info["name"]
|
233 |
+
login_detected_count = 0
|
234 |
+
|
235 |
+
while login_detected_count < 5:
|
236 |
+
try:
|
237 |
+
self.download_file(download_url, folder, file_name)
|
238 |
+
except Exception as e:
|
239 |
+
logger.error(f"Exception occurred while downloading {file_name}: {e}")
|
240 |
+
login_detected_count += 1
|
241 |
+
continue
|
242 |
|
243 |
+
if "login" in os.listdir(folder):
|
244 |
+
login_detected_count += 1
|
245 |
+
logger.warning(f"'login' file found. Will try again. ({login_detected_count}/5)")
|
246 |
+
os.remove(os.path.join(folder, "login"))
|
247 |
+
else:
|
248 |
+
logger.info(f"Successfully downloaded {file_name}")
|
249 |
+
break
|
250 |
+
|
251 |
+
if login_detected_count >= 5:
|
252 |
+
dummy_file_name = f"{file_name}.download_failed"
|
253 |
+
dummy_file_path = os.path.join(folder, dummy_file_name)
|
254 |
+
try:
|
255 |
+
with open(dummy_file_path, "w") as f:
|
256 |
+
f.write("Download failed after 5 attempts.")
|
257 |
+
logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
|
258 |
+
except Exception as e:
|
259 |
+
logger.error(f"Failed to create dummy file for {file_name}: {e}")
|
260 |
+
|
261 |
+
# 古いバージョンのダウンロード
|
262 |
+
if len(model_versions) > 1:
|
263 |
+
old_versions_folder = os.path.join(folder, "old_versions")
|
264 |
+
os.makedirs(old_versions_folder, exist_ok=True)
|
265 |
+
for version in model_versions[1:]:
|
266 |
+
for file_info in version["files"]:
|
267 |
+
file_name = file_info["name"]
|
268 |
+
if file_name in existing_old_version_files:
|
269 |
+
logger.info(f"Skipping download of existing old version file: {file_name}")
|
270 |
+
continue
|
271 |
+
download_url = file_info["downloadUrl"]
|
272 |
+
local_file_path = os.path.join(old_versions_folder, file_name)
|
273 |
+
login_detected_count = 0
|
274 |
+
|
275 |
+
while login_detected_count < 5:
|
276 |
+
try:
|
277 |
+
self.download_file(download_url, old_versions_folder, file_name)
|
278 |
+
except Exception as e:
|
279 |
+
logger.error(f"Exception occurred while downloading {file_name}: {e}")
|
280 |
+
login_detected_count += 1
|
281 |
+
continue
|
282 |
+
|
283 |
+
if "login" in os.listdir(old_versions_folder):
|
284 |
+
login_detected_count += 1
|
285 |
+
logger.warning(f"'login' file found while downloading {file_name}. Will try again. ({login_detected_count}/5)")
|
286 |
+
os.remove(os.path.join(old_versions_folder, "login"))
|
287 |
+
else:
|
288 |
+
logger.info(f"Successfully downloaded {file_name}")
|
289 |
+
break
|
290 |
|
291 |
+
if login_detected_count >= 5:
|
292 |
+
dummy_file_name = f"{file_name}.download_failed"
|
293 |
+
dummy_file_path = os.path.join(old_versions_folder, dummy_file_name)
|
294 |
+
try:
|
295 |
+
with open(dummy_file_path, "w") as f:
|
296 |
+
f.write("Download failed after 5 attempts.")
|
297 |
+
logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
|
298 |
+
except Exception as e:
|
299 |
+
logger.error(f"Failed to create dummy file for {file_name}: {e}")
|
300 |
+
continue
|
301 |
|
302 |
+
# ===== 旧コード: 直接アップロードしていた箇所を削除して、rclone暗号化&アップロードに変更する場合は呼び出さない =====
|
303 |
+
# 旧来は self.upload_file(...) していたが、このタイミングでアップロードしたくない場合は消すかコメントアウト
|
304 |
+
# self.upload_file(local_file_path, path_in_repo=...)
|
305 |
+
# os.remove(local_file_path)
|
|
|
306 |
|
307 |
+
def download_images(self, model_versions: list, folder: str):
|
308 |
+
"""モデルの画像をダウンロードし、指定されたフォルダに保存する。"""
|
309 |
+
images_folder = os.path.join(folder, "images")
|
310 |
+
os.makedirs(images_folder, exist_ok=True)
|
311 |
|
312 |
+
images = []
|
313 |
+
for version in model_versions:
|
314 |
+
for img in version.get("images", []):
|
315 |
+
image_url = img["url"]
|
316 |
+
images.append(image_url)
|
317 |
+
|
318 |
+
for image_url in images:
|
319 |
+
image_name = image_url.split("/")[-1]
|
320 |
+
try:
|
321 |
+
response = requests.get(image_url)
|
322 |
+
response.raise_for_status()
|
323 |
+
with open(os.path.join(images_folder, f"{image_name}.png"), "wb") as file:
|
324 |
+
file.write(response.content)
|
325 |
+
except requests.RequestException as e:
|
326 |
+
logger.error(f"Error downloading image {image_url}: {e}")
|
327 |
+
|
328 |
+
# 画像フォルダをパスワード付きZIP
|
329 |
try:
|
330 |
+
original_cwd = os.getcwd()
|
331 |
+
os.chdir(folder)
|
332 |
+
subprocess.run(['zip', '-e', '--password=osanpo', 'images.zip', '-r', 'images'], check=True)
|
333 |
+
logger.info(f"Images compressed and saved to {os.path.join(folder, 'images.zip')}")
|
334 |
except subprocess.CalledProcessError as e:
|
335 |
+
logger.error(f"Error creating zip file: {e}")
|
336 |
+
finally:
|
337 |
+
os.chdir(original_cwd)
|
338 |
+
|
339 |
+
if os.path.exists(images_folder):
|
340 |
+
shutil.rmtree(images_folder)
|
341 |
+
|
342 |
+
def save_html_content(self, url: str, folder: str):
|
343 |
+
"""指定されたURLからHTMLコンテンツを取得し、保存する。"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
try:
|
345 |
+
response = requests.get(url)
|
346 |
+
response.raise_for_status()
|
347 |
+
html_path = os.path.join(folder, f"{folder}.html")
|
348 |
+
with open(html_path, 'w', encoding='utf-8') as file:
|
349 |
+
file.write(response.text)
|
350 |
except Exception as e:
|
351 |
+
logger.error(f"Error saving HTML content for URL {url}: {e}")
|
352 |
+
|
353 |
+
@staticmethod
|
354 |
+
def save_model_info(model_info: dict, folder: str):
|
355 |
+
"""モデル情報(json)の保存"""
|
356 |
+
with open(os.path.join(folder, "model_info.json"), "w") as file:
|
357 |
+
json.dump(model_info, file, indent=2)
|
358 |
+
|
359 |
+
@staticmethod
|
360 |
+
def increment_repo_name(repo_id: str) -> str:
|
361 |
+
"""リポジトリ名の末尾の数字をインクリメントする。"""
|
362 |
+
match = re.search(r'(\d+)$', repo_id)
|
363 |
+
if match:
|
364 |
+
number = int(match.group(1)) + 1
|
365 |
+
new_repo_id = re.sub(r'\d+$', str(number), repo_id)
|
366 |
+
else:
|
367 |
+
new_repo_id = f"{repo_id}1"
|
368 |
+
return new_repo_id
|
369 |
|
370 |
+
# =============================
|
371 |
+
# ここを rclone 暗号化&アップロードに書き換え
|
372 |
+
# =============================
|
373 |
+
def upload_file(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
|
374 |
+
"""
|
375 |
+
1) rcloneで file_path を暗号化
|
376 |
+
2) 暗号化されたファイル(群)を Hugging Face にアップロード
|
377 |
+
"""
|
378 |
+
if repo_id is None:
|
379 |
+
repo_id = self.repo_ids['current']
|
380 |
+
if path_in_repo is None:
|
381 |
+
path_in_repo = os.path.basename(file_path)
|
382 |
|
383 |
+
# 1) rclone copy (ファイル暗号化)
|
384 |
+
self.encrypt_with_rclone(file_path, is_file=True)
|
385 |
|
386 |
+
# 2) 暗号ファイルをアップロード
|
387 |
+
self.upload_encrypted_files(repo_id=repo_id, path_in_repo=path_in_repo)
|
|
|
388 |
|
389 |
+
# 3) 暗号ディレクトリの掃除
|
390 |
+
if os.path.isdir(self.config.ENCRYPTED_DIR):
|
391 |
+
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
|
|
|
|
|
392 |
|
393 |
+
def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
|
394 |
+
"""
|
395 |
+
1) rcloneで folder_path を暗号化
|
396 |
+
2) 暗号化されたフォルダを Hugging Face にアップロード
|
397 |
+
"""
|
398 |
+
if path_in_repo is None:
|
399 |
+
path_in_repo = os.path.basename(folder_path)
|
|
|
400 |
|
401 |
+
# 1) rclone copy (フォルダ暗号化)
|
402 |
+
self.encrypt_with_rclone(folder_path, is_file=False)
|
403 |
+
|
404 |
+
# 2) 暗号フォルダをアップロード
|
405 |
+
self.upload_encrypted_files(repo_id=self.repo_ids['current'], path_in_repo=path_in_repo)
|
406 |
+
|
407 |
+
# 3) 掃除
|
408 |
+
if os.path.isdir(self.config.ENCRYPTED_DIR):
|
409 |
+
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
|
410 |
+
|
411 |
+
def read_model_list(self):
|
412 |
+
"""モデルリストを読み込む。"""
|
413 |
+
model_list = {}
|
414 |
try:
|
415 |
+
with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
|
416 |
+
for line in f:
|
417 |
+
line = line.strip()
|
418 |
+
if line:
|
419 |
+
parts = line.split(": ", 1)
|
420 |
+
if len(parts) == 2:
|
421 |
+
modelpage_name, model_hf_url = parts
|
422 |
+
model_list[model_hf_url] = modelpage_name
|
423 |
+
return model_list
|
424 |
except Exception as e:
|
425 |
+
logger.error(f"Failed to read model list: {e}")
|
426 |
return {}
|
427 |
|
428 |
+
def get_repo_info(self, repo_id):
|
429 |
+
"""リポジトリの情報を取得する。"""
|
430 |
+
try:
|
431 |
+
repo_info = self.api.repo_info(repo_id=repo_id, files_metadata=True)
|
432 |
+
file_paths = [sibling.rfilename for sibling in repo_info.siblings]
|
433 |
+
return file_paths
|
434 |
+
except Exception as e:
|
435 |
+
logger.error(f"Failed to get repo info for {repo_id}: {e}")
|
436 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
|
438 |
+
    def process_model(self, model_url: str):
        """Back up one CivitAI model end to end.

        Pipeline: fetch metadata -> download files and images -> save HTML and
        JSON locally -> encrypt the folder with rclone and upload it -> append
        the mapping line to LIST_FILE -> delete the local folder. All errors
        are caught and logged; the method never raises.
        """
        try:
            # The trailing path segment of the URL is the numeric model ID.
            model_id = model_url.rstrip("/").split("/")[-1]
            model_info = self.get_model_info(model_id)

            # NOTE(review): raises IndexError when modelVersions is empty/missing;
            # that is swallowed by the outer except below.
            latest_version = model_info.get("modelVersions", [])[0]
            model_file = next(
                (file for file in latest_version["files"] if file.get('type') == 'Model'),
                None
            )
            if model_file:
                latest_filename = model_file['name']
                folder = os.path.splitext(latest_filename)[0]
            else:
                # Fall back to the first file's name when no 'Model' file exists.
                first_file = latest_version["files"][0]
                latest_filename = first_file['name']
                folder = os.path.splitext(latest_filename)[0]
                logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")

            os.makedirs(folder, exist_ok=True)
            model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{folder}"
            model_list = self.read_model_list()

            if model_hf_url in model_list:
                # Model was backed up before: skip old-version files already in the repo.
                repo_id = self.repo_ids['current']
                repo_files = self.get_repo_info(repo_id)
                old_versions_files = [f for f in repo_files if f.startswith(f"{folder}/old_versions/")]
                existing_old_version_files = [os.path.basename(f) for f in old_versions_files]
            else:
                existing_old_version_files = []

            self.download_model(model_info["modelVersions"], folder, existing_old_version_files)
            self.download_images(model_info["modelVersions"], folder)
            self.save_html_content(model_url, folder)
            self.save_model_info(model_info, folder)

            # ====== Encrypt the folder with rclone and upload it to HF ======
            self.upload_folder(folder)

            # Update the model list file (name -> backup URL).
            modelpage_name = model_info.get("name", "Unnamed Model")
            model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{folder}"
            with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
                f.write(f"{modelpage_name}: {model_hf_url}\n")

            # Remove the local working folder.
            if os.path.exists(folder):
                shutil.rmtree(folder)

        except Exception as e:
            logger.error(f"Unexpected error processing model ({model_url}): {e}")
|
490 |
|
491 |
    async def crawl(self):
        """Poll CivitAI forever and back up at most one new model per pass.

        Each iteration: log in to HF, pull down the model list and the crawl
        log, fetch the newest models, and process the first unseen one (with
        up to 5 attempts). When nothing is new, the log is refreshed and the
        loop sleeps 60s; any top-level failure sleeps 300s and retries.
        """
        while True:
            try:
                login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)

                # Download model_list.log from the model-list repo.
                model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
                shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")

                # Download the crawl log file.
                local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
                shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")

                # Read the log: line 1 = JSON list of seen IDs, line 2 = current repo.
                with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
                    lines = file.read().splitlines()
                    old_models = json.loads(lines[0]) if len(lines) > 0 else []
                    self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

                # Fetch the newest models from CivitAI.
                response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                response.raise_for_status()
                latest_models = response.json().get("items", [])
                latest_model_ids = [item.get("id") for item in latest_models if "id" in item]

                # Diff against what we have already processed.
                new_models = list(set(latest_model_ids) - set(old_models))

                if new_models:
                    # Only the first unseen model is handled per iteration.
                    logger.info(f"New models found: {new_models}")
                    model_id = new_models[0]
                    for attempt in range(1, 6):
                        try:
                            self.process_model(f"{self.config.URLS['modelId']}{model_id}")
                            break
                        except Exception as e:
                            logger.error(f"Failed to process model ID {model_id} (Attempt {attempt}/5): {e}")
                            if attempt == 5:
                                logger.error(f"Skipping model ID {model_id} after 5 failed attempts.")
                            else:
                                await asyncio.sleep(2)
                else:
                    # No new models: overwrite the log with the latest ID snapshot.
                    with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                        f.write(json.dumps(latest_model_ids) + "\n")
                        f.write(f"{self.repo_ids['current']}\n")
                    logger.info(f"Updated log file: {self.config.LOG_FILE}")

                    # Upload the refreshed log to its repository.
                    self.upload_file(
                        file_path=self.config.LOG_FILE,
                        repo_id=self.repo_ids["log"],
                        path_in_repo=self.config.LOG_FILE
                    )
                    logger.info("Uploaded log file to repository.")

                    logger.info("No new models found.")
                    await asyncio.sleep(60)
                    continue

                # Record the processed model as seen.
                old_models.append(model_id)

                # Rewrite the log file with the updated state.
                with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                    f.write(json.dumps(old_models) + "\n")
                    f.write(f"{self.repo_ids['current']}\n")
                logger.info(f"Updated log file with new model ID: {model_id}")

                # Upload the log and the model list (direct upload path).
                self.upload_file(
                    file_path=self.config.LOG_FILE,
                    repo_id=self.repo_ids["log"],
                    path_in_repo=self.config.LOG_FILE
                )
                self.upload_file(
                    file_path=self.config.LIST_FILE,
                    repo_id=self.repo_ids["model_list"],
                    path_in_repo=self.config.LIST_FILE
                )

            except Exception as e:
                logger.error(f"Error during crawling: {e}")
                await asyncio.sleep(300)
|
576 |
|
577 |
|
578 |
+
# Expose the FastAPI application at module level (e.g. for `uvicorn main:app`).
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app
|