Update main.py
Browse files
main.py
CHANGED
@@ -46,7 +46,7 @@ class Config:
|
|
46 |
"Content-Type": "application/json"
|
47 |
}
|
48 |
|
49 |
-
#
|
50 |
RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
|
51 |
ENCRYPTED_DIR = "/home/user/app/encrypted"
|
52 |
|
@@ -61,9 +61,8 @@ class CivitAICrawler:
|
|
61 |
self.repo_ids = self.config.REPO_IDS.copy()
|
62 |
self.jst = self.config.JST
|
63 |
|
64 |
-
# rclone
|
65 |
self.setup_rclone_conf()
|
66 |
-
|
67 |
self.setup_routes()
|
68 |
|
69 |
def setup_routes(self):
|
@@ -83,7 +82,7 @@ class CivitAICrawler:
|
|
83 |
asyncio.create_task(self.crawl())
|
84 |
|
85 |
# ============================================================================
|
86 |
-
# rclone設定 &
|
87 |
# ============================================================================
|
88 |
def setup_rclone_conf(self):
|
89 |
if not self.config.RCLONE_CONF_BASE64:
|
@@ -97,10 +96,10 @@ class CivitAICrawler:
|
|
97 |
logger.info(f"[INFO] rclone.conf created at: {conf_path}")
|
98 |
|
99 |
def encrypt_with_rclone(self, local_path: str):
|
100 |
-
"""
|
101 |
if not os.path.exists(local_path):
|
102 |
raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")
|
103 |
-
#
|
104 |
if os.path.isdir(self.config.ENCRYPTED_DIR):
|
105 |
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
|
106 |
|
@@ -116,13 +115,14 @@ class CivitAICrawler:
|
|
116 |
)
|
117 |
|
118 |
def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
|
119 |
-
"""self.config.ENCRYPTED_DIR
|
120 |
max_retries = 5
|
121 |
for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
|
122 |
for fn in files:
|
123 |
encrypted_file_path = os.path.join(root, fn)
|
124 |
if not os.path.isfile(encrypted_file_path):
|
125 |
continue
|
|
|
126 |
relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
|
127 |
upload_path_in_repo = os.path.join(base_path_in_repo, relative_path)
|
128 |
|
@@ -139,7 +139,7 @@ class CivitAICrawler:
|
|
139 |
except Exception as e:
|
140 |
attempt += 1
|
141 |
error_message = str(e)
|
142 |
-
# 429 Rate-limit
|
143 |
if "rate-limited" in error_message and "minutes" in error_message:
|
144 |
import re
|
145 |
match = re.search(r"in (\d+) minutes?", error_message)
|
@@ -149,13 +149,13 @@ class CivitAICrawler:
|
|
149 |
time.sleep(minutes * 60)
|
150 |
attempt -= 1
|
151 |
continue
|
152 |
-
# 1
|
153 |
if "you can retry this action in about 1 hour" in error_message:
|
154 |
logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
|
155 |
time.sleep(3600)
|
156 |
attempt -= 1
|
157 |
continue
|
158 |
-
#
|
159 |
if "over the limit of 100000 files" in error_message:
|
160 |
logger.warning("Repository file limit exceeded. Creating a new repository...")
|
161 |
self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
|
@@ -169,50 +169,52 @@ class CivitAICrawler:
|
|
169 |
f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
|
170 |
)
|
171 |
else:
|
172 |
-
logger.error(
|
173 |
-
f"Failed to upload after {max_retries} attempts: {encrypted_file_path}"
|
174 |
-
)
|
175 |
raise
|
176 |
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
185 |
|
186 |
# ============================================================================
|
187 |
-
#
|
188 |
# ============================================================================
|
189 |
-
def upload_file_encrypted_one_by_one(
|
190 |
-
self,
|
191 |
-
local_path: str,
|
192 |
-
repo_id: Optional[str] = None,
|
193 |
-
path_in_repo: str = ""
|
194 |
-
):
|
195 |
"""
|
196 |
-
|
197 |
"""
|
198 |
if not repo_id:
|
199 |
repo_id = self.repo_ids['current']
|
200 |
|
201 |
-
self.encrypt_with_rclone(
|
202 |
-
self.upload_encrypted_files(repo_id
|
203 |
|
204 |
-
#
|
205 |
if os.path.isdir(self.config.ENCRYPTED_DIR):
|
206 |
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
|
|
|
|
|
|
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
|
|
|
|
|
|
213 |
|
214 |
# ============================================================================
|
215 |
-
#
|
216 |
# ============================================================================
|
217 |
def upload_file_raw(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
|
218 |
if repo_id is None:
|
@@ -242,7 +244,7 @@ class CivitAICrawler:
|
|
242 |
repo_id = self.repo_ids['current']
|
243 |
continue
|
244 |
elif "you can retry this action in about 1 hour" in error_message:
|
245 |
-
logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
|
246 |
time.sleep(3600)
|
247 |
attempt -= 1
|
248 |
else:
|
@@ -253,7 +255,7 @@ class CivitAICrawler:
|
|
253 |
raise
|
254 |
|
255 |
# ============================================================================
|
256 |
-
#
|
257 |
# ============================================================================
|
258 |
@staticmethod
|
259 |
def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
|
@@ -280,47 +282,67 @@ class CivitAICrawler:
|
|
280 |
file.write(chunk)
|
281 |
|
282 |
logger.info(f"Download completed: {file_path}")
|
283 |
-
return file_path
|
284 |
|
285 |
# ============================================================================
|
286 |
-
#
|
287 |
# ============================================================================
|
288 |
-
def
|
289 |
-
"""
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
download_url = file_info["downloadUrl"]
|
296 |
file_name = file_info["name"]
|
|
|
|
|
|
|
|
|
|
|
297 |
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
logger.warning(f"Skip because file not found locally: {local_path}")
|
302 |
-
continue
|
303 |
-
|
304 |
-
# 暗号化アップロード
|
305 |
-
# 例: "myModelName/filename"
|
306 |
-
in_repo_path = os.path.join(encrypted_folder_name, file_name)
|
307 |
-
self.upload_file_encrypted_one_by_one(local_path, repo_id=self.repo_ids['current'], path_in_repo=in_repo_path)
|
308 |
|
309 |
-
def process_images_one_by_one(self, version_list: list, model_folder: str, encrypted_folder_name: str):
|
310 |
-
"""
|
311 |
-
画像をすべて1つずつDL→暗号化アップロード→削除
|
312 |
-
path_in_repo は "{encrypted_folder_name}/images/"
|
313 |
-
"""
|
314 |
images = []
|
315 |
-
for version in
|
316 |
-
for
|
317 |
-
images.append(
|
318 |
|
319 |
for image_url in images:
|
320 |
-
image_name =
|
321 |
-
local_path = os.path.join(
|
322 |
-
|
323 |
-
# ダウンロード
|
324 |
try:
|
325 |
resp = requests.get(image_url, stream=True)
|
326 |
resp.raise_for_status()
|
@@ -330,46 +352,7 @@ class CivitAICrawler:
|
|
330 |
logger.info(f"Downloaded image: {local_path}")
|
331 |
except Exception as e:
|
332 |
logger.error(f"Error downloading image {image_url}: {e}")
|
333 |
-
continue
|
334 |
|
335 |
-
# アップロード
|
336 |
-
in_repo_path = os.path.join(encrypted_folder_name, "images", image_name)
|
337 |
-
self.upload_file_encrypted_one_by_one(local_path, self.repo_ids['current'], in_repo_path)
|
338 |
-
|
339 |
-
def process_old_versions_one_by_one(self, version_list: list, model_folder: str, encrypted_folder_name: str):
|
340 |
-
"""
|
341 |
-
古いバージョン (index=1以降) のファイルを 1つずつダウンロード→暗号化アップロード→削除
|
342 |
-
path_in_repo は "{encrypted_folder_name}/old_versions/{versionID_orName}/filename"
|
343 |
-
"""
|
344 |
-
if len(version_list) <= 1:
|
345 |
-
return
|
346 |
-
|
347 |
-
for old_version in version_list[1:]:
|
348 |
-
# どんな名前でフォルダを区別するか(バージョンIDやバージョン名など)
|
349 |
-
version_id_or_name = str(old_version.get("id", "old_ver"))
|
350 |
-
files = old_version.get("files", [])
|
351 |
-
for file_info in files:
|
352 |
-
download_url = file_info["downloadUrl"]
|
353 |
-
file_name = file_info["name"]
|
354 |
-
|
355 |
-
# ダウンロード
|
356 |
-
local_path = self.download_file(download_url, model_folder, file_name)
|
357 |
-
if not local_path or not os.path.exists(local_path):
|
358 |
-
logger.warning(f"Skip because file not found locally: {local_path}")
|
359 |
-
continue
|
360 |
-
|
361 |
-
# 暗号化アップロード
|
362 |
-
in_repo_path = os.path.join(
|
363 |
-
encrypted_folder_name,
|
364 |
-
"old_versions",
|
365 |
-
version_id_or_name,
|
366 |
-
file_name
|
367 |
-
)
|
368 |
-
self.upload_file_encrypted_one_by_one(local_path, self.repo_ids['current'], in_repo_path)
|
369 |
-
|
370 |
-
# ============================================================================
|
371 |
-
# HTML & model_info.json は軽量なので一括DL→アップロードでもOK
|
372 |
-
# ============================================================================
|
373 |
def save_html_content(self, url: str, folder: str):
|
374 |
try:
|
375 |
response = requests.get(url)
|
@@ -377,20 +360,13 @@ class CivitAICrawler:
|
|
377 |
html_path = os.path.join(folder, os.path.basename(folder) + ".html")
|
378 |
with open(html_path, 'w', encoding='utf-8') as file:
|
379 |
file.write(response.text)
|
380 |
-
return html_path
|
381 |
except Exception as e:
|
382 |
logger.error(f"Error saving HTML content for URL {url}: {e}")
|
383 |
-
return None
|
384 |
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
json.dump(model_info, file, indent=2)
|
390 |
-
return json_path
|
391 |
-
except Exception as e:
|
392 |
-
logger.error(f"Error saving model_info.json: {e}")
|
393 |
-
return None
|
394 |
|
395 |
# ============================================================================
|
396 |
# model_list.log
|
@@ -411,22 +387,23 @@ class CivitAICrawler:
|
|
411 |
logger.error(f"Failed to read model list: {e}")
|
412 |
return model_list
|
413 |
|
|
|
|
|
|
|
414 |
def get_model_info(self, model_id: str) -> dict:
|
415 |
-
"""
|
416 |
-
model_id(例: '1110807')に対応するモデル情報を
|
417 |
-
CivitAIのAPIから取得し、jsonを返す
|
418 |
-
"""
|
419 |
try:
|
420 |
url = self.config.URLS["modelId"] + str(model_id)
|
421 |
-
|
422 |
-
|
423 |
-
return
|
424 |
except requests.RequestException as e:
|
425 |
logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
|
426 |
-
return {}
|
427 |
-
|
|
|
|
|
|
|
428 |
def process_model(self, model_url: str):
|
429 |
-
""" 指定されたモデルURLを処理 (1つずつファイルをDL→アップロード→削除) """
|
430 |
try:
|
431 |
model_id = model_url.rstrip("/").split("/")[-1]
|
432 |
model_info = self.get_model_info(model_id)
|
@@ -434,103 +411,99 @@ class CivitAICrawler:
|
|
434 |
logger.error(f"No model_info returned for {model_id}")
|
435 |
return
|
436 |
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
|
|
|
|
442 |
if model_file:
|
443 |
-
latest_filename = model_file[
|
444 |
folder = os.path.splitext(latest_filename)[0]
|
445 |
else:
|
446 |
-
# ファイルtype=Modelが無い場合、とりあえず最初のファイル名でフォルダ名を作る
|
447 |
first_file = latest_version["files"][0]
|
448 |
-
latest_filename = first_file[
|
449 |
folder = os.path.splitext(latest_filename)[0]
|
450 |
-
logger.warning(f"No 'Model' type file found for {model_id}. Using first file's name.")
|
451 |
|
452 |
-
# ローカルに一時フォルダを作成
|
453 |
os.makedirs(folder, exist_ok=True)
|
454 |
|
455 |
-
#
|
456 |
model_list = self.read_model_list()
|
457 |
modelpage_name = model_info.get("name", f"Model_{model_id}")
|
458 |
if modelpage_name in model_list.values():
|
459 |
logger.info(f"Model '{modelpage_name}' already in model_list. Skipping.")
|
460 |
-
# return
|
461 |
|
462 |
-
#
|
463 |
-
|
464 |
-
json_path = self.save_model_info(model_info, folder)
|
465 |
|
466 |
-
#
|
467 |
-
|
468 |
-
if html_path and os.path.exists(html_path):
|
469 |
-
in_repo_path = os.path.join(folder, os.path.basename(html_path))
|
470 |
-
self.upload_file_encrypted_one_by_one(html_path, self.repo_ids['current'], in_repo_path)
|
471 |
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
|
476 |
-
#
|
477 |
-
self.
|
478 |
|
479 |
-
#
|
480 |
-
|
481 |
|
482 |
-
#
|
483 |
-
|
|
|
484 |
|
485 |
-
#
|
486 |
if os.path.exists(folder):
|
487 |
shutil.rmtree(folder)
|
488 |
|
489 |
-
#
|
490 |
-
#
|
491 |
-
#
|
492 |
-
|
493 |
-
model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{folder}"
|
494 |
with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
|
495 |
f.write(f"{modelpage_name}: {model_hf_url}\n")
|
496 |
|
497 |
except Exception as e:
|
498 |
logger.error(f"Unexpected error processing model ({model_url}): {e}")
|
499 |
|
500 |
-
|
|
|
|
|
501 |
async def crawl(self):
|
502 |
-
"""新着モデルをチェックし、1つずつ処理するループ"""
|
503 |
while True:
|
504 |
try:
|
505 |
login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
|
506 |
|
507 |
-
# 最新のmodel_list.log & civitai_backup.log をダウンロード
|
508 |
model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
|
509 |
shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
|
510 |
|
511 |
local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
|
512 |
shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
|
513 |
|
514 |
-
#
|
515 |
with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
|
516 |
lines = file.read().splitlines()
|
517 |
old_models = json.loads(lines[0]) if len(lines) > 0 else []
|
518 |
self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
|
519 |
|
520 |
-
#
|
521 |
response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
|
522 |
response.raise_for_status()
|
523 |
latest_models = response.json().get("items", [])
|
524 |
-
latest_model_ids = [
|
525 |
|
526 |
-
#
|
527 |
new_models = list(set(latest_model_ids) - set(old_models))
|
528 |
|
529 |
if new_models:
|
530 |
logger.info(f"New models found: {new_models}")
|
531 |
model_id = new_models[0]
|
532 |
|
533 |
-
# 試行5回
|
534 |
for attempt in range(1, 6):
|
535 |
try:
|
536 |
self.process_model(f"{self.config.URLS['modelId']}{model_id}")
|
@@ -542,7 +515,7 @@ class CivitAICrawler:
|
|
542 |
else:
|
543 |
await asyncio.sleep(2)
|
544 |
else:
|
545 |
-
# 新モデルなし
|
546 |
with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
|
547 |
f.write(json.dumps(latest_model_ids) + "\n")
|
548 |
f.write(f"{self.repo_ids['current']}\n")
|
@@ -555,14 +528,14 @@ class CivitAICrawler:
|
|
555 |
await asyncio.sleep(60)
|
556 |
continue
|
557 |
|
558 |
-
#
|
559 |
old_models.append(model_id)
|
560 |
with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
|
561 |
f.write(json.dumps(old_models) + "\n")
|
562 |
f.write(f"{self.repo_ids['current']}\n")
|
563 |
logger.info(f"Updated log file with new model ID: {model_id}")
|
564 |
|
565 |
-
#
|
566 |
self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
|
567 |
self.upload_file_raw(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
|
568 |
|
|
|
46 |
"Content-Type": "application/json"
|
47 |
}
|
48 |
|
49 |
+
# rclone 用の追加設定
|
50 |
RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
|
51 |
ENCRYPTED_DIR = "/home/user/app/encrypted"
|
52 |
|
|
|
61 |
self.repo_ids = self.config.REPO_IDS.copy()
|
62 |
self.jst = self.config.JST
|
63 |
|
64 |
+
# rclone 設定の読み込み
|
65 |
self.setup_rclone_conf()
|
|
|
66 |
self.setup_routes()
|
67 |
|
68 |
def setup_routes(self):
|
|
|
82 |
asyncio.create_task(self.crawl())
|
83 |
|
84 |
# ============================================================================
|
85 |
+
# rclone 設定 & 暗号化アップロード処理
|
86 |
# ============================================================================
|
87 |
def setup_rclone_conf(self):
|
88 |
if not self.config.RCLONE_CONF_BASE64:
|
|
|
96 |
logger.info(f"[INFO] rclone.conf created at: {conf_path}")
|
97 |
|
98 |
def encrypt_with_rclone(self, local_path: str):
|
99 |
+
"""フォルダ or ファイルを cryptLocal: にコピーし、フォルダ名・ファイル名を暗号化"""
|
100 |
if not os.path.exists(local_path):
|
101 |
raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")
|
102 |
+
# 事前に暗号先ディレクトリを掃除
|
103 |
if os.path.isdir(self.config.ENCRYPTED_DIR):
|
104 |
shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
|
105 |
|
|
|
115 |
)
|
116 |
|
117 |
def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
|
118 |
+
"""self.config.ENCRYPTED_DIR 以下の暗号化済ファイルを再帰的にアップロード"""
|
119 |
max_retries = 5
|
120 |
for root, dirs, files in os.walk(self.config.ENCRYPTED_DIR):
|
121 |
for fn in files:
|
122 |
encrypted_file_path = os.path.join(root, fn)
|
123 |
if not os.path.isfile(encrypted_file_path):
|
124 |
continue
|
125 |
+
|
126 |
relative_path = os.path.relpath(encrypted_file_path, self.config.ENCRYPTED_DIR)
|
127 |
upload_path_in_repo = os.path.join(base_path_in_repo, relative_path)
|
128 |
|
|
|
139 |
except Exception as e:
|
140 |
attempt += 1
|
141 |
error_message = str(e)
|
142 |
+
# 429 Rate-limit with "in XX minutes"
|
143 |
if "rate-limited" in error_message and "minutes" in error_message:
|
144 |
import re
|
145 |
match = re.search(r"in (\d+) minutes?", error_message)
|
|
|
149 |
time.sleep(minutes * 60)
|
150 |
attempt -= 1
|
151 |
continue
|
152 |
+
# 1時間待機
|
153 |
if "you can retry this action in about 1 hour" in error_message:
|
154 |
logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour...")
|
155 |
time.sleep(3600)
|
156 |
attempt -= 1
|
157 |
continue
|
158 |
+
# ファイル上限
|
159 |
if "over the limit of 100000 files" in error_message:
|
160 |
logger.warning("Repository file limit exceeded. Creating a new repository...")
|
161 |
self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
|
|
|
169 |
f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
|
170 |
)
|
171 |
else:
|
172 |
+
logger.error(f"Failed to upload after {max_retries} attempts: {encrypted_file_path}")
|
|
|
|
|
173 |
raise
|
174 |
|
175 |
+
def upload_folder_encrypted(self, folder_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
    """Encrypt a whole folder with rclone and upload the encrypted output.

    Because the folder is encrypted as a unit, its own name is encrypted too.

    Args:
        folder_path: Local folder handed to ``self.encrypt_with_rclone``.
        repo_id: Target repository; falls back to ``self.repo_ids['current']``
            when empty or ``None``.
        path_in_repo: Base path inside the repository, forwarded to
            ``self.upload_encrypted_files`` as ``base_path_in_repo``.

    Raises:
        Whatever ``encrypt_with_rclone`` or ``upload_encrypted_files`` raise;
        the exception is propagated unchanged after cleanup.
    """
    if not repo_id:
        repo_id = self.repo_ids['current']

    try:
        self.encrypt_with_rclone(folder_path)
        self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)
    finally:
        # Always drop the encrypted staging directory, even when encryption
        # or upload fails, so stale encrypted copies do not pile up on disk.
        if os.path.isdir(self.config.ENCRYPTED_DIR):
            shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
|
186 |
|
187 |
# ============================================================================
|
188 |
+
# 単一ファイルを暗号化アップロードしてローカル削除 (old_versions用)
|
189 |
# ============================================================================
|
190 |
+
def upload_file_encrypted_one_by_one(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: str = ""):
    """Encrypt a single file, upload it, then delete the local original.

    Args:
        file_path: Local file handed to ``self.encrypt_with_rclone``.
        repo_id: Target repository; falls back to ``self.repo_ids['current']``
            when empty or ``None``.
        path_in_repo: Base path inside the repository, forwarded to
            ``self.upload_encrypted_files`` as ``base_path_in_repo``.

    Raises:
        Whatever ``encrypt_with_rclone`` or ``upload_encrypted_files`` raise.
        In that case the local file is kept (it has not been uploaded), but
        the encrypted staging directory is still removed.
    """
    if not repo_id:
        repo_id = self.repo_ids['current']

    try:
        self.encrypt_with_rclone(file_path)
        self.upload_encrypted_files(repo_id, base_path_in_repo=path_in_repo)
    finally:
        # Remove the encrypted staging directory on success and on failure,
        # so a failed upload does not leave encrypted copies on disk.
        if os.path.isdir(self.config.ENCRYPTED_DIR):
            shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)

    # Only reached after a successful upload: delete the plain local file.
    if os.path.exists(file_path):
        os.remove(file_path)
|
206 |
|
207 |
+
@staticmethod
|
208 |
+
def increment_repo_name(repo_id: str) -> str:
|
209 |
+
match = re.search(r'(\d+)$', repo_id)
|
210 |
+
if match:
|
211 |
+
number = int(match.group(1)) + 1
|
212 |
+
return re.sub(r'\d+$', str(number), repo_id)
|
213 |
+
else:
|
214 |
+
return f"{repo_id}1"
|
215 |
|
216 |
# ============================================================================
|
217 |
+
# ログや model_list.log は生アップロード
|
218 |
# ============================================================================
|
219 |
def upload_file_raw(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
|
220 |
if repo_id is None:
|
|
|
244 |
repo_id = self.repo_ids['current']
|
245 |
continue
|
246 |
elif "you can retry this action in about 1 hour" in error_message:
|
247 |
+
logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
|
248 |
time.sleep(3600)
|
249 |
attempt -= 1
|
250 |
else:
|
|
|
255 |
raise
|
256 |
|
257 |
# ============================================================================
|
258 |
+
# ダウンロード処理
|
259 |
# ============================================================================
|
260 |
@staticmethod
|
261 |
def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
|
|
|
282 |
file.write(chunk)
|
283 |
|
284 |
logger.info(f"Download completed: {file_path}")
|
285 |
+
return file_path
|
286 |
|
287 |
# ============================================================================
|
288 |
+
# 古いバージョンのみ1ファイルずつアップロード
|
289 |
# ============================================================================
|
290 |
+
def download_old_versions_one_by_one(self, version_list: list, folder: str):
    """Download, upload and discard the files of every non-latest version.

    Only ``version_list[1:]`` is processed. Each file is downloaded into
    ``<folder>/old_versions`` via ``self.download_file`` and handed to
    ``self.upload_file_encrypted_one_by_one`` before the next one is fetched.
    Nothing happens when there is at most one version.
    """
    if len(version_list) <= 1:
        return

    staging_dir = os.path.join(folder, "old_versions")
    os.makedirs(staging_dir, exist_ok=True)

    # Lazily walk every file entry of the older versions, in order.
    pending = (
        entry
        for older in version_list[1:]
        for entry in older.get("files", [])
    )
    for entry in pending:
        url = entry["downloadUrl"]
        name = entry["name"]

        fetched = self.download_file(url, staging_dir, name)
        if fetched and os.path.exists(fetched):
            # An empty path_in_repo puts the encrypted file at the repository
            # top level; pass e.g. "old_versions" to group them instead.
            self.upload_file_encrypted_one_by_one(fetched, path_in_repo="")
        else:
            logger.error(f"Failed to download or file not found: {name}")

    # The staging folder should be empty by now; remove it.
    if os.path.exists(staging_dir):
        shutil.rmtree(staging_dir, ignore_errors=True)
|
316 |
+
|
317 |
+
# ============================================================================
|
318 |
+
# 従来どおり「最新バージョンのファイル一式 + images」フォルダを一括DL→アップロード
|
319 |
+
# ============================================================================
|
320 |
+
def download_model(self, model_versions: list, folder: str):
    """Download every file of the latest version into ``folder``.

    The latest version is taken to be ``model_versions[0]``. Each entry of
    its ``"files"`` list is fetched with ``self.download_file``; the outcome
    of every download is logged and failures do not stop the loop.
    """
    for entry in model_versions[0]["files"]:
        url = entry["downloadUrl"]
        name = entry["name"]

        saved_to = self.download_file(url, folder, name)
        if not saved_to or not os.path.exists(saved_to):
            logger.warning(f"Could not download {name}")
            continue
        logger.info(f"Downloaded {name}")
|
333 |
|
334 |
+
def download_images(self, model_versions: list, folder: str):
|
335 |
+
images_folder = os.path.join(folder, "images")
|
336 |
+
os.makedirs(images_folder, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
337 |
|
|
|
|
|
|
|
|
|
|
|
338 |
images = []
|
339 |
+
for version in model_versions:
|
340 |
+
for img in version.get("images", []):
|
341 |
+
images.append(img["url"])
|
342 |
|
343 |
for image_url in images:
|
344 |
+
image_name = os.path.basename(image_url) + ".png"
|
345 |
+
local_path = os.path.join(images_folder, image_name)
|
|
|
|
|
346 |
try:
|
347 |
resp = requests.get(image_url, stream=True)
|
348 |
resp.raise_for_status()
|
|
|
352 |
logger.info(f"Downloaded image: {local_path}")
|
353 |
except Exception as e:
|
354 |
logger.error(f"Error downloading image {image_url}: {e}")
|
|
|
355 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
def save_html_content(self, url: str, folder: str):
|
357 |
try:
|
358 |
response = requests.get(url)
|
|
|
360 |
html_path = os.path.join(folder, os.path.basename(folder) + ".html")
|
361 |
with open(html_path, 'w', encoding='utf-8') as file:
|
362 |
file.write(response.text)
|
|
|
363 |
except Exception as e:
|
364 |
logger.error(f"Error saving HTML content for URL {url}: {e}")
|
|
|
365 |
|
366 |
+
@staticmethod
|
367 |
+
def save_model_info(model_info: dict, folder: str):
|
368 |
+
with open(os.path.join(folder, "model_info.json"), "w", encoding="utf-8") as file:
|
369 |
+
json.dump(model_info, file, indent=2)
|
|
|
|
|
|
|
|
|
|
|
370 |
|
371 |
# ============================================================================
|
372 |
# model_list.log
|
|
|
387 |
logger.error(f"Failed to read model list: {e}")
|
388 |
return model_list
|
389 |
|
390 |
+
# ============================================================================
|
391 |
+
# model 情報取得
|
392 |
+
# ============================================================================
|
393 |
def get_model_info(self, model_id: str) -> dict:
    """Fetch the model description for ``model_id`` and return the parsed JSON.

    The URL is ``self.config.URLS["modelId"]`` followed by the id, requested
    with ``self.config.HEADERS``.

    Returns:
        The decoded JSON body, or an empty dict when the request fails, times
        out, returns an error status, or the body is not valid JSON.
    """
    url = self.config.URLS["modelId"] + str(model_id)
    try:
        # Without a timeout a stalled connection would block the crawl loop
        # forever; (connect, read) limits in seconds.
        resp = requests.get(url, headers=self.config.HEADERS, timeout=(10, 60))
        resp.raise_for_status()
        return resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
        return {}
|
402 |
+
|
403 |
+
# ============================================================================
|
404 |
+
# メイン処理: 最新ファイル + images はフォルダごとアップロード。old_versions は1ファイルずつ。
|
405 |
+
# ============================================================================
|
406 |
def process_model(self, model_url: str):
|
|
|
407 |
try:
|
408 |
model_id = model_url.rstrip("/").split("/")[-1]
|
409 |
model_info = self.get_model_info(model_id)
|
|
|
411 |
logger.error(f"No model_info returned for {model_id}")
|
412 |
return
|
413 |
|
414 |
+
model_versions = model_info.get("modelVersions", [])
|
415 |
+
if not model_versions:
|
416 |
+
logger.error(f"No modelVersions in model info {model_id}")
|
417 |
+
return
|
418 |
+
|
419 |
+
latest_version = model_versions[0]
|
420 |
+
model_file = next((file for file in latest_version["files"] if file.get('type') == 'Model'), None)
|
421 |
if model_file:
|
422 |
+
latest_filename = model_file['name']
|
423 |
folder = os.path.splitext(latest_filename)[0]
|
424 |
else:
|
|
|
425 |
first_file = latest_version["files"][0]
|
426 |
+
latest_filename = first_file['name']
|
427 |
folder = os.path.splitext(latest_filename)[0]
|
428 |
+
logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")
|
429 |
|
|
|
430 |
os.makedirs(folder, exist_ok=True)
|
431 |
|
432 |
+
# すでにアップ済みかどうか model_list.log でチェック (モデル名ベース)
|
433 |
model_list = self.read_model_list()
|
434 |
modelpage_name = model_info.get("name", f"Model_{model_id}")
|
435 |
if modelpage_name in model_list.values():
|
436 |
logger.info(f"Model '{modelpage_name}' already in model_list. Skipping.")
|
437 |
+
# 必要ならreturn
|
438 |
|
439 |
+
# 最新バージョン (まとめてダウンロード)
|
440 |
+
self.download_model(model_versions, folder)
|
|
|
441 |
|
442 |
+
# 画像 (imagesフォルダまるごとダウンロード)
|
443 |
+
self.download_images(model_versions, folder)
|
|
|
|
|
|
|
444 |
|
445 |
+
# HTML & model_info.json
|
446 |
+
self.save_html_content(self.config.URLS["modelPage"] + str(model_id), folder)
|
447 |
+
self.save_model_info(model_info, folder)
|
448 |
|
449 |
+
# 古いバージョンのみ「1つずつアップロード&削除」
|
450 |
+
self.download_old_versions_one_by_one(model_versions, folder)
|
451 |
|
452 |
+
# ↑で old_versions は空になった → あとはフォルダに残っているのは
|
453 |
+
# 最新バージョンファイル・imagesフォルダ・model_info.json・HTML など
|
454 |
|
455 |
+
# "folder" 自体を暗号化アップロード (= images フォルダごとアップロード)
|
456 |
+
# path_in_repo を "" にすればフォルダ名も暗号化される
|
457 |
+
self.upload_folder_encrypted(folder, path_in_repo="")
|
458 |
|
459 |
+
# ローカルフォルダ削除
|
460 |
if os.path.exists(folder):
|
461 |
shutil.rmtree(folder)
|
462 |
|
463 |
+
# model_list.log に追記
|
464 |
+
# HF上では folder名 も暗号化されるが、ここでは元の "modelpage_name" と
|
465 |
+
# HFへのトップフォルダ参照URLを書く
|
466 |
+
model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main"
|
|
|
467 |
with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
|
468 |
f.write(f"{modelpage_name}: {model_hf_url}\n")
|
469 |
|
470 |
except Exception as e:
|
471 |
logger.error(f"Unexpected error processing model ({model_url}): {e}")
|
472 |
|
473 |
+
# ============================================================================
|
474 |
+
# crawl
|
475 |
+
# ============================================================================
|
476 |
async def crawl(self):
|
|
|
477 |
while True:
|
478 |
try:
|
479 |
login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
|
480 |
|
481 |
+
# 最新の model_list.log & civitai_backup.log をダウンロード
|
482 |
model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
|
483 |
shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
|
484 |
|
485 |
local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
|
486 |
shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
|
487 |
|
488 |
+
# ログ読み込み
|
489 |
with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
|
490 |
lines = file.read().splitlines()
|
491 |
old_models = json.loads(lines[0]) if len(lines) > 0 else []
|
492 |
self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
|
493 |
|
494 |
+
# 新着モデル確認
|
495 |
response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
|
496 |
response.raise_for_status()
|
497 |
latest_models = response.json().get("items", [])
|
498 |
+
latest_model_ids = [m["id"] for m in latest_models if "id" in m]
|
499 |
|
500 |
+
# 差集合
|
501 |
new_models = list(set(latest_model_ids) - set(old_models))
|
502 |
|
503 |
if new_models:
|
504 |
logger.info(f"New models found: {new_models}")
|
505 |
model_id = new_models[0]
|
506 |
|
|
|
507 |
for attempt in range(1, 6):
|
508 |
try:
|
509 |
self.process_model(f"{self.config.URLS['modelId']}{model_id}")
|
|
|
515 |
else:
|
516 |
await asyncio.sleep(2)
|
517 |
else:
|
518 |
+
# 新モデルなし
|
519 |
with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
|
520 |
f.write(json.dumps(latest_model_ids) + "\n")
|
521 |
f.write(f"{self.repo_ids['current']}\n")
|
|
|
528 |
await asyncio.sleep(60)
|
529 |
continue
|
530 |
|
531 |
+
# 1件アップロードに成功したら old_models に追加
|
532 |
old_models.append(model_id)
|
533 |
with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
|
534 |
f.write(json.dumps(old_models) + "\n")
|
535 |
f.write(f"{self.repo_ids['current']}\n")
|
536 |
logger.info(f"Updated log file with new model ID: {model_id}")
|
537 |
|
538 |
+
# ログと model_list.log をアップ
|
539 |
self.upload_file_raw(self.config.LOG_FILE, self.repo_ids["log"], self.config.LOG_FILE)
|
540 |
self.upload_file_raw(self.config.LIST_FILE, self.repo_ids["model_list"], self.config.LIST_FILE)
|
541 |
|