ttttdiva committed on
Commit
8b6ea4e
·
verified ·
1 Parent(s): 28aa8ba

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +674 -669
main.py CHANGED
@@ -1,669 +1,674 @@
1
- import asyncio
2
- import base64
3
- import datetime
4
- import json
5
- import logging
6
- import os
7
- import re
8
- import shutil
9
- import subprocess
10
- import time
11
- from typing import Optional
12
-
13
- import requests
14
- from bs4 import BeautifulSoup
15
- from fake_useragent import UserAgent
16
- from fastapi import FastAPI
17
- from huggingface_hub import HfApi, create_repo, hf_hub_download, login
18
-
19
- # ロギングの設定
20
- logging.basicConfig(level=logging.INFO)
21
- logger = logging.getLogger(__name__)
22
-
23
-
24
class Config:
    """Static settings for the crawler: credentials, repo ids and endpoints."""

    # Both credentials are mandatory; a missing variable raises KeyError
    # while the class body is being evaluated.
    HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
    CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]

    # Local file names, also used as the file names inside the repos.
    LOG_FILE = "civitai_backup.log"
    LIST_FILE = "model_list.log"

    # "current" starts empty; the crawler fills it from the log file.
    REPO_IDS = dict(
        log="ttttdiva/CivitAI_log_test",
        model_list="ttttdiva/CivitAI_model_info_test",
        current="",
    )

    URLS = dict(
        latest="https://civitai.com/api/v1/models?sort=Newest",
        modelPage="https://civitai.com/models/",
        modelId="https://civitai.com/api/v1/models/",
        modelVersionId="https://civitai.com/api/v1/model-versions/",
        hash="https://civitai.com/api/v1/model-versions/by-hash/",
    )

    # Fixed UTC+9 offset used for the status timestamp.
    JST = datetime.timezone(datetime.timedelta(hours=9))
    UA = UserAgent()

    # Headers sent with CivitAI API requests.
    HEADERS = {
        "Authorization": f"Bearer {CIVITAI_API_TOKEN}",
        "User-Agent": "civitai-crawler/1.0",
        "Content-Type": "application/json",
    }

    # ===== rclone settings =====
    # Base64 text of an rclone.conf; empty when the variable is unset.
    RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
    # Local directory that is expected to receive the encrypted output
    # (the directory behind the "cryptLocal:" remote).
    ENCRYPTED_DIR = "/home/user/app/encrypted"
54
-
55
-
56
- class CivitAICrawler:
57
- """CivitAIからモデルをダウンロードし、Hugging Faceにアップロードするクラス"""
58
-
59
def __init__(self, config: Config):
    """Create the Hub client and FastAPI app, then set up rclone and routes.

    Args:
        config: Settings object providing REPO_IDS, JST and the rclone values.
    """
    self.config = config
    self.api = HfApi()
    self.app = FastAPI()

    # Copy the mapping so later changes to "current" stay on this instance.
    self.repo_ids = dict(config.REPO_IDS)
    self.jst = config.JST

    # Write rclone.conf first, then register the HTTP routes.
    self.setup_rclone_conf()
    self.setup_routes()
70
-
71
def setup_routes(self):
    """Register the status route and the startup hook on ``self.app``.

    ``GET /`` returns a short status text containing the current backup
    repository id and the current time in ``self.jst``. The startup hook
    schedules ``self.crawl()`` as a background task.
    """
    @self.app.get("/")
    def read_root():
        now = str(datetime.datetime.now(self.jst))
        description = f"""
        CivitAIを定期的に周回し新規モデルを {self.repo_ids['current']} にバックアップするSpaceです。
        model_list.log や civitai_backup.log は暗号化しないでアップロードします。
        モデルのフォルダやファイルは暗号化してアップロードします。
        Status: {now} + currently running :D
        """
        return description

    @self.app.on_event("startup")
    async def startup_event():
        # Keep a strong reference to the task: the event loop only holds a
        # weak reference, so an unreferenced task may be garbage-collected
        # before it finishes.
        self._crawl_task = asyncio.create_task(self.crawl())
87
-
88
- # =============================================================================
89
- # rclone の設定・暗号化アップロード処理
90
- # =============================================================================
91
def setup_rclone_conf(self):
    """Write rclone.conf from RCLONE_CONF_BASE64 and export RCLONE_CONFIG.

    When the configured base64 text is empty, only a warning is logged and
    nothing is written.
    """
    encoded_conf = self.config.RCLONE_CONF_BASE64
    if not encoded_conf:
        logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
        return

    conf_dir = ".rclone_config"
    os.makedirs(conf_dir, exist_ok=True)
    conf_path = os.path.join(conf_dir, "rclone.conf")

    with open(conf_path, "wb") as conf_file:
        conf_file.write(base64.b64decode(encoded_conf))

    # Subprocesses started later inherit this variable.
    os.environ["RCLONE_CONFIG"] = conf_path
    logger.info(f"[INFO] rclone.conf created at: {conf_path}")
104
-
105
def encrypt_with_rclone(self, local_path: str):
    """Copy a file or directory to the ``cryptLocal:`` rclone remote.

    The encrypted output directory is wiped first, then
    ``rclone copy <local_path> cryptLocal:<basename>`` is run.
    NOTE(review): whether file and folder names are encrypted depends on
    the rclone configuration, which is not visible here.

    Args:
        local_path: Existing file or directory to copy.

    Raises:
        FileNotFoundError: If ``local_path`` does not exist, or if the
            encrypted output directory is missing after the copy.
        subprocess.CalledProcessError: If rclone exits with a non-zero code.
    """
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")

    encrypted_dir = self.config.ENCRYPTED_DIR

    # Start from a clean output directory.
    if os.path.isdir(encrypted_dir):
        shutil.rmtree(encrypted_dir, ignore_errors=True)

    # A path made only of slashes has an empty basename; use a placeholder.
    remote_name = os.path.basename(local_path.rstrip("/")) or "unnamed"

    rclone_cmd = ["rclone", "copy", local_path, f"cryptLocal:{remote_name}", "-v"]
    logger.info(f"[INFO] Running: {' '.join(rclone_cmd)}")
    subprocess.run(rclone_cmd, check=True)
    logger.info(f"[OK] rclone copy => cryptLocal:{remote_name}")

    if os.path.isdir(encrypted_dir):
        return
    raise FileNotFoundError(
        f"[ERROR] {encrypted_dir} not found. Check your rclone config."
    )
130
-
131
- # 例: upload_encrypted_files の中の再試行処理
132
def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
    """Upload every file under the encrypted directory to a Hub repository.

    Each file is uploaded to ``base_path_in_repo`` plus its path relative to
    the encrypted directory, always joined with forward slashes. Generic
    failures are retried up to five times per file. Rate-limit errors wait
    and retry without using up an attempt. When the repository file limit is
    hit, a new private repository is created and used from then on.

    Args:
        repo_id: Target repository id.
        base_path_in_repo: Optional folder prefix inside the repository.

    Raises:
        Exception: The last upload error once the retries for a file are
            exhausted.
    """
    max_retries = 5
    encrypted_dir = self.config.ENCRYPTED_DIR

    for root, _dirs, files in os.walk(encrypted_dir):
        for fn in files:
            encrypted_file_path = os.path.join(root, fn)
            if not os.path.isfile(encrypted_file_path):
                continue

            # Repository paths use "/", whatever the local separator is.
            relative_path = os.path.relpath(encrypted_file_path, encrypted_dir)
            relative_path = relative_path.replace(os.sep, "/")
            if base_path_in_repo:
                upload_path_in_repo = f"{base_path_in_repo.rstrip('/')}/{relative_path}"
            else:
                upload_path_in_repo = relative_path

            attempt = 0
            while attempt < max_retries:
                try:
                    self.api.upload_file(
                        path_or_fileobj=encrypted_file_path,
                        repo_id=repo_id,
                        path_in_repo=upload_path_in_repo
                    )
                    logger.info(f"[OK] Uploaded => {repo_id}/{upload_path_in_repo}")
                    break

                except Exception as e:
                    attempt += 1
                    error_message = str(e)

                    # Rate limit, e.g. "You have been rate-limited; you can
                    # retry this action in 31 minutes.": wait that long plus
                    # one minute, then retry without counting the attempt.
                    if "rate-limited" in error_message and "minutes" in error_message:
                        match = re.search(r"in (\d+) minutes?", error_message)
                        if match:
                            minutes = int(match.group(1)) + 1
                            logger.warning(f"Rate-limited. Waiting {minutes} minutes before retry...")
                            time.sleep(minutes * 60)
                            attempt -= 1
                            continue

                    # "About 1 hour" rate limit: wait, do not count the attempt.
                    if "you can retry this action in about 1 hour" in error_message:
                        logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
                        time.sleep(3600)
                        attempt -= 1
                        continue

                    # Repository is full: switch to the next numbered repo
                    # and restart the attempt counter for this file.
                    if "over the limit of 100000 files" in error_message:
                        logger.warning("Repository file limit exceeded. Creating a new repository...")
                        self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                        self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                        attempt = 0
                        repo_id = self.repo_ids['current']
                        continue

                    # Any other error counts against the retry budget.
                    if attempt < max_retries:
                        logger.warning(
                            f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
                        )
                    else:
                        logger.error(
                            f"Failed to upload after {max_retries} attempts: {encrypted_file_path}"
                        )
                        raise
202
-
203
@staticmethod
def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
    """Extract the file name from a Content-Disposition header value.

    Args:
        content_disposition: Raw header value, or None when it is absent.
        default_name: Name returned when the header has no usable
            ``filename=`` parameter.

    Returns:
        The file name with surrounding whitespace and double quotes removed,
        or ``default_name``.
    """
    if content_disposition:
        for part in content_disposition.split(';'):
            if "filename=" in part:
                # Split on the marker itself so "=" inside the name is kept.
                candidate = part.split("filename=", 1)[1].strip().strip('"')
                # An empty name cannot be used as a file name; keep looking.
                if candidate:
                    return candidate
    return default_name
211
-
212
def download_file(self, url: str, destination_folder: str, default_name: str):
    """Stream ``url`` into ``destination_folder`` and return the saved path.

    The file name comes from the Content-Disposition header when present,
    otherwise ``default_name`` is used.

    Args:
        url: Download URL, requested with the configured headers.
        destination_folder: Existing directory that receives the file.
        default_name: Fallback file name.

    Returns:
        Path of the written file.

    Raises:
        requests.RequestException: On connection errors, timeouts or HTTP
            error statuses. The error is logged and re-raised so that the
            retry loops in the callers can count the failure instead of
            reporting success.
    """
    try:
        # The context manager releases the connection even if writing fails.
        # The timeout keeps a stalled server from hanging the crawler.
        with requests.get(
            url, headers=self.config.HEADERS, stream=True, timeout=(10, 120)
        ) as response:
            response.raise_for_status()

            filename = self.get_filename_from_cd(
                response.headers.get('content-disposition'), default_name
            )
            # The header is server-controlled: drop any directory components
            # so the file cannot land outside destination_folder.
            filename = os.path.basename(filename) or default_name
            file_path = os.path.join(destination_folder, filename)

            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
    except requests.RequestException as e:
        logger.error(f"Failed to download file from {url}: {e}")
        raise

    logger.info(f"Download completed: {file_path}")
    return file_path
227
-
228
def get_model_info(self, model_id: str) -> dict:
    """Fetch the metadata of one model from the CivitAI API.

    Args:
        model_id: Model id appended to the ``modelId`` endpoint URL.

    Returns:
        The decoded JSON response, or an empty dict when the request fails
        or the body is not valid JSON. An empty dict keeps the declared
        return type, so callers can use ``.get`` without a None check.
    """
    try:
        response = requests.get(
            self.config.URLS["modelId"] + str(model_id),
            headers=self.config.HEADERS,
            timeout=30,
        )
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
        return {}
235
-
236
def download_model(self, model_versions: list, folder: str):
    """Download every file of the newest version into ``folder``.

    Only ``model_versions[0]`` is handled here. Each file gets up to five
    tries; a try fails when ``download_file`` raises or when a file named
    "login" shows up in ``folder`` (that file is deleted before retrying).
    After five failures a ``<name>.download_failed`` marker file is written.

    Args:
        model_versions: Version dicts; the first must contain "files".
        folder: Existing destination directory.
    """
    for entry in model_versions[0]["files"]:
        download_url = entry["downloadUrl"]
        file_name = entry["name"]

        failures = 0
        succeeded = False
        while not succeeded and failures < 5:
            try:
                self.download_file(download_url, folder, file_name)
            except Exception as e:
                logger.error(f"Exception occurred while downloading {file_name}: {e}")
                failures += 1
                continue

            if "login" not in os.listdir(folder):
                logger.info(f"Successfully downloaded {file_name}")
                succeeded = True
            else:
                failures += 1
                logger.warning(f"'login' file found. Will try again. ({failures}/5)")
                os.remove(os.path.join(folder, "login"))

        if succeeded:
            continue

        # All tries failed: leave a marker file in place of the download.
        dummy_file_name = f"{file_name}.download_failed"
        try:
            with open(os.path.join(folder, dummy_file_name), "w") as marker:
                marker.write("Download failed after 5 attempts.")
            logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
        except Exception as e:
            logger.error(f"Failed to create dummy file for {file_name}: {e}")
275
-
276
def download_old_version_file_and_upload(self, file_info, parent_folder: str, encrypted_top_name: str):
    """Download one old-version file, upload it encrypted, then delete it.

    The file is downloaded into ``<parent_folder>/old_versions`` with up to
    five tries, using the same "login" file check as ``download_model``.
    On success it is passed to ``encrypt_with_rclone`` and uploaded to the
    current repository under ``encrypted_top_name``. Upload errors are
    logged, not raised. The local plain file is removed at the end.

    Args:
        file_info: Dict with at least "name" and "downloadUrl".
        parent_folder: Directory in which "old_versions" is created.
        encrypted_top_name: Base path inside the repository.
    """
    file_name = file_info["name"]
    download_url = file_info["downloadUrl"]

    old_versions_folder = os.path.join(parent_folder, "old_versions")
    os.makedirs(old_versions_folder, exist_ok=True)
    local_path = os.path.join(old_versions_folder, file_name)

    failures = 0
    downloaded = False
    while not downloaded and failures < 5:
        try:
            self.download_file(download_url, old_versions_folder, file_name)
        except Exception as e:
            logger.error(f"Exception while downloading old file {file_name}: {e}")
            failures += 1
            continue

        # A file literally named "login" is treated as a failed download.
        if "login" not in os.listdir(old_versions_folder):
            logger.info(f"Successfully downloaded old version file: {file_name}")
            downloaded = True
        else:
            failures += 1
            logger.warning(f"'login' file found while downloading {file_name}. Retry {failures}/5.")
            os.remove(os.path.join(old_versions_folder, "login"))

    if not downloaded:
        # Five failures: leave a marker file and stop.
        dummy_file_name = f"{file_name}.download_failed"
        try:
            with open(os.path.join(old_versions_folder, dummy_file_name), "w") as marker:
                marker.write("Download failed after 5 attempts.")
            logger.error(f"Failed to download {file_name} -> created dummy: {dummy_file_name}")
        except Exception as e:
            logger.error(f"Failed to create dummy file for old version {file_name}: {e}")
        return

    # Encrypt and upload just this one file.
    try:
        self.encrypt_with_rclone(local_path)
        self.upload_encrypted_files(
            repo_id=self.repo_ids["current"],
            base_path_in_repo=encrypted_top_name,
        )
        logger.info(f"Uploaded old version file: {file_name} into {encrypted_top_name}")
    except Exception as e:
        logger.error(f"Error uploading old version file {file_name}: {e}")

    # Remove the plain file whether or not the upload worked.
    if os.path.exists(local_path):
        os.remove(local_path)
        logger.info(f"Removed local old version file: {local_path}")
340
-
341
def download_images(self, model_versions: list, folder: str):
    """Save the preview images of all versions into ``<folder>/images``.

    Each image is stored as the last URL path segment with ".png" appended.
    Request failures are logged and the image is skipped.

    Args:
        model_versions: Version dicts; the optional "images" entry holds
            dicts with a "url" key.
        folder: Directory in which the "images" folder is created.
    """
    images_folder = os.path.join(folder, "images")
    os.makedirs(images_folder, exist_ok=True)

    # Gather every URL first, in version order.
    image_urls = [
        image["url"]
        for version in model_versions
        for image in version.get("images", [])
    ]

    for image_url in image_urls:
        image_name = image_url.rsplit("/", 1)[-1]
        target_path = os.path.join(images_folder, f"{image_name}.png")
        try:
            response = requests.get(image_url)
            response.raise_for_status()
            with open(target_path, "wb") as file:
                file.write(response.content)
        except requests.RequestException as e:
            logger.error(f"Error downloading image {image_url}: {e}")
360
-
361
def save_html_content(self, url: str, folder: str):
    """Fetch ``url`` and store the response text as ``<folder>/<folder>.html``.

    Any error (request or file) is logged and swallowed.

    Args:
        url: Page to fetch.
        folder: Destination directory; its name is also the file stem.
    """
    try:
        page = requests.get(url)
        page.raise_for_status()
        target_path = os.path.join(folder, f"{folder}.html")
        with open(target_path, 'w', encoding='utf-8') as html_file:
            html_file.write(page.text)
    except Exception as e:
        logger.error(f"Error saving HTML content for URL {url}: {e}")
370
-
371
@staticmethod
def save_model_info(model_info: dict, folder: str):
    """Write ``model_info`` to ``<folder>/model_info.json`` with 2-space indent.

    Args:
        model_info: JSON-serialisable model metadata.
        folder: Existing destination directory.
    """
    info_path = os.path.join(folder, "model_info.json")
    with open(info_path, "w") as info_file:
        json.dump(model_info, info_file, indent=2)
375
-
376
@staticmethod
def increment_repo_name(repo_id: str) -> str:
    """Return ``repo_id`` with its trailing number increased by one.

    When the id does not end in digits, "1" is appended.

    Args:
        repo_id: Repository id such as "user/repo3".

    Returns:
        The next repository id, e.g. "user/repo4".
    """
    trailing = re.search(r'(\d+)$', repo_id)
    if trailing is None:
        return f"{repo_id}1"

    bumped = int(trailing.group(1)) + 1
    # Replace only the matched digits; text on either side is kept as is.
    return repo_id[:trailing.start(1)] + str(bumped) + repo_id[trailing.end(1):]
384
-
385
- # =============================================================================
386
- # 暗号化しないアップロード(ログや model_list.log 用)
387
- # =============================================================================
388
def upload_file_raw(
    self,
    file_path: str,
    repo_id: Optional[str] = None,
    path_in_repo: Optional[str] = None
):
    """Upload one file to the Hub without encrypting it.

    Generic failures are retried up to five times. A "retry in about 1 hour"
    error waits an hour without using up an attempt. When the repository
    file limit is hit, a new private repository is created and used.

    Args:
        file_path: Local file to upload.
        repo_id: Target repository; defaults to the current backup repo.
        path_in_repo: Destination path; defaults to the file's base name.

    Raises:
        Exception: The last upload error once the retries are exhausted.
    """
    target_repo = self.repo_ids['current'] if repo_id is None else repo_id
    destination = os.path.basename(file_path) if path_in_repo is None else path_in_repo

    max_retries = 5
    failures = 0
    while failures < max_retries:
        try:
            self.api.upload_file(
                path_or_fileobj=file_path,
                repo_id=target_repo,
                path_in_repo=destination,
            )
        except Exception as e:
            error_message = str(e)

            if "over the limit of 100000 files" in error_message:
                logger.warning("Repository file limit exceeded, creating a new repository.")
                self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                failures = 0
                target_repo = self.repo_ids['current']
                continue

            if "you can retry this action in about 1 hour" in error_message:
                logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
                time.sleep(3600)
                # The wait does not count against the retry budget.
                continue

            failures += 1
            if failures >= max_retries:
                logger.error(f"Failed to upload raw file after {max_retries} attempts: {file_path}")
                raise
            logger.warning(f"Failed to upload raw file {file_path}, retry {failures}/{max_retries}...")
        else:
            logger.info(f"[OK] Uploaded {file_path} => {target_repo}/{destination}")
            return
430
-
431
- # =============================================================================
432
- # 暗号化してアップロード (単ファイル)
433
- # =============================================================================
434
def upload_file_encrypted(
    self,
    file_path: str,
    repo_id: Optional[str] = None,
    path_in_repo: Optional[str] = None
):
    """Encrypt a single file with rclone and upload the result.

    Args:
        file_path: Local file to encrypt.
        repo_id: Target repository; defaults to the current backup repo.
        path_in_repo: Base path inside the repository; empty when omitted.
    """
    target_repo = self.repo_ids['current'] if repo_id is None else repo_id
    base_path = path_in_repo if path_in_repo else ""

    self.encrypt_with_rclone(file_path)
    self.upload_encrypted_files(repo_id=target_repo, base_path_in_repo=base_path)

    # Remove the encrypted copy once the upload is done.
    encrypted_dir = self.config.ENCRYPTED_DIR
    if os.path.isdir(encrypted_dir):
        shutil.rmtree(encrypted_dir, ignore_errors=True)
449
-
450
- # =============================================================================
451
- # 暗号化してアップロード (フォルダ)
452
- # =============================================================================
453
def upload_folder_encrypted(
    self,
    folder_path: str,
    repo_id: Optional[str] = None,
    path_in_repo: Optional[str] = None
) -> str:
    """Encrypt a folder with rclone, upload it and return its encrypted name.

    Args:
        folder_path: Local folder to encrypt.
        repo_id: Target repository; defaults to the current backup repo.
        path_in_repo: Base path inside the repository; empty when omitted.

    Returns:
        Name of the first top-level directory found in the encrypted output.

    Raises:
        RuntimeError: If the encrypted output has no top-level directory.
    """
    target_repo = self.repo_ids['current'] if repo_id is None else repo_id
    base_path = path_in_repo if path_in_repo else ""
    encrypted_dir = self.config.ENCRYPTED_DIR

    self.encrypt_with_rclone(folder_path)

    # Only directories count as top-level folders; loose files are ignored.
    top_levels = []
    for entry in os.listdir(encrypted_dir):
        if os.path.isdir(os.path.join(encrypted_dir, entry)):
            top_levels.append(entry)

    if not top_levels:
        raise RuntimeError("No top-level folder found after rclone encryption.")
    if len(top_levels) > 1:
        logger.warning(f"Multiple top-level folders found after encryption? {top_levels}. Using the first one.")
    encrypted_top_name = top_levels[0]

    self.upload_encrypted_files(repo_id=target_repo, base_path_in_repo=base_path)

    # Remove the encrypted copy once the upload is done.
    if os.path.isdir(encrypted_dir):
        shutil.rmtree(encrypted_dir, ignore_errors=True)

    return encrypted_top_name
482
-
483
- # =============================================================================
484
- # model_list.log の読み書きを「model_id: model_hf_url」で扱うよう変更
485
- # =============================================================================
486
def read_model_list(self):
    """Parse the model list file into a dict.

    Every non-empty line is split at the first ": ". Lines without that
    separator are skipped, and a repeated key keeps the last value.

    Returns:
        Mapping of the text before the first ": " to the text after it
        (e.g. ``{"123456": "https://huggingface.co/..."}``), or an empty
        dict when the file cannot be read.
    """
    try:
        with open(self.config.LIST_FILE, "r", encoding="utf-8") as list_file:
            stripped = (raw.strip() for raw in list_file)
            pairs = (entry.split(": ", 1) for entry in stripped if entry)
            return {pair[0]: pair[1] for pair in pairs if len(pair) == 2}
    except Exception as e:
        logger.error(f"Failed to read model list: {e}")
        return {}
507
-
508
def process_model(self, model_url: str):
    """Back up one model: newest version as a folder, old versions per file.

    Steps:
      1. Fetch the model info and derive the local folder name from the
         newest version's "Model"-type file (or its first file).
      2. Download the newest version, the images, the page and the info
         JSON into that folder, then upload the folder encrypted.
      3. Download each old-version file one at a time and upload it under
         the same encrypted top-level folder.
      4. Append a line to the model list file.

    All errors are logged here and never propagate to the caller.

    Args:
        model_url: URL whose last path segment is the model id.
    """
    try:
        model_id = model_url.rstrip("/").split("/")[-1]
        model_info = self.get_model_info(model_id)
        if not model_info:
            # get_model_info has already logged the request failure.
            logger.warning(f"No model info available for model ID {model_id}")
            return

        model_versions = model_info.get("modelVersions", [])
        if not model_versions:
            logger.warning(f"No versions found for model ID {model_id}")
            return

        # 1) Folder name: stem of the "Model" file, else of the first file.
        latest_version = model_versions[0]
        latest_files = latest_version.get("files") or []
        if not latest_files:
            logger.warning(f"Latest version of model ID {model_id} has no files")
            return

        model_file = next(
            (file for file in latest_files if file.get('type') == 'Model'),
            None
        )
        if model_file is None:
            model_file = latest_files[0]
            logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")
        folder = os.path.splitext(model_file['name'])[0]

        os.makedirs(folder, exist_ok=True)

        # 2) Newest version plus images, page HTML and info JSON.
        self.download_model(model_versions, folder)
        self.download_images(model_versions, folder)
        self.save_html_content(model_url, folder)
        self.save_model_info(model_info, folder)

        encrypted_top_name = self.upload_folder_encrypted(folder)
        logger.info(f"[MAIN] Uploaded latest version folder => {encrypted_top_name}")

        # Remove the local folder.
        shutil.rmtree(folder, ignore_errors=True)

        # 3) Old versions, one file at a time. The helper takes
        #    (file_info, parent_folder, encrypted_top_name); it recreates
        #    "<folder>/old_versions" as scratch space.
        for version in model_versions[1:]:
            for file_info in version.get("files", []):
                self.download_old_version_file_and_upload(
                    file_info, folder, encrypted_top_name
                )
        # Remove the scratch folder recreated by the old-version downloads.
        shutil.rmtree(folder, ignore_errors=True)

        # 4) Record the model in the model list file.
        # NOTE(review): read_model_list documents the key as the model id,
        # but the display name is written here - confirm which is intended.
        modelpage_name = model_info.get("name", f"Model_{model_id}")
        model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{encrypted_top_name}"

        with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
            f.write(f"{modelpage_name}: {model_hf_url}\n")

    except Exception as e:
        logger.error(f"Unexpected error in process_model ({model_url}): {e}")
579
-
580
async def crawl(self):
    """Poll CivitAI for new models forever and back up one per iteration.

    Each pass logs in to the Hub, downloads the model list and the log
    file, reads the known model ids and the current repo id from the log,
    and compares them with the newest models reported by CivitAI. With no
    new model, the log is rewritten with the latest ids and uploaded, then
    the loop sleeps 60 seconds. Otherwise one new model is processed (up to
    five attempts) and both files are uploaded. Any error in a pass is
    logged and followed by a 300 second sleep.
    """
    def persist_state(model_ids):
        # Log file layout: line 1 = JSON list of ids, line 2 = current repo.
        with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
            f.write(json.dumps(model_ids) + "\n")
            f.write(f"{self.repo_ids['current']}\n")

    def push_log():
        self.upload_file_raw(
            file_path=self.config.LOG_FILE,
            repo_id=self.repo_ids["log"],
            path_in_repo=self.config.LOG_FILE,
        )

    while True:
        try:
            login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)

            # Fetch model_list.log and civitai_backup.log into the cwd.
            model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
            shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")

            local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
            shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")

            # Restore the state from the log file.
            with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
                lines = file.read().splitlines()
            old_models = json.loads(lines[0]) if len(lines) > 0 else []
            self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

            # Ask CivitAI for the newest models.
            response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
            response.raise_for_status()
            latest_models = response.json().get("items", [])
            latest_model_ids = [item.get("id") for item in latest_models if "id" in item]

            # Ids not seen before.
            new_models = list(set(latest_model_ids) - set(old_models))

            if not new_models:
                persist_state(latest_model_ids)
                logger.info(f"Updated log file: {self.config.LOG_FILE}")

                push_log()
                logger.info("Uploaded log file to repository (unencrypted).")

                logger.info("No new models found.")
                await asyncio.sleep(60)
                continue

            logger.info(f"New models found: {new_models}")
            model_id = new_models[0]

            for attempt in range(1, 6):
                try:
                    self.process_model(f"{self.config.URLS['modelId']}{model_id}")
                    break
                except Exception as e:
                    logger.error(f"Failed to process model ID {model_id} (Attempt {attempt}/5): {e}")
                    if attempt < 5:
                        await asyncio.sleep(2)
                        continue
                    logger.error(f"Skipping model ID {model_id} after 5 failed attempts.")

            # Remember the processed id and publish both files.
            old_models.append(model_id)
            persist_state(old_models)
            logger.info(f"Updated log file with new model ID: {model_id}")

            push_log()
            self.upload_file_raw(
                file_path=self.config.LIST_FILE,
                repo_id=self.repo_ids["model_list"],
                path_in_repo=self.config.LIST_FILE,
            )

        except Exception as e:
            logger.error(f"Error during crawling: {e}")
            await asyncio.sleep(300)
664
-
665
-
666
- # 実行
667
- config = Config()
668
- crawler = CivitAICrawler(config)
669
- app = crawler.app
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import datetime
4
+ import json
5
+ import logging
6
+ import os
7
+ import re
8
+ import shutil
9
+ import subprocess
10
+ import time
11
+ from typing import Optional
12
+
13
+ import requests
14
+ from bs4 import BeautifulSoup
15
+ from fake_useragent import UserAgent
16
+ from fastapi import FastAPI
17
+ from huggingface_hub import HfApi, create_repo, hf_hub_download, login
18
+
19
+ # ロギングの設定
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class Config:
    """Static settings for the crawler: credentials, repo ids and endpoints."""

    # Both credentials are mandatory; a missing variable raises KeyError
    # while the class body is being evaluated.
    HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
    CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]

    # Local file names, also used as the file names inside the repos.
    LOG_FILE = "civitai_backup.log"
    LIST_FILE = "model_list.log"

    # "current" starts empty; the crawler fills it from the log file.
    REPO_IDS = dict(
        log="tktkdrrrrrrrrrrr/CivitAI_log",
        model_list="tktkdrrrrrrrrrrr/CivitAI_model_info",
        current="",
    )

    URLS = dict(
        latest="https://civitai.com/api/v1/models?sort=Newest",
        modelPage="https://civitai.com/models/",
        modelId="https://civitai.com/api/v1/models/",
        modelVersionId="https://civitai.com/api/v1/model-versions/",
        hash="https://civitai.com/api/v1/model-versions/by-hash/",
    )

    # Fixed UTC+9 offset used for the status timestamp.
    JST = datetime.timezone(datetime.timedelta(hours=9))
    UA = UserAgent()

    # Headers sent with CivitAI API requests.
    HEADERS = {
        "Authorization": f"Bearer {CIVITAI_API_TOKEN}",
        "User-Agent": "civitai-crawler/1.0",
        "Content-Type": "application/json",
    }

    # ===== rclone settings =====
    # Base64 text of an rclone.conf; empty when the variable is unset.
    RCLONE_CONF_BASE64 = os.environ.get("RCLONE_CONF_BASE64", "")
    # Local directory that is expected to receive the encrypted output
    # (the directory behind the "cryptLocal:" remote).
    ENCRYPTED_DIR = "/home/user/app/encrypted"
54
+
55
+
56
+ class CivitAICrawler:
57
+ """CivitAIからモデルをダウンロードし、Hugging Faceにアップロードするクラス"""
58
+
59
def __init__(self, config: Config):
    """Create the Hub client and FastAPI app, then set up rclone and routes.

    Args:
        config: Settings object providing REPO_IDS, JST and the rclone values.
    """
    self.config = config
    self.api = HfApi()
    self.app = FastAPI()

    # Copy the mapping so later changes to "current" stay on this instance.
    self.repo_ids = dict(config.REPO_IDS)
    self.jst = config.JST

    # Write rclone.conf first, then register the HTTP routes.
    self.setup_rclone_conf()
    self.setup_routes()
70
+
71
def setup_routes(self):
    """Register the status route and the startup hook on ``self.app``.

    ``GET /`` returns a short status text containing the current backup
    repository id and the current time in ``self.jst``. The startup hook
    schedules ``self.crawl()`` as a background task.
    """
    @self.app.get("/")
    def read_root():
        now = str(datetime.datetime.now(self.jst))
        description = f"""
        CivitAIを定期的に周回し新規モデルを {self.repo_ids['current']} にバックアップするSpaceです。
        model_list.log や civitai_backup.log は暗号化しないでアップロードします。
        モデルのフォルダやファイルは暗号化してアップロードします。
        Status: {now} + currently running :D
        """
        return description

    @self.app.on_event("startup")
    async def startup_event():
        # Keep a strong reference to the task: the event loop only holds a
        # weak reference, so an unreferenced task may be garbage-collected
        # before it finishes.
        self._crawl_task = asyncio.create_task(self.crawl())
87
+
88
+ # =============================================================================
89
+ # rclone の設定・暗号化アップロード処理
90
+ # =============================================================================
91
def setup_rclone_conf(self):
    """Write rclone.conf from RCLONE_CONF_BASE64 and export RCLONE_CONFIG.

    When the configured base64 text is empty, only a warning is logged and
    nothing is written.
    """
    encoded_conf = self.config.RCLONE_CONF_BASE64
    if not encoded_conf:
        logger.warning("[WARN] RCLONE_CONF_BASE64 is empty. rclone may fail.")
        return

    conf_dir = ".rclone_config"
    os.makedirs(conf_dir, exist_ok=True)
    conf_path = os.path.join(conf_dir, "rclone.conf")

    with open(conf_path, "wb") as conf_file:
        conf_file.write(base64.b64decode(encoded_conf))

    # Subprocesses started later inherit this variable.
    os.environ["RCLONE_CONFIG"] = conf_path
    logger.info(f"[INFO] rclone.conf created at: {conf_path}")
104
+
105
def encrypt_with_rclone(self, local_path: str):
    """Copy a file or directory to the ``cryptLocal:`` rclone remote.

    The encrypted output directory is wiped first, then
    ``rclone copy <local_path> cryptLocal:<basename>`` is run.
    NOTE(review): whether file and folder names are encrypted depends on
    the rclone configuration, which is not visible here.

    Args:
        local_path: Existing file or directory to copy.

    Raises:
        FileNotFoundError: If ``local_path`` does not exist, or if the
            encrypted output directory is missing after the copy.
        subprocess.CalledProcessError: If rclone exits with a non-zero code.
    """
    if not os.path.exists(local_path):
        raise FileNotFoundError(f"[ERROR] Local path not found: {local_path}")

    encrypted_dir = self.config.ENCRYPTED_DIR

    # Start from a clean output directory.
    if os.path.isdir(encrypted_dir):
        shutil.rmtree(encrypted_dir, ignore_errors=True)

    # A path made only of slashes has an empty basename; use a placeholder.
    remote_name = os.path.basename(local_path.rstrip("/")) or "unnamed"

    rclone_cmd = ["rclone", "copy", local_path, f"cryptLocal:{remote_name}", "-v"]
    logger.info(f"[INFO] Running: {' '.join(rclone_cmd)}")
    subprocess.run(rclone_cmd, check=True)
    logger.info(f"[OK] rclone copy => cryptLocal:{remote_name}")

    if os.path.isdir(encrypted_dir):
        return
    raise FileNotFoundError(
        f"[ERROR] {encrypted_dir} not found. Check your rclone config."
    )
130
+
131
+ # 例: upload_encrypted_files の中の再試行処理
132
def upload_encrypted_files(self, repo_id: str, base_path_in_repo: str = ""):
    """Upload every file under the encrypted directory to a Hub repository.

    Each file is uploaded to ``base_path_in_repo`` plus its path relative to
    the encrypted directory, always joined with forward slashes. Generic
    failures are retried up to five times per file. Rate-limit errors wait
    and retry without using up an attempt. When the repository file limit is
    hit, a new private repository is created and used from then on.

    Args:
        repo_id: Target repository id.
        base_path_in_repo: Optional folder prefix inside the repository.

    Raises:
        Exception: The last upload error once the retries for a file are
            exhausted.
    """
    max_retries = 5
    encrypted_dir = self.config.ENCRYPTED_DIR

    for root, _dirs, files in os.walk(encrypted_dir):
        for fn in files:
            encrypted_file_path = os.path.join(root, fn)
            if not os.path.isfile(encrypted_file_path):
                continue

            # Repository paths use "/", whatever the local separator is.
            relative_path = os.path.relpath(encrypted_file_path, encrypted_dir)
            relative_path = relative_path.replace(os.sep, "/")
            if base_path_in_repo:
                upload_path_in_repo = f"{base_path_in_repo.rstrip('/')}/{relative_path}"
            else:
                upload_path_in_repo = relative_path

            attempt = 0
            while attempt < max_retries:
                try:
                    self.api.upload_file(
                        path_or_fileobj=encrypted_file_path,
                        repo_id=repo_id,
                        path_in_repo=upload_path_in_repo
                    )
                    logger.info(f"[OK] Uploaded => {repo_id}/{upload_path_in_repo}")
                    break

                except Exception as e:
                    attempt += 1
                    error_message = str(e)

                    # Rate limit, e.g. "You have been rate-limited; you can
                    # retry this action in 31 minutes.": wait that long plus
                    # one minute, then retry without counting the attempt.
                    if "rate-limited" in error_message and "minutes" in error_message:
                        match = re.search(r"in (\d+) minutes?", error_message)
                        if match:
                            minutes = int(match.group(1)) + 1
                            logger.warning(f"Rate-limited. Waiting {minutes} minutes before retry...")
                            time.sleep(minutes * 60)
                            attempt -= 1
                            continue

                    # "About 1 hour" rate limit: wait, do not count the attempt.
                    if "you can retry this action in about 1 hour" in error_message:
                        logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
                        time.sleep(3600)
                        attempt -= 1
                        continue

                    # Repository is full: switch to the next numbered repo
                    # and restart the attempt counter for this file.
                    if "over the limit of 100000 files" in error_message:
                        logger.warning("Repository file limit exceeded. Creating a new repository...")
                        self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                        self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                        attempt = 0
                        repo_id = self.repo_ids['current']
                        continue

                    # Any other error counts against the retry budget.
                    if attempt < max_retries:
                        logger.warning(
                            f"Failed to upload {encrypted_file_path}, retry {attempt}/{max_retries}..."
                        )
                    else:
                        logger.error(
                            f"Failed to upload after {max_retries} attempts: {encrypted_file_path}"
                        )
                        raise
202
+
203
@staticmethod
def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
    """Extract the file name from a Content-Disposition header value.

    Args:
        content_disposition: Raw header value, or None when it is absent.
        default_name: Name returned when the header has no usable
            ``filename=`` parameter.

    Returns:
        The file name with surrounding whitespace and double quotes removed,
        or ``default_name``.
    """
    if content_disposition:
        for part in content_disposition.split(';'):
            if "filename=" in part:
                # Split on the marker itself so "=" inside the name is kept.
                candidate = part.split("filename=", 1)[1].strip().strip('"')
                # An empty name cannot be used as a file name; keep looking.
                if candidate:
                    return candidate
    return default_name
211
+
212
def download_file(self, url: str, destination_folder: str, default_name: str):
    """Stream ``url`` into a file inside ``destination_folder``.

    The file name is taken from the response's ``Content-Disposition``
    header via ``get_filename_from_cd``, falling back to ``default_name``.

    Request failures (connection error, timeout, non-2xx status) are logged
    and the method returns without writing anything; it does not raise for
    them. Errors raised while reading the body or writing the file
    propagate to the caller.

    Args:
        url: Download URL.
        destination_folder: Existing directory that receives the file.
        default_name: File name used when the server does not provide one.
    """
    response = None
    try:
        # (connect, read) timeouts: without them a stalled server would
        # block the crawler indefinitely.
        response = requests.get(
            url,
            headers=self.config.HEADERS,
            stream=True,
            timeout=(10, 300),
        )
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to download file from {url}: {e}")
        if response is not None:
            response.close()
        return

    try:
        filename = self.get_filename_from_cd(response.headers.get('content-disposition'), default_name)
        file_path = os.path.join(destination_folder, filename)

        with open(file_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        response.close()

    logger.info(f"Download completed: {file_path}")
227
+
228
def get_model_info(self, model_id: str) -> Optional[dict]:
    """Fetch the CivitAI metadata for ``model_id``.

    Args:
        model_id: Model identifier appended to the ``modelId`` API URL.

    Returns:
        The decoded JSON response, or ``None`` when the request fails
        (the failure is logged, not raised).
    """
    try:
        response = requests.get(
            self.config.URLS["modelId"] + str(model_id),
            headers=self.config.HEADERS,
            timeout=30,  # never wait forever on a stalled API call
        )
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
        return None
235
+
236
def download_model(self, model_versions: list, folder: str):
    """Download every file of the newest model version into ``folder``.

    Only ``model_versions[0]`` is handled here; files of older versions are
    fetched one at a time by a separate method.

    Each file gets up to five attempts. An attempt counts as failed when
    ``download_file`` raises or when a file literally named ``login``
    appears in ``folder`` (that file is deleted before retrying). After
    five failed attempts a ``<name>.download_failed`` placeholder is
    written instead.
    """
    newest_files = model_versions[0]["files"]

    for entry in newest_files:
        download_url = entry["downloadUrl"]
        file_name = entry["name"]

        # ``failures`` equals the number of failed attempts so far,
        # including the current one if it fails.
        for failures in range(1, 6):
            try:
                self.download_file(download_url, folder, file_name)
            except Exception as e:
                logger.error(f"Exception occurred while downloading {file_name}: {e}")
                continue

            if "login" not in os.listdir(folder):
                logger.info(f"Successfully downloaded {file_name}")
                break

            logger.warning(f"'login' file found. Will try again. ({failures}/5)")
            os.remove(os.path.join(folder, "login"))
        else:
            # All five attempts failed: leave a marker file behind.
            dummy_file_name = f"{file_name}.download_failed"
            dummy_file_path = os.path.join(folder, dummy_file_name)
            try:
                with open(dummy_file_path, "w") as f:
                    f.write("Download failed after 5 attempts.")
                logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
            except Exception as e:
                logger.error(f"Failed to create dummy file for {file_name}: {e}")
275
+
276
def download_old_version_file_and_upload(self, file_info, parent_folder: str, encrypted_top_name: str):
    """Download one old-version file, upload it encrypted, then delete it.

    The file is placed in ``<parent_folder>/old_versions`` (the folder name
    is used as-is). Up to five download attempts are made; an attempt fails
    when ``download_file`` raises or when a file named ``login`` shows up in
    the folder. After five failures a ``.download_failed`` placeholder is
    written and nothing is uploaded.

    Args:
        file_info: Mapping with at least ``"name"`` and ``"downloadUrl"``.
        parent_folder: Directory under which ``old_versions`` is created.
        encrypted_top_name: Repository base path passed to
            ``upload_encrypted_files``.
    """
    file_name = file_info["name"]
    download_url = file_info["downloadUrl"]

    # Make sure the old_versions folder exists.
    old_versions_folder = os.path.join(parent_folder, "old_versions")
    os.makedirs(old_versions_folder, exist_ok=True)

    local_path = os.path.join(old_versions_folder, file_name)

    for failures in range(1, 6):
        try:
            # Download into the old_versions folder.
            self.download_file(download_url, old_versions_folder, file_name)
        except Exception as e:
            logger.error(f"Exception while downloading old file {file_name}: {e}")
            continue

        if "login" not in os.listdir(old_versions_folder):
            logger.info(f"Successfully downloaded old version file: {file_name}")
            break

        # A file named "login" appeared: remove it and try again.
        logger.warning(f"'login' file found while downloading {file_name}. Retry {failures}/5.")
        os.remove(os.path.join(old_versions_folder, "login"))
    else:
        # Five failed attempts: leave a placeholder and give up.
        dummy_file_name = f"{file_name}.download_failed"
        dummy_file_path = os.path.join(old_versions_folder, dummy_file_name)
        try:
            with open(dummy_file_path, "w") as f:
                f.write("Download failed after 5 attempts.")
            logger.error(f"Failed to download {file_name} -> created dummy: {dummy_file_name}")
        except Exception as e:
            logger.error(f"Failed to create dummy file for old version {file_name}: {e}")
        return

    # Download succeeded: encrypt this single file, upload it, then remove
    # the local plaintext copy.
    try:
        # 1) Encrypt just this file.
        self.encrypt_with_rclone(local_path)

        # 2) Upload whatever the encryption step produced.
        self.upload_encrypted_files(
            repo_id=self.repo_ids["current"],
            base_path_in_repo=encrypted_top_name
        )
        logger.info(f"Uploaded old version file: {file_name} into {encrypted_top_name}")

    except Exception as e:
        logger.error(f"Error uploading old version file {file_name}: {e}")

    # 3) Remove the plaintext file whether or not the upload worked.
    if os.path.exists(local_path):
        os.remove(local_path)
        logger.info(f"Removed local old version file: {local_path}")
340
+
341
def download_images(self, model_versions: list, folder: str):
    """Download the preview images of every version into ``<folder>/images``.

    Each image is saved as ``<last URL path segment>.png``. A failed
    request is logged and the remaining images are still attempted.

    Args:
        model_versions: Version dictionaries; each may carry an ``images``
            list whose items hold a ``url``.
        folder: Model folder; the ``images`` subfolder is created in it.
    """
    images_folder = os.path.join(folder, "images")
    os.makedirs(images_folder, exist_ok=True)

    # Entries without a URL are skipped rather than aborting the whole model.
    image_urls = []
    for version in model_versions:
        for img in version.get("images", []):
            image_url = img.get("url")
            if image_url:
                image_urls.append(image_url)

    for image_url in image_urls:
        image_name = image_url.split("/")[-1]
        try:
            # Bounded wait so one stalled image cannot hang the crawler.
            response = requests.get(image_url, timeout=60)
            response.raise_for_status()
            with open(os.path.join(images_folder, f"{image_name}.png"), "wb") as file:
                file.write(response.content)
        except requests.RequestException as e:
            logger.error(f"Error downloading image {image_url}: {e}")
360
+
361
def save_html_content(self, url: str, folder: str):
    """Fetch ``url`` and store the response text as ``<folder>/<name>.html``.

    ``<name>`` is the last path component of ``folder``. Any failure
    (request or file write) is logged and swallowed.

    Args:
        url: Page to fetch.
        folder: Existing directory that receives the HTML file.
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Use only the folder's own name: with a nested folder such as
        # "a/b", f"{folder}.html" would produce the path "a/b/a/b.html".
        page_name = os.path.basename(os.path.normpath(folder))
        html_path = os.path.join(folder, f"{page_name}.html")
        with open(html_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
    except Exception as e:
        logger.error(f"Error saving HTML content for URL {url}: {e}")
370
+
371
@staticmethod
def save_model_info(model_info: dict, folder: str):
    """Serialize ``model_info`` to ``<folder>/model_info.json`` (2-space indent)."""
    target_path = os.path.join(folder, "model_info.json")
    with open(target_path, "w") as out:
        json.dump(model_info, out, indent=2)
375
+
376
@staticmethod
def increment_repo_name(repo_id: str) -> str:
    """Return the next repository id in sequence.

    A trailing run of digits is replaced by that number plus one; an id
    without trailing digits gets ``1`` appended.
    """
    trailing_digits = re.search(r'(\d+)$', repo_id)
    if trailing_digits is None:
        return f"{repo_id}1"

    successor = int(trailing_digits.group(1)) + 1
    return re.sub(r'\d+$', str(successor), repo_id)
384
+
385
+ # =============================================================================
386
+ # 暗号化しないアップロード(ログや model_list.log 用)
387
+ # =============================================================================
388
def upload_file_raw(
    self,
    file_path: str,
    repo_id: Optional[str] = None,
    path_in_repo: Optional[str] = None
):
    """Upload ``file_path`` to a Hugging Face repo without encryption.

    Args:
        file_path: Local file to upload.
        repo_id: Target repository; defaults to ``self.repo_ids['current']``.
        path_in_repo: Destination path; defaults to the file's base name.

    Raises:
        Exception: The last upload error, re-raised after five ordinary
            failures.

    Retry policy:
        * "over the limit of 100000 files": create the next repository
          (``increment_repo_name``), switch to it and reset the counter.
          NOTE(review): this also redirects uploads that were given an
          explicit ``repo_id`` — confirm that is intended.
        * "rate-limited ... in N minutes": sleep N+1 minutes and retry
          without consuming an attempt (same handling as the encrypted
          upload path).
        * "retry this action in about 1 hour": sleep one hour and retry
          without consuming an attempt.
        * anything else: up to five attempts in total.
    """
    if repo_id is None:
        repo_id = self.repo_ids['current']
    if path_in_repo is None:
        path_in_repo = os.path.basename(file_path)

    max_retries = 5
    attempt = 0
    while attempt < max_retries:
        try:
            self.api.upload_file(
                path_or_fileobj=file_path,
                repo_id=repo_id,
                path_in_repo=path_in_repo
            )
            logger.info(f"[OK] Uploaded {file_path} => {repo_id}/{path_in_repo}")
            return
        except Exception as e:
            attempt += 1
            error_message = str(e)

            if "over the limit of 100000 files" in error_message:
                logger.warning("Repository file limit exceeded, creating a new repository.")
                self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                attempt = 0
                repo_id = self.repo_ids['current']
                continue

            # e.g. "You have been rate-limited; you can retry this action
            # in 31 minutes." -> wait the stated time plus one minute.
            if "rate-limited" in error_message and "minutes" in error_message:
                wait_match = re.search(r"in (\d+) minutes?", error_message)
                if wait_match:
                    minutes = int(wait_match.group(1)) + 1
                    logger.warning(f"Rate-limited. Waiting {minutes} minutes before retry...")
                    time.sleep(minutes * 60)
                    attempt -= 1  # waiting does not use up an attempt
                    continue

            if "you can retry this action in about 1 hour" in error_message:
                logger.warning("Encountered 'retry in 1 hour' error. Waiting 1 hour before retrying...")
                time.sleep(3600)
                attempt -= 1  # waiting does not use up an attempt
                continue

            if attempt < max_retries:
                logger.warning(f"Failed to upload raw file {file_path}, retry {attempt}/{max_retries}...")
            else:
                logger.error(f"Failed to upload raw file after {max_retries} attempts: {file_path}")
                raise
430
+
431
+ # =============================================================================
432
+ # 暗号化してアップロード (単ファイル)
433
+ # =============================================================================
434
def upload_file_encrypted(
    self,
    file_path: str,
    repo_id: Optional[str] = None,
    path_in_repo: Optional[str] = None
):
    """Encrypt a single file with rclone and upload the result.

    Args:
        file_path: Local plaintext file.
        repo_id: Target repository; defaults to ``self.repo_ids['current']``.
        path_in_repo: Base path inside the repository; empty when omitted.

    The encrypted staging directory (``config.ENCRYPTED_DIR``) is removed
    afterwards, including when encryption or upload raises, so leftovers
    from a failed run are not present on the next one.
    """
    if repo_id is None:
        repo_id = self.repo_ids['current']
    base_path = path_in_repo or ""

    try:
        self.encrypt_with_rclone(file_path)
        self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
    finally:
        if os.path.isdir(self.config.ENCRYPTED_DIR):
            shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
449
+
450
+ # =============================================================================
451
+ # 暗号化してアップロード (フォルダ)
452
+ # =============================================================================
453
def upload_folder_encrypted(
    self,
    folder_path: str,
    repo_id: Optional[str] = None,
    path_in_repo: Optional[str] = None
) -> str:
    """Encrypt a whole folder with rclone and upload the result.

    Args:
        folder_path: Local plaintext folder.
        repo_id: Target repository; defaults to ``self.repo_ids['current']``.
        path_in_repo: Base path inside the repository; empty when omitted.

    Returns:
        The name of the top-level directory found in
        ``config.ENCRYPTED_DIR`` after encryption.

    Raises:
        RuntimeError: If encryption produced no top-level directory.

    The staging directory is removed afterwards, including when a step
    raises, so leftovers from a failed run are not present on the next one.
    """
    if repo_id is None:
        repo_id = self.repo_ids['current']
    base_path = path_in_repo or ""

    try:
        self.encrypt_with_rclone(folder_path)

        top_levels = [
            d for d in os.listdir(self.config.ENCRYPTED_DIR)
            if os.path.isdir(os.path.join(self.config.ENCRYPTED_DIR, d))
        ]
        if not top_levels:
            raise RuntimeError("No top-level folder found after rclone encryption.")
        if len(top_levels) > 1:
            logger.warning(f"Multiple top-level folders found after encryption? {top_levels}. Using the first one.")

        encrypted_top_name = top_levels[0]

        self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
    finally:
        if os.path.isdir(self.config.ENCRYPTED_DIR):
            shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)

    return encrypted_top_name
482
+
483
+ # =============================================================================
484
+ # model_list.log の読み書きを「model_id: model_hf_url」で扱うよう変更
485
+ # =============================================================================
486
def read_model_list(self):
    """Parse ``config.LIST_FILE`` into a ``{key: url}`` dictionary.

    Each non-empty line has the form ``"<key>: <url>"``, for example
    ``"123456: https://huggingface.co/.../encrypted_folder_name"``.
    Lines without the ``": "`` separator are ignored.

    Returns:
        The parsed mapping, or an empty dict when the file cannot be read
        (the error is logged).
    """
    model_list = {}
    try:
        with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Split on the LAST ": " — the URL never contains that
                # sequence, while a key (e.g. a model name) may.
                parts = line.rsplit(": ", 1)
                if len(parts) == 2:
                    stored_id, stored_url = parts
                    model_list[stored_id] = stored_url
        return model_list
    except (OSError, UnicodeError) as e:
        logger.error(f"Failed to read model list: {e}")
        return {}
507
+
508
def process_model(self, model_url: str):
    """Back up one model: newest version as a folder, older files one by one.

    Steps:
        1. Fetch the model metadata; the local folder is named after the
           newest version's ``Model``-type file (or its first file).
        2. Download the newest version's files, all preview images, the
           page at ``model_url`` and ``model_info.json`` into that folder.
        3. Encrypt and upload the folder in a single pass, then delete it.
        4. For every file of every older version: download into
           ``<folder>/old_versions``, encrypt, upload and delete.
        5. Append ``"<model name>: <Hugging Face URL>"`` to the list file.

    Every exception is logged and swallowed; the method never raises.
    The local working folder is removed on exit in all cases.

    Args:
        model_url: URL whose last path segment is the model id.
    """
    folder = None
    try:
        model_id = model_url.rstrip("/").split("/")[-1]
        model_info = self.get_model_info(model_id)
        if not model_info:
            # get_model_info already logged the request failure.
            logger.warning(f"No model info retrieved for model ID {model_id}")
            return

        model_versions = model_info.get("modelVersions", [])
        if not model_versions:
            logger.warning(f"No versions found for model ID {model_id}")
            return

        # ==================================================================
        # 1) Choose the folder name (same rule as the earlier code)
        # ==================================================================
        latest_version = model_versions[0]
        latest_files = latest_version["files"]
        if not latest_files:
            logger.warning(f"No files found in latest version for model ID {model_id}")
            return

        model_file = next(
            (file for file in latest_files if file.get('type') == 'Model'),
            None
        )
        if model_file is None:
            model_file = latest_files[0]
            logger.warning(f"No 'Model' type file found for model ID {model_id}. Using first file's name.")
        latest_filename = model_file['name']
        folder = os.path.splitext(latest_filename)[0]

        os.makedirs(folder, exist_ok=True)

        # ==================================================================
        # 2) Download the newest version in one go (model files)
        #    + images + HTML page + info.json
        # ==================================================================
        self.download_model(model_versions, folder)
        self.download_images(model_versions, folder)
        self.save_html_content(model_url, folder)
        self.save_model_info(model_info, folder)

        # ==================================================================
        # 3) Encrypt and upload the whole folder exactly once
        # ==================================================================
        encrypted_top_name = self.upload_folder_encrypted(folder)
        logger.info(f"[MAIN] Uploaded latest version folder => {encrypted_top_name}")

        # Free the disk space before fetching older versions.
        shutil.rmtree(folder, ignore_errors=True)

        # ==================================================================
        # 4) Older versions: download one file at a time, upload immediately
        # ==================================================================
        if len(model_versions) > 1:
            for version in model_versions[1:]:
                for file_info in version["files"]:
                    # Recreates <folder>/old_versions, downloads into it,
                    # then encrypts and uploads under encrypted_top_name.
                    self.download_old_version_file_and_upload(file_info, folder, encrypted_top_name)

        # ==================================================================
        # 5) Register the model in the list file
        # ==================================================================
        # NOTE(review): the key written here is the model's display name,
        # while read_model_list documents "model_id: url" — confirm which
        # key is intended before relying on lookups by id.
        modelpage_name = model_info.get("name", f"Model_{model_id}")
        model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{encrypted_top_name}"

        with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
            f.write(f"{modelpage_name}: {model_hf_url}\n")

    except Exception as e:
        logger.error(f"Unexpected error in process_model ({model_url}): {e}")
    finally:
        # Step 4 recreates the folder, and an error before step 3's cleanup
        # leaves partial downloads behind; remove whatever is still there.
        if folder and os.path.isdir(folder):
            shutil.rmtree(folder, ignore_errors=True)
584
+
585
async def crawl(self):
    """Poll CivitAI forever and back up one newly listed model per cycle.

    Each cycle:
        1. Log in to Hugging Face and download the model list and the
           state log into the working directory.
        2. Read the state log: line 1 is a JSON list of already-seen model
           ids, line 2 is the repository currently receiving uploads.
        3. Fetch the newest models from CivitAI and compute the unseen ids.
        4. With no unseen id: rewrite the log with the latest ids, upload
           it and sleep 60 seconds.
        5. Otherwise process the first unseen id, record it as seen (even
           if processing failed) and upload the log and the model list.

    Any exception in a cycle is logged and followed by a 300-second pause.

    NOTE(review): the HTTP requests and uploads made here are synchronous
    calls inside a coroutine, so they block the event loop while running.
    """
    while True:
        try:
            login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)

            # Fetch model_list.log and civitai_backup.log from their repos.
            model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
            shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")

            local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
            shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")

            # Load the crawler state.
            with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
                lines = file.read().splitlines()
                old_models = json.loads(lines[0]) if len(lines) > 0 else []
                self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

            # Ask CivitAI for the newest models (bounded wait).
            response = requests.get(
                self.config.URLS["latest"],
                headers=self.config.HEADERS,
                timeout=30,
            )
            response.raise_for_status()
            latest_models = response.json().get("items", [])
            latest_model_ids = [item.get("id") for item in latest_models if "id" in item]

            # Unseen ids, kept in API order so the pick below is
            # deterministic instead of depending on set iteration order.
            seen_ids = set(old_models)
            new_models = [mid for mid in latest_model_ids if mid not in seen_ids]

            if new_models:
                logger.info(f"New models found: {new_models}")
                model_id = new_models[0]

                # process_model logs and swallows its own exceptions, so
                # this loop normally finishes on the first pass.
                for attempt in range(1, 6):
                    try:
                        self.process_model(f"{self.config.URLS['modelId']}{model_id}")
                        break
                    except Exception as e:
                        logger.error(f"Failed to process model ID {model_id} (Attempt {attempt}/5): {e}")
                        if attempt == 5:
                            logger.error(f"Skipping model ID {model_id} after 5 failed attempts.")
                        else:
                            await asyncio.sleep(2)
            else:
                # Nothing new: persist the latest id list and wait.
                with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                    f.write(json.dumps(latest_model_ids) + "\n")
                    f.write(f"{self.repo_ids['current']}\n")
                logger.info(f"Updated log file: {self.config.LOG_FILE}")

                self.upload_file_raw(
                    file_path=self.config.LOG_FILE,
                    repo_id=self.repo_ids["log"],
                    path_in_repo=self.config.LOG_FILE
                )
                logger.info("Uploaded log file to repository (unencrypted).")

                logger.info("No new models found.")
                await asyncio.sleep(60)
                continue

            # Mark the model as seen.
            old_models.append(model_id)

            # Rewrite the state log.
            with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                f.write(json.dumps(old_models) + "\n")
                f.write(f"{self.repo_ids['current']}\n")
            logger.info(f"Updated log file with new model ID: {model_id}")

            # Upload the state log and the model list.
            self.upload_file_raw(
                file_path=self.config.LOG_FILE,
                repo_id=self.repo_ids["log"],
                path_in_repo=self.config.LOG_FILE
            )
            self.upload_file_raw(
                file_path=self.config.LIST_FILE,
                repo_id=self.repo_ids["model_list"],
                path_in_repo=self.config.LIST_FILE
            )

        except Exception as e:
            logger.error(f"Error during crawling: {e}")
            await asyncio.sleep(300)
669
+
670
+
671
# Entry point: build the configuration and the crawler at import time and
# expose the crawler's ``app`` attribute as the module-level ``app``.
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app