ttttdiva committed on
Commit
7d21919
·
verified ·
1 Parent(s): 7d6cf26

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +643 -95
main.py CHANGED
@@ -1,97 +1,645 @@
1
def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
    """Encrypt ``local_folder`` with rclone and upload the result.

    Steps:
      1. Remove leftovers from ``./encrypted``.
      2. ``rclone mkdir cryptLocal:<label>`` so the directory always exists.
      3. ``rclone copy local_folder cryptLocal:<label>``.
      4. Upload the directory that appeared under ``./encrypted`` through
         ``self.upload_folder``.
      5. Delete both the plaintext folder and the encrypted folder.

    NOTE(review): this assumes the ``cryptLocal`` remote writes into
    ``./encrypted`` -- confirm against the rclone configuration.

    Args:
        local_folder: Path of the plaintext folder to encrypt and upload.

    Returns:
        The logical folder name (``enc_xxxxxxxx``) used on the remote, or
        ``None`` when any step (including the upload) failed.
    """
    if not os.path.exists(local_folder):
        logger.error(f"encrypt_and_upload_folder: folder not found: {local_folder}")
        return None

    encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
    os.makedirs(encrypted_base_dir, exist_ok=True)

    # 1) Remove stale encrypted files/folders from previous runs.
    for item in os.listdir(encrypted_base_dir):
        item_path = os.path.join(encrypted_base_dir, item)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.remove(item_path)
            else:
                shutil.rmtree(item_path)
            logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
        except Exception as e:
            logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")

    # The snapshot must be taken BEFORE rclone runs; taking it afterwards
    # makes the before/after difference always empty.
    before_dirs = set(os.listdir(encrypted_base_dir))

    # 2) Create the (possibly empty) directory on the crypt remote.
    subfolder_label = "enc_" + str(uuid.uuid4())[:8]
    try:
        subprocess.run(
            ["rclone", "mkdir", f"cryptLocal:{subfolder_label}"],
            check=True,
        )
        logger.info(f"[OK] rclone mkdir cryptLocal:{subfolder_label}")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing rclone executable.
        logger.error(f"rclone mkdir failed: {e}")
        return None

    # 3) Copy the plaintext folder; empty source directories are kept.
    try:
        subprocess.run(
            [
                "rclone", "copy",
                local_folder,
                f"cryptLocal:{subfolder_label}",
                "--create-empty-src-dirs",
            ],
            check=True,
        )
        logger.info(f"[OK] rclone copy {local_folder} => cryptLocal:{subfolder_label}")
    except (subprocess.CalledProcessError, OSError) as e:
        logger.error(f"rclone copy failed: {e}")
        return None

    # The encrypted directory name is not known in advance, so it is
    # identified as whatever newly appeared under ./encrypted.
    diff = set(os.listdir(encrypted_base_dir)) - before_dirs
    if not diff:
        logger.error("[ERROR] No new directory appeared in ./encrypted after rclone copy.")
        return None
    if len(diff) > 1:
        logger.warning(f"[WARN] Multiple new directories found: {diff}, picking the first one.")
    enc_folder_name = sorted(diff)[0]  # deterministic choice
    enc_folder_path = os.path.join(encrypted_base_dir, enc_folder_name)

    if not os.path.isdir(enc_folder_path):
        logger.error(f"[ERROR] {enc_folder_path} is not a directory.")
        return None

    # 4) Upload the encrypted folder.
    uploaded = True
    try:
        self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
        logger.info(f"Uploaded encrypted folder: {enc_folder_path}")
    except Exception as e:
        logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
        uploaded = False

    # 5) Remove local copies; one failure must not skip the other removal.
    removed_all = True
    for path in (local_folder, enc_folder_path):
        try:
            shutil.rmtree(path)
        except Exception as e:
            removed_all = False
            logger.error(f"Failed to remove local folders: {e}")
    if removed_all:
        logger.info(f"Removed local folder: {local_folder} and {enc_folder_path}")

    # 6) Report the logical folder name only when the upload succeeded.
    return subfolder_label if uploaded else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import datetime
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ import shutil
8
+ import subprocess
9
+ import time
10
+ import uuid
11
+ from typing import Optional
12
+
13
+ import requests
14
+ from bs4 import BeautifulSoup
15
+ from fake_useragent import UserAgent
16
+ from fastapi import FastAPI
17
+ from huggingface_hub import HfApi, hf_hub_download, login
18
+
19
# Logging configuration: INFO level on the root logger, plus a module logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
22
+
23
+
24
class Config:
    """Static settings shared by the crawler."""

    # Both credentials are read with os.environ[...]; a missing variable
    # raises KeyError when this class body is executed.
    HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]
    CIVITAI_API_TOKEN = os.environ["CIVITAI_API_TOKEN"]

    # Local file names for the crawl log and the model list.
    LOG_FILE = "civitai_backup.log"
    LIST_FILE = "model_list.log"

    # Hugging Face repositories; "current" starts empty.
    REPO_IDS = dict(
        log="ttttdiva/CivitAI_log_test",
        model_list="ttttdiva/CivitAI_model_info_test",
        current="",
    )

    # CivitAI endpoints.
    URLS = dict(
        latest="https://civitai.com/api/v1/models?sort=Newest",
        modelPage="https://civitai.com/models/",
        modelId="https://civitai.com/api/v1/models/",
        modelVersionId="https://civitai.com/api/v1/model-versions/",
        hash="https://civitai.com/api/v1/model-versions/by-hash/",
    )

    # UTC+9 timezone used for the status timestamp.
    JST = datetime.timezone(datetime.timedelta(hours=9))

    # The random User-Agent is picked once, when the class body runs.
    UA = UserAgent()
    HEADERS = {
        "Authorization": "Bearer " + CIVITAI_API_TOKEN,
        "User-Agent": UA.random,
        "Content-Type": "application/json",
    }
49
+
50
+
51
class CivitAICrawler:
    """Download models from CivitAI and upload them (encrypted) to Hugging Face.

    The instance owns a FastAPI app (``self.app``) with a status route and
    a startup hook that launches the endless :meth:`crawl` loop.
    """

    # Timeout (seconds) passed to every ``requests.get`` call so a stalled
    # connection cannot hang the crawler forever.
    REQUEST_TIMEOUT = 60
    # How many times a download is retried when a "login" file shows up.
    MAX_LOGIN_RETRIES = 5

    def __init__(self, config: "Config"):
        """Write the rclone config (if provided) and set up API clients.

        Args:
            config: Settings object providing ``REPO_IDS``, ``JST``, ``URLS``,
                ``HEADERS``, ``LOG_FILE``, ``LIST_FILE`` and the HF API key.
        """
        import base64

        rclone_conf_base64 = os.environ.get("RCLONE_CONF_BASE64")
        if rclone_conf_base64:
            # Create a .rclone_config directory under the current directory.
            config_dir = os.path.join(os.getcwd(), ".rclone_config")
            os.makedirs(config_dir, exist_ok=True)

            conf_path = os.path.join(config_dir, "rclone.conf")
            with open(conf_path, "wb") as f:
                f.write(base64.b64decode(rclone_conf_base64))
            # Owner-only access: keep the decoded config private.
            os.chmod(conf_path, 0o600)

            # Point rclone at the file that was just written.
            os.environ["RCLONE_CONFIG"] = conf_path
            logger.info(f"[INFO] Created rclone.conf at {conf_path}")
        else:
            logger.warning("[WARN] RCLONE_CONF_BASE64 not found; rclone may fail.")

        self.config = config
        self.api = HfApi()
        self.app = FastAPI()
        self.repo_ids = self.config.REPO_IDS.copy()
        self.jst = self.config.JST
        # Strong reference to the background crawl task (set on startup).
        self._crawl_task = None
        self.setup_routes()

    def setup_routes(self):
        """Register the FastAPI status route and the startup hook."""
        @self.app.get("/")
        def read_root():
            now = str(datetime.datetime.now(self.jst))
            description = f"""
            CivitAIを定期的に周回し新規モデルを {self.repo_ids['current']} にバックアップするspaceです。
            モデル一覧は https://huggingface.co/{self.repo_ids['model_list']}/blob/main/model_list.log を参照してください。
            Status: {now} + currently running :D
            """
            return description

        @self.app.on_event("startup")
        async def startup_event():
            # Keep a reference: the event loop only holds weak references to
            # tasks, so an unreferenced task may be garbage-collected.
            self._crawl_task = asyncio.create_task(self.crawl())

    @staticmethod
    def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
        """Extract the file name from a Content-Disposition header.

        Args:
            content_disposition: Raw header value, or ``None``.
            default_name: Name returned when the header has no usable
                ``filename=`` parameter.

        Returns:
            The unquoted file name, or ``default_name``.
        """
        if content_disposition:
            for part in content_disposition.split(';'):
                if "filename=" in part:
                    # Split only once so names containing '=' stay intact.
                    name = part.split("=", 1)[1].strip().strip('"')
                    if name:
                        return name
        return default_name

    def download_file(self, url: str, destination_folder: str, default_name: str) -> Optional[str]:
        """Download ``url`` into ``destination_folder``.

        Args:
            url: Download URL (requested with ``self.config.HEADERS``).
            destination_folder: Existing folder that receives the file.
            default_name: File name used when the response does not provide one.

        Returns:
            The path of the saved file, or ``None`` when the request failed.
        """
        file_path = None
        try:
            with requests.get(
                url,
                headers=self.config.HEADERS,
                stream=True,
                timeout=self.REQUEST_TIMEOUT,
            ) as response:
                response.raise_for_status()
                raw_name = self.get_filename_from_cd(
                    response.headers.get('content-disposition'), default_name
                )
                # The name comes from the remote side: keep only the last
                # path component so it cannot escape destination_folder.
                filename = os.path.basename(raw_name) or os.path.basename(default_name)
                file_path = os.path.join(destination_folder, filename)

                with open(file_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
        except requests.RequestException as e:
            logger.error(f"Failed to download file from {url}: {e}")
            # Do not leave a truncated file behind after a mid-stream error.
            if file_path and os.path.isfile(file_path):
                os.remove(file_path)
            return None

        logger.info(f"Downloaded: {file_path}")
        return file_path

    def get_model_info(self, model_id: str) -> dict:
        """Fetch model metadata from the CivitAI API.

        Returns:
            The decoded JSON response, or ``{}`` when the request failed.
        """
        try:
            response = requests.get(
                self.config.URLS["modelId"] + str(model_id),
                headers=self.config.HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
            return {}

    def download_images(self, model_versions: list, folder: str):
        """Download every image of every version into ``<folder>/images``."""
        images_folder = os.path.join(folder, "images")
        os.makedirs(images_folder, exist_ok=True)

        image_urls = [
            img["url"]
            for version in model_versions
            for img in version.get("images", [])
        ]
        for image_url in image_urls:
            # Drop any query string before deriving the local file name.
            image_name = os.path.basename(image_url.split("?", 1)[0])
            if not image_name:
                logger.error(f"Failed to download image {image_url}: no file name in URL")
                continue
            local_path = os.path.join(images_folder, image_name)
            try:
                with requests.get(image_url, stream=True, timeout=self.REQUEST_TIMEOUT) as resp:
                    resp.raise_for_status()
                    with open(local_path, 'wb') as imgf:
                        for chunk in resp.iter_content(chunk_size=8192):
                            imgf.write(chunk)
                logger.info(f"Downloaded image: {local_path}")
            except requests.RequestException as e:
                logger.error(f"Failed to download image {image_url}: {e}")

    def save_html_content(self, model_page_url: str, folder: str):
        """Save the model page HTML as ``<folder>/page.html`` (best effort)."""
        try:
            resp = requests.get(model_page_url, timeout=self.REQUEST_TIMEOUT)
            resp.raise_for_status()
            html_path = os.path.join(folder, "page.html")
            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(resp.text)
            logger.info(f"Saved HTML: {html_path}")
        except Exception as e:
            logger.error(f"Error saving HTML content from {model_page_url}: {e}")

    def save_model_info_json(self, model_info: dict, folder: str):
        """Save the model metadata as ``<folder>/model_info.json`` (best effort)."""
        info_path = os.path.join(folder, "model_info.json")
        try:
            with open(info_path, 'w', encoding='utf-8') as f:
                json.dump(model_info, f, indent=2)
            logger.info(f"Saved model_info.json: {info_path}")
        except Exception as e:
            logger.error(f"Failed to save model info JSON: {e}")

    def _download_with_login_retry(
        self,
        download_url: str,
        dest_folder: str,
        file_name: str,
        dummy_log_label: str = "Dummy file created",
    ) -> Optional[str]:
        """Download one file, retrying while a file named ``login`` appears.

        After ``MAX_LOGIN_RETRIES`` such attempts a ``<file_name>.download_failed``
        marker file is written instead.

        Returns:
            The downloaded file path, or ``None`` when the download failed.
        """
        for login_detected_count in range(1, self.MAX_LOGIN_RETRIES + 1):
            local_path = self.download_file(download_url, dest_folder, file_name)
            if not (local_path and "login" in os.listdir(dest_folder)):
                # Either a normal download or a plain request failure (None).
                return local_path
            os.remove(os.path.join(dest_folder, "login"))
            logger.warning(
                f"'login' file found, retrying {file_name} "
                f"({login_detected_count}/{self.MAX_LOGIN_RETRIES})"
            )

        dummy_file_path = os.path.join(dest_folder, f"{file_name}.download_failed")
        try:
            with open(dummy_file_path, "w") as f:
                f.write("Download failed after 5 attempts.")
            logger.error(f"Failed to download {file_name}. {dummy_log_label}: {dummy_file_path}")
        except Exception as e:
            logger.error(f"Failed to create dummy file for {file_name}: {e}")
        return None

    # =========================================================================
    # Important:
    #  - The latest version is downloaded into the model folder and is
    #    encrypted/uploaded later together with the rest of that folder.
    #  - Older versions are handled one file at a time
    #    (download -> encrypt -> upload -> delete) to save local storage.
    # =========================================================================
    def download_and_process_versions(self, model_versions: list, folder: str):
        """Download the latest version into ``folder``; upload old versions one by one.

        Args:
            model_versions: Version dicts; index 0 is treated as the latest.
            folder: Existing working folder of the model.
        """
        if not model_versions:
            return

        # 1) Files of the latest version (index 0) stay in ``folder``.
        latest_version = model_versions[0]
        logger.info(f"Processing latest version: {latest_version.get('name','(NoName)')}")
        for file_info in latest_version.get("files", []):
            self._download_with_login_retry(
                file_info["downloadUrl"], folder, file_info["name"]
            )

        # 2) Older versions: download -> encrypt/upload -> delete, per file.
        if len(model_versions) <= 1:
            return

        old_versions_folder = os.path.join(folder, "old_versions")
        os.makedirs(old_versions_folder, exist_ok=True)

        for version in model_versions[1:]:
            logger.info(f"Processing older version: {version.get('name','(NoName)')}")
            for file_info in version.get("files", []):
                file_name = file_info["name"]
                local_path = self._download_with_login_retry(
                    file_info["downloadUrl"],
                    old_versions_folder,
                    file_name,
                    dummy_log_label="Dummy file",
                )
                if local_path is None:
                    # Nothing was downloaded; move on to the next file.
                    continue

                # Put this single file into its own folder so that only it
                # gets encrypted and uploaded.
                single_file_folder = os.path.join(old_versions_folder, "temp_single")
                os.makedirs(single_file_folder, exist_ok=True)
                try:
                    shutil.move(local_path, os.path.join(single_file_folder, file_name))
                    label = self.encrypt_and_upload_folder(single_file_folder)
                    if label is not None:
                        logger.info(f"Uploaded old version file {file_name} as {label}")
                except Exception as e:
                    logger.error(f"Failed to encrypt/upload old version file: {e}")
                finally:
                    # Normally already removed by encrypt_and_upload_folder.
                    if os.path.exists(single_file_folder):
                        shutil.rmtree(single_file_folder)
                        logger.info(f"Removed temp_single folder {single_file_folder}")

    def encrypt_and_upload_folder(self, local_folder: str) -> Optional[str]:
        """Encrypt ``local_folder`` with rclone and upload the result.

        Steps:
          1. Remove leftovers from ``./encrypted``.
          2. ``rclone mkdir cryptLocal:<label>`` so the directory always exists.
          3. ``rclone copy local_folder cryptLocal:<label>``.
          4. Upload the directory that appeared under ``./encrypted`` through
             :meth:`upload_folder`.
          5. Delete both the plaintext folder and the encrypted folder.

        NOTE(review): this assumes the ``cryptLocal`` remote writes into
        ``./encrypted`` -- confirm against the rclone configuration.

        Args:
            local_folder: Path of the plaintext folder to encrypt and upload.

        Returns:
            The logical folder name (``enc_xxxxxxxx``) used on the remote, or
            ``None`` when any step (including the upload) failed.
        """
        if not os.path.exists(local_folder):
            logger.error(f"encrypt_and_upload_folder: folder not found: {local_folder}")
            return None

        encrypted_base_dir = os.path.join(os.getcwd(), "encrypted")
        os.makedirs(encrypted_base_dir, exist_ok=True)

        # 1) Remove stale encrypted files/folders from previous runs.
        for item in os.listdir(encrypted_base_dir):
            item_path = os.path.join(encrypted_base_dir, item)
            try:
                if os.path.isfile(item_path) or os.path.islink(item_path):
                    os.remove(item_path)
                else:
                    shutil.rmtree(item_path)
                logger.info(f"[CLEANUP] Removed old encrypted item: {item_path}")
            except Exception as e:
                logger.warning(f"[CLEANUP] Failed to remove {item_path}: {e}")

        # The snapshot must be taken BEFORE rclone runs; taking it afterwards
        # makes the before/after difference always empty.
        before_dirs = set(os.listdir(encrypted_base_dir))

        # 2) Create the (possibly empty) directory on the crypt remote.
        subfolder_label = "enc_" + str(uuid.uuid4())[:8]
        try:
            subprocess.run(
                ["rclone", "mkdir", f"cryptLocal:{subfolder_label}"],
                check=True,
            )
            logger.info(f"[OK] rclone mkdir cryptLocal:{subfolder_label}")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing rclone executable.
            logger.error(f"rclone mkdir failed: {e}")
            return None

        # 3) Copy the plaintext folder; empty source directories are kept.
        try:
            subprocess.run(
                [
                    "rclone", "copy",
                    local_folder,
                    f"cryptLocal:{subfolder_label}",
                    "--create-empty-src-dirs",
                ],
                check=True,
            )
            logger.info(f"[OK] rclone copy {local_folder} => cryptLocal:{subfolder_label}")
        except (subprocess.CalledProcessError, OSError) as e:
            logger.error(f"rclone copy failed: {e}")
            return None

        # The encrypted directory name is not known in advance, so it is
        # identified as whatever newly appeared under ./encrypted.
        diff = set(os.listdir(encrypted_base_dir)) - before_dirs
        if not diff:
            logger.error("[ERROR] No new directory appeared in ./encrypted after rclone copy.")
            return None
        if len(diff) > 1:
            logger.warning(f"[WARN] Multiple new directories found: {diff}, picking the first one.")
        enc_folder_name = sorted(diff)[0]  # deterministic choice
        enc_folder_path = os.path.join(encrypted_base_dir, enc_folder_name)

        if not os.path.isdir(enc_folder_path):
            logger.error(f"[ERROR] {enc_folder_path} is not a directory.")
            return None

        # 4) Upload the encrypted folder.
        uploaded = True
        try:
            self.upload_folder(enc_folder_path, path_in_repo=subfolder_label)
            logger.info(f"Uploaded encrypted folder: {enc_folder_path}")
        except Exception as e:
            logger.error(f"Failed to upload encrypted folder {enc_folder_path}: {e}")
            uploaded = False

        # 5) Remove local copies; one failure must not skip the other removal.
        removed_all = True
        for path in (local_folder, enc_folder_path):
            try:
                shutil.rmtree(path)
            except Exception as e:
                removed_all = False
                logger.error(f"Failed to remove local folders: {e}")
        if removed_all:
            logger.info(f"Removed local folder: {local_folder} and {enc_folder_path}")

        # 6) Report the logical folder name only when the upload succeeded.
        return subfolder_label if uploaded else None

    def _upload_with_retry(self, do_upload, what: str, allow_repo_rotation: bool, max_retries: int = 5):
        """Run ``do_upload`` with the retry policy shared by file/folder uploads.

        * "over the limit of 100000 files": when ``allow_repo_rotation`` is
          true, switch ``repo_ids['current']`` to the next repo name, create
          it and start over.
        * "you can retry this action in about 1 hour": sleep one hour; the
          attempt is not counted.
        * Anything else: retry up to ``max_retries`` times, then re-raise.

        Args:
            do_upload: Zero-argument callable performing one upload attempt.
            what: Description of the upload target used in log messages.
            allow_repo_rotation: Whether the upload targets the rotating
                ``current`` repo.
            max_retries: Number of counted attempts before giving up.
        """
        attempt = 0
        while attempt < max_retries:
            try:
                do_upload()
                return
            except Exception as e:
                error_message = str(e)
                if allow_repo_rotation and "over the limit of 100000 files" in error_message:
                    logger.warning("File limit exceeded, creating a new repo.")
                    self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                    # exist_ok: the next repo may already exist from an earlier run.
                    self.api.create_repo(repo_id=self.repo_ids['current'], private=True, exist_ok=True)
                    attempt = 0
                    continue
                if "you can retry this action in about 1 hour" in error_message:
                    logger.warning("Rate limit hit. Waiting 1 hour...")
                    time.sleep(3600)
                    continue

                attempt += 1
                if attempt < max_retries:
                    logger.warning(f"Failed to upload {what}, retry {attempt}/{max_retries}")
                else:
                    logger.error(f"Failed after {max_retries} attempts: {e}")
                    raise

    def upload_file(self, file_path: str, repo_id: Optional[str] = None, path_in_repo: Optional[str] = None):
        """Upload a single file to a Hugging Face repository.

        Args:
            file_path: Local file to upload.
            repo_id: Target repo; ``None`` means the rotating ``current`` repo.
            path_in_repo: Destination path; defaults to the file's base name.

        Raises:
            Exception: Re-raises the last upload error after the retries
                are exhausted.
        """
        follow_current = repo_id is None
        if path_in_repo is None:
            path_in_repo = os.path.basename(file_path)

        def attempt_upload():
            # Resolve the target on every attempt so that a rotation of the
            # current repo is picked up by the retry.
            target_repo = self.repo_ids['current'] if follow_current else repo_id
            self.api.upload_file(
                path_or_fileobj=file_path,
                repo_id=target_repo,
                path_in_repo=path_in_repo,
            )
            logger.info(f"Uploaded file: {file_path} to {target_repo} at {path_in_repo}")

        self._upload_with_retry(attempt_upload, file_path, allow_repo_rotation=follow_current)

    def upload_folder(self, folder_path: str, path_in_repo: Optional[str] = None):
        """Upload a whole folder to the current Hugging Face repository.

        Args:
            folder_path: Local folder to upload.
            path_in_repo: Destination path; defaults to the folder's base name.

        Raises:
            Exception: Re-raises the last upload error after the retries
                are exhausted.
        """
        if path_in_repo is None:
            path_in_repo = os.path.basename(folder_path)

        def attempt_upload():
            self.api.upload_folder(
                folder_path=folder_path,
                repo_id=self.repo_ids['current'],
                path_in_repo=path_in_repo,
            )
            logger.info(f"Uploaded folder: {folder_path} to {self.repo_ids['current']} at {path_in_repo}")

        self._upload_with_retry(attempt_upload, f"folder {folder_path}", allow_repo_rotation=True)

    @staticmethod
    def increment_repo_name(repo_id: str) -> str:
        """Increment the trailing number of ``repo_id`` (append ``1`` if none)."""
        match = re.search(r'(\d+)$', repo_id)
        if match is None:
            return f"{repo_id}1"
        return repo_id[:match.start()] + str(int(match.group(1)) + 1)

    def read_model_list(self) -> dict:
        """Read the model list file.

        Each non-empty line has the form ``<name>: <url>``.

        Returns:
            A dict mapping the URL to the name; empty when the file cannot
            be read.
        """
        model_list = {}
        try:
            with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    parts = line.split(": ", 1)
                    if len(parts) == 2:
                        modelpage_name, model_hf_url = parts
                        model_list[model_hf_url] = modelpage_name
        except Exception as e:
            logger.error(f"Failed to read model list: {e}")
        return model_list

    def get_repo_info(self, repo_id):
        """Return the file paths of ``repo_id``, or ``[]`` on failure."""
        try:
            repo_info = self.api.repo_info(repo_id=repo_id, files_metadata=True)
            return [sibling.rfilename for sibling in repo_info.siblings]
        except Exception as e:
            logger.error(f"Failed to get repo info for {repo_id}: {e}")
            return []

    def process_model(self, model_url: str):
        """Download one model, then encrypt and upload its whole folder.

        All errors are logged and swallowed; this method never raises
        ``Exception`` subclasses to its caller.
        """
        folder_name = None
        try:
            # The model ID is the last path component of the URL.
            model_id = model_url.rstrip("/").split("/")[-1]

            model_info = self.get_model_info(model_id)
            if not model_info or "modelVersions" not in model_info:
                logger.error(f"No valid model info for ID {model_id}. Skipping.")
                return

            versions = model_info["modelVersions"]
            if not versions:
                logger.warning(f"No modelVersions found for ID {model_id}.")
                return

            # Working folder: sanitized model name plus a short random suffix.
            folder_name = model_info.get("name", "UnknownModel")
            folder_name = re.sub(r'[\\/*?:"<>|]', '_', folder_name)
            folder_name += "_" + str(uuid.uuid4())[:8]
            os.makedirs(folder_name, exist_ok=True)

            # Latest version files, images, HTML and model_info.json all go
            # into folder_name and are encrypted together afterwards.
            # Old versions are encrypted/uploaded/deleted one by one inside
            # download_and_process_versions.
            self.download_and_process_versions(versions, folder_name)
            self.download_images(versions, folder_name)
            self.save_html_content(f"{self.config.URLS['modelPage']}{model_id}", folder_name)
            self.save_model_info_json(model_info, folder_name)

            enc_subfolder = self.encrypt_and_upload_folder(folder_name)
            if enc_subfolder is None:
                enc_subfolder = "[ENCRYPT_FAILED]"

            hf_enc_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{enc_subfolder}"
            with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
                f.write(f"{model_info.get('name', 'UnnamedModel')} (ID:{model_id}): {hf_enc_url}\n")

        except Exception as e:
            logger.error(f"Error in process_model ({model_url}): {e}")
        finally:
            # A failed run must not leave the plaintext folder on disk.
            if folder_name and os.path.isdir(folder_name):
                shutil.rmtree(folder_name, ignore_errors=True)

    def _crawl_once(self) -> int:
        """Run one blocking crawl iteration.

        Returns:
            Seconds the caller should wait before the next iteration.
        """
        login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)

        # Refresh the local model list.
        model_list_path = hf_hub_download(
            repo_id=self.repo_ids['model_list'],
            filename=self.config.LIST_FILE
        )
        shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")

        # Refresh the local log file.
        local_file_path = hf_hub_download(
            repo_id=self.repo_ids["log"],
            filename=self.config.LOG_FILE
        )
        shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")

        # Log format: line 1 = JSON list of known IDs, line 2 = current repo.
        with open(self.config.LOG_FILE, "r", encoding="utf-8") as file:
            lines = file.read().splitlines()
        old_models = json.loads(lines[0]) if len(lines) > 0 else []
        self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""

        # Check for newly published models.
        r = requests.get(
            self.config.URLS["latest"],
            headers=self.config.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        r.raise_for_status()
        latest_models = r.json().get("items", [])
        latest_model_ids = [m["id"] for m in latest_models if "id" in m]

        # Keep the API order so the choice of the next model is deterministic.
        known_ids = set(old_models)
        new_models = [mid for mid in latest_model_ids if mid not in known_ids]

        if not new_models:
            # Nothing new: refresh the log, upload it and wait.
            with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                f.write(json.dumps(latest_model_ids) + "\n")
                f.write(f"{self.repo_ids['current']}\n")
            logger.info(f"No new models. Updated log: {self.config.LOG_FILE}")
            self.upload_file(
                file_path=self.config.LOG_FILE,
                repo_id=self.repo_ids["log"],
                path_in_repo=self.config.LOG_FILE
            )
            logger.info("Uploaded log file.")
            return 60

        logger.info(f"New model IDs found: {new_models}")
        model_id = new_models[0]

        # process_model logs and swallows its own errors, so wrapping it in
        # a retry loop would never retry anything.
        self.process_model(self.config.URLS["modelId"] + str(model_id))

        # Record the model as processed and persist the log.
        old_models.append(model_id)
        with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
            f.write(json.dumps(old_models) + "\n")
            f.write(f"{self.repo_ids['current']}\n")
        logger.info(f"Updated log with new model ID: {model_id}")

        # Upload the log file and the model list.
        self.upload_file(
            file_path=self.config.LOG_FILE,
            repo_id=self.repo_ids["log"],
            path_in_repo=self.config.LOG_FILE
        )
        self.upload_file(
            file_path=self.config.LIST_FILE,
            repo_id=self.repo_ids["model_list"],
            path_in_repo=self.config.LIST_FILE
        )
        return 0

    async def crawl(self):
        """Check for new models forever, processing one model per iteration.

        Each iteration runs in the default executor because it performs
        blocking network I/O, subprocess calls and ``time.sleep``; running
        it directly in the coroutine would stall the event loop.
        """
        loop = asyncio.get_running_loop()
        while True:
            try:
                delay = await loop.run_in_executor(None, self._crawl_once)
            except Exception as e:
                logger.error(f"Error in crawl loop: {e}")
                delay = 300
            if delay:
                await asyncio.sleep(delay)
640
+
641
+
642
# FastAPI application: build the crawler and expose its app as module-level `app`.
config = Config()
crawler = CivitAICrawler(config)
app = crawler.app