Spaces:

ttttdiva
/

cv_test

Running

App Files Files Community

ttttdiva committed on Jan 6

Commit

b6e4f0f

verified ·

1 Parent(s): 8a73292

Update main.py

Browse files

Files changed (1) hide show

main.py +45 -79

main.py CHANGED Viewed

@@ -114,7 +114,6 @@ class CivitAICrawler:
         if os.path.isdir(self.config.ENCRYPTED_DIR):
             shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
-        # コピー先: cryptLocal:{basename(local_path)}
         top_level_name = os.path.basename(local_path.rstrip("/"))
         if not top_level_name:
             top_level_name = "unnamed"
@@ -161,7 +160,6 @@ class CivitAICrawler:
                             logger.warning("Repository file limit exceeded. Creating a new repository...")
                             self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                             self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
-                            # リポジトリが変わったので attempt をリセット
                             attempt = 0
                             repo_id = self.repo_ids['current']
                             continue
@@ -180,13 +178,8 @@ class CivitAICrawler:
                                 )
                                 raise
-    # =============================
-    # ここから既存処理
-    # =============================
     @staticmethod
     def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
-        """Content-Dispositionヘッダーからファイル名を取得する。"""
         if content_disposition:
             parts = content_disposition.split(';')
             for part in parts:
@@ -195,7 +188,6 @@ class CivitAICrawler:
         return default_name
     def download_file(self, url: str, destination_folder: str, default_name: str):
-        """指定されたURLからファイルをダウンロードし、指定されたフォルダに保存する。"""
         try:
             response = requests.get(url, headers=self.config.HEADERS, stream=True)
             response.raise_for_status()
@@ -212,7 +204,6 @@ class CivitAICrawler:
         logger.info(f"Download completed: {file_path}")
     def get_model_info(self, model_id: str) -> dict:
-        """モデルの情報を取得する。"""
         try:
             response = requests.get(self.config.URLS["modelId"] + str(model_id), headers=self.config.HEADERS)
             response.raise_for_status()
@@ -221,7 +212,6 @@ class CivitAICrawler:
             logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
     def download_model(self, model_versions: list, folder: str, existing_old_version_files: list = []):
-        """モデルのバージョンをダウンロードする。"""
         latest_version = model_versions[0]
         latest_files = latest_version["files"]
         for file_info in latest_files:
@@ -294,10 +284,8 @@ class CivitAICrawler:
                             logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
                         except Exception as e:
                             logger.error(f"Failed to create dummy file for {file_name}: {e}")
-                        continue
     def download_images(self, model_versions: list, folder: str):
-        """モデルの画像をダウンロードし、指定されたフォルダに保存する。"""
         images_folder = os.path.join(folder, "images")
         os.makedirs(images_folder, exist_ok=True)
@@ -317,7 +305,6 @@ class CivitAICrawler:
             except requests.RequestException as e:
                 logger.error(f"Error downloading image {image_url}: {e}")
-        # 画像フォルダをパスワード付きZIP
         try:
             original_cwd = os.getcwd()
             os.chdir(folder)
@@ -332,7 +319,6 @@ class CivitAICrawler:
             shutil.rmtree(images_folder)
     def save_html_content(self, url: str, folder: str):
-        """指定されたURLからHTMLコンテンツを取得し、保存する。"""
         try:
             response = requests.get(url)
             response.raise_for_status()
@@ -344,7 +330,6 @@ class CivitAICrawler:
     @staticmethod
     def save_model_info(model_info: dict, folder: str):
-        """モデル情報(json)の保存"""
         with open(os.path.join(folder, "model_info.json"), "w") as file:
             json.dump(model_info, file, indent=2)
@@ -358,7 +343,7 @@ class CivitAICrawler:
             return f"{repo_id}1"
     # =============================================================================
-    # ★ 暗号化しないアップロード（ログや model_list.log 用）
     # =============================================================================
     def upload_file_raw(
         self,
@@ -366,10 +351,6 @@ class CivitAICrawler:
         repo_id: Optional[str] = None,
         path_in_repo: Optional[str] = None
     ):
-        """
-        暗号化せず、そのまま Hugging Face にアップロードするメソッド。
-        civitai_backup.log や model_list.log などはこれを使う。
-        """
         if repo_id is None:
             repo_id = self.repo_ids['current']
         if path_in_repo is None:
@@ -393,7 +374,6 @@ class CivitAICrawler:
                     logger.warning("Repository file limit exceeded, creating a new repository.")
                     self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                     self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
-                    # リポジトリ変更で attempt リセット
                     attempt = 0
                     repo_id = self.repo_ids['current']
                     continue
@@ -409,7 +389,7 @@ class CivitAICrawler:
                         raise
     # =============================================================================
-    # ★ 暗号化してアップロード (単ファイル)
     # =============================================================================
     def upload_file_encrypted(
         self,
@@ -417,25 +397,18 @@ class CivitAICrawler:
         repo_id: Optional[str] = None,
         path_in_repo: Optional[str] = None
     ):
-        """
-        単一ファイルを rclone で暗号化し、そのままHFへアップロード。
-        """
         if repo_id is None:
             repo_id = self.repo_ids['current']
-        base_path = path_in_repo or ""  # HF上のベースパス
-        # 1) rclone で暗号化
         self.encrypt_with_rclone(file_path)
-        # 2) アップロード
         self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
-        # 3) 後始末
         if os.path.isdir(self.config.ENCRYPTED_DIR):
             shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
     # =============================================================================
-    # ★ 暗号化してアップロード (フォルダ)
     # =============================================================================
     def upload_folder_encrypted(
         self,
@@ -443,20 +416,12 @@ class CivitAICrawler:
         repo_id: Optional[str] = None,
         path_in_repo: Optional[str] = None
     ) -> str:
-        """
-        フォルダを rclone で暗号化し、暗号化されたフォルダ構造ごとアップロード。
-        終了後に「実際に Hugging Face 上で使われる暗号化後のトップレベルフォルダ名」を返す。
-        """
         if repo_id is None:
             repo_id = self.repo_ids['current']
         base_path = path_in_repo or ""
-        # 1) rcloneにコピーして暗号化
         self.encrypt_with_rclone(folder_path)
-        # 2) 暗号後のトップディレクトリ名を取得
-        #    例: /home/user/app/encrypted/<暗号フォルダ名>
-        #    基本的にトップレベルディレクトリは1つ想定
         top_levels = [
             d for d in os.listdir(self.config.ENCRYPTED_DIR)
             if os.path.isdir(os.path.join(self.config.ENCRYPTED_DIR, d))
@@ -468,49 +433,49 @@ class CivitAICrawler:
         encrypted_top_name = top_levels[0]
-        # 3) アップロード
         self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
-        # 4) 後始末
         if os.path.isdir(self.config.ENCRYPTED_DIR):
             shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
-        # 5) 実際にHFに作られた「暗号化後のトップフォルダ名」を返す
         return encrypted_top_name
     def read_model_list(self):
-        """モデルリストを読み込む。"""
         model_list = {}
         try:
             with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
                 for line in f:
                     line = line.strip()
-                    if line:
-                        parts = line.split(": ", 1)
-                        if len(parts) == 2:
-                            modelpage_name, model_hf_url = parts
-                            model_list[model_hf_url] = modelpage_name
             return model_list
         except Exception as e:
             logger.error(f"Failed to read model list: {e}")
             return {}
-    def get_repo_info(self, repo_id):
-        """リポジトリの情報を取得する。"""
-        try:
-            repo_info = self.api.repo_info(repo_id=repo_id, files_metadata=True)
-            file_paths = [sibling.rfilename for sibling in repo_info.siblings]
-            return file_paths
-        except Exception as e:
-            logger.error(f"Failed to get repo info for {repo_id}: {e}")
-            return []
     def process_model(self, model_url: str):
         """指定されたモデルURLを処理する関数。"""
         try:
             model_id = model_url.rstrip("/").split("/")[-1]
             model_info = self.get_model_info(model_id)
             latest_version = model_info.get("modelVersions", [])[0]
             model_file = next(
                 (file for file in latest_version["files"] if file.get('type') == 'Model'),
@@ -520,6 +485,7 @@ class CivitAICrawler:
                 latest_filename = model_file['name']
                 folder = os.path.splitext(latest_filename)[0]
             else:
                 first_file = latest_version["files"][0]
                 latest_filename = first_file['name']
                 folder = os.path.splitext(latest_filename)[0]
@@ -527,27 +493,28 @@ class CivitAICrawler:
             os.makedirs(folder, exist_ok=True)
-            model_list = self.read_model_list()
-            existing_old_version_files = []
-            self.download_model(model_info["modelVersions"], folder, existing_old_version_files)
             self.download_images(model_info["modelVersions"], folder)
             self.save_html_content(model_url, folder)
             self.save_model_info(model_info, folder)
-            # ========== rclone で暗号化フォルダをアップロード ==========
             encrypted_top_name = self.upload_folder_encrypted(folder)
-            # モデルリスト更新
-            modelpage_name = model_info.get("name", "Unnamed Model")
-            # 暗号化されたフォルダを使った有効なURL
-            model_hf_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{encrypted_top_name}"
             with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
-                f.write(f"{modelpage_name}: {model_hf_url}\n")
-            # ローカルフォルダを削除
             if os.path.exists(folder):
                 shutil.rmtree(folder)
@@ -560,11 +527,10 @@ class CivitAICrawler:
             try:
                 login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
-                # model_list.logのダウンロード（暗号化せず上書き）
                 model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
                 shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
-                # ログファイルのダウンロード（暗号化せず上書き）
                 local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
                 shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
@@ -574,18 +540,19 @@ class CivitAICrawler:
                     old_models = json.loads(lines[0]) if len(lines) > 0 else []
                     self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
-                # 新着モデルの取得
                 response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                 response.raise_for_status()
                 latest_models = response.json().get("items", [])
                 latest_model_ids = [item.get("id") for item in latest_models if "id" in item]
-                # 増分の確認
                 new_models = list(set(latest_model_ids) - set(old_models))
                 if new_models:
                     logger.info(f"New models found: {new_models}")
                     model_id = new_models[0]
                     for attempt in range(1, 6):
                         try:
                             self.process_model(f"{self.config.URLS['modelId']}{model_id}")
@@ -597,13 +564,12 @@ class CivitAICrawler:
                             else:
                                 await asyncio.sleep(2)
                 else:
-                    # 新モデルなし → ログ更新して終了
                     with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                         f.write(json.dumps(latest_model_ids) + "\n")
                         f.write(f"{self.repo_ids['current']}\n")
                     logger.info(f"Updated log file: {self.config.LOG_FILE}")
-                    # ログファイルをリポジトリにアップロード（暗号化しない）
                     self.upload_file_raw(
                         file_path=self.config.LOG_FILE,
                         repo_id=self.repo_ids["log"],
@@ -615,16 +581,16 @@ class CivitAICrawler:
                     await asyncio.sleep(60)
                     continue
-                # 古いモデルリストに追加
                 old_models.append(model_id)
-                # ログファイルの更新
                 with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                     f.write(json.dumps(old_models) + "\n")
                     f.write(f"{self.repo_ids['current']}\n")
                 logger.info(f"Updated log file with new model ID: {model_id}")
-                # ログとモデルリストのアップロード（暗号化しない）
                 self.upload_file_raw(
                     file_path=self.config.LOG_FILE,
                     repo_id=self.repo_ids["log"],
@@ -641,7 +607,7 @@ class CivitAICrawler:
                 await asyncio.sleep(300)
-# モジュールレベルでFastAPIのアプリケーションを公開
 config = Config()
 crawler = CivitAICrawler(config)
 app = crawler.app

         if os.path.isdir(self.config.ENCRYPTED_DIR):
             shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
         top_level_name = os.path.basename(local_path.rstrip("/"))
         if not top_level_name:
             top_level_name = "unnamed"
                             logger.warning("Repository file limit exceeded. Creating a new repository...")
                             self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                             self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                             attempt = 0
                             repo_id = self.repo_ids['current']
                             continue
                                 )
                                 raise
     @staticmethod
     def get_filename_from_cd(content_disposition: Optional[str], default_name: str) -> str:
         if content_disposition:
             parts = content_disposition.split(';')
             for part in parts:
         return default_name
     def download_file(self, url: str, destination_folder: str, default_name: str):
         try:
             response = requests.get(url, headers=self.config.HEADERS, stream=True)
             response.raise_for_status()
         logger.info(f"Download completed: {file_path}")
     def get_model_info(self, model_id: str) -> dict:
         try:
             response = requests.get(self.config.URLS["modelId"] + str(model_id), headers=self.config.HEADERS)
             response.raise_for_status()
             logger.error(f"Failed to retrieve model info for ID {model_id}: {e}")
     def download_model(self, model_versions: list, folder: str, existing_old_version_files: list = []):
         latest_version = model_versions[0]
         latest_files = latest_version["files"]
         for file_info in latest_files:
                             logger.error(f"Failed to download {file_name}. Created dummy file {dummy_file_name}. URL: {download_url}")
                         except Exception as e:
                             logger.error(f"Failed to create dummy file for {file_name}: {e}")
     def download_images(self, model_versions: list, folder: str):
         images_folder = os.path.join(folder, "images")
         os.makedirs(images_folder, exist_ok=True)
             except requests.RequestException as e:
                 logger.error(f"Error downloading image {image_url}: {e}")
         try:
             original_cwd = os.getcwd()
             os.chdir(folder)
             shutil.rmtree(images_folder)
     def save_html_content(self, url: str, folder: str):
         try:
             response = requests.get(url)
             response.raise_for_status()
     @staticmethod
     def save_model_info(model_info: dict, folder: str):
         with open(os.path.join(folder, "model_info.json"), "w") as file:
             json.dump(model_info, file, indent=2)
             return f"{repo_id}1"
     # =============================================================================
+    # 暗号化しないアップロード（ログや model_list.log 用）
     # =============================================================================
     def upload_file_raw(
         self,
         repo_id: Optional[str] = None,
         path_in_repo: Optional[str] = None
     ):
         if repo_id is None:
             repo_id = self.repo_ids['current']
         if path_in_repo is None:
                     logger.warning("Repository file limit exceeded, creating a new repository.")
                     self.repo_ids['current'] = self.increment_repo_name(self.repo_ids['current'])
                     self.api.create_repo(repo_id=self.repo_ids['current'], private=True)
                     attempt = 0
                     repo_id = self.repo_ids['current']
                     continue
                         raise
     # =============================================================================
+    # 暗号化してアップロード (単ファイル)
     # =============================================================================
     def upload_file_encrypted(
         self,
         repo_id: Optional[str] = None,
         path_in_repo: Optional[str] = None
     ):
         if repo_id is None:
             repo_id = self.repo_ids['current']
+        base_path = path_in_repo or ""
         self.encrypt_with_rclone(file_path)
         self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
         if os.path.isdir(self.config.ENCRYPTED_DIR):
             shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
     # =============================================================================
+    # 暗号化してアップロード (フォルダ)
     # =============================================================================
     def upload_folder_encrypted(
         self,
         repo_id: Optional[str] = None,
         path_in_repo: Optional[str] = None
     ) -> str:
         if repo_id is None:
             repo_id = self.repo_ids['current']
         base_path = path_in_repo or ""
         self.encrypt_with_rclone(folder_path)
         top_levels = [
             d for d in os.listdir(self.config.ENCRYPTED_DIR)
             if os.path.isdir(os.path.join(self.config.ENCRYPTED_DIR, d))
         encrypted_top_name = top_levels[0]
         self.upload_encrypted_files(repo_id=repo_id, base_path_in_repo=base_path)
         if os.path.isdir(self.config.ENCRYPTED_DIR):
             shutil.rmtree(self.config.ENCRYPTED_DIR, ignore_errors=True)
         return encrypted_top_name
+    # =============================================================================
+    # model_list.log の読み書きを「model_id: model_hf_url」で扱うよう変更
+    # =============================================================================
     def read_model_list(self):
+        """
+        model_list.log の各行を
+          "123456: https://huggingface.co/...encrypted_folder_name"
+        の形式で読み込み、 { "123456": "https://huggingface.co/..."} の dict を返す
+        """
         model_list = {}
         try:
             with open(self.config.LIST_FILE, "r", encoding="utf-8") as f:
                 for line in f:
                     line = line.strip()
+                    if not line:
+                        continue
+                    parts = line.split(": ", 1)
+                    if len(parts) == 2:
+                        stored_id, stored_url = parts
+                        model_list[stored_id] = stored_url
             return model_list
         except Exception as e:
             logger.error(f"Failed to read model list: {e}")
             return {}
     def process_model(self, model_url: str):
         """指定されたモデルURLを処理する関数。"""
         try:
+            # ===== 1) モデルID取得 & モデル情報 =====
             model_id = model_url.rstrip("/").split("/")[-1]
             model_info = self.get_model_info(model_id)
+            if not model_info:
+                logger.error(f"No model info found for ID {model_id}")
+                return
+            # ===== 2) フォルダ名を決める =====
             latest_version = model_info.get("modelVersions", [])[0]
             model_file = next(
                 (file for file in latest_version["files"] if file.get('type') == 'Model'),
                 latest_filename = model_file['name']
                 folder = os.path.splitext(latest_filename)[0]
             else:
+                # 'Model'タイプファイルが無い場合
                 first_file = latest_version["files"][0]
                 latest_filename = first_file['name']
                 folder = os.path.splitext(latest_filename)[0]
             os.makedirs(folder, exist_ok=True)
+            # ===== 3) model_list.log を読んで「既に同IDがあるかチェック」=====
+            current_list = self.read_model_list()
+            if str(model_id) in current_list:
+                logger.info(f"Model ID {model_id} is already in model_list.log. (No skip in this example)")
+            # ===== 4) ダウンロード処理 =====
+            self.download_model(model_info["modelVersions"], folder)
             self.download_images(model_info["modelVersions"], folder)
             self.save_html_content(model_url, folder)
             self.save_model_info(model_info, folder)
+            # ===== 5) rclone で暗号化フォルダをアップロード =====
             encrypted_top_name = self.upload_folder_encrypted(folder)
+            # ===== 6) model_list.log に "{model_id}: {URL}" 形式で追記 =====
+            # 暗号化されたトップフォルダ名をURLに含める
+            final_url = f"https://huggingface.co/{self.repo_ids['current']}/tree/main/{encrypted_top_name}"
             with open(self.config.LIST_FILE, "a", encoding="utf-8") as f:
+                f.write(f"{model_id}: {final_url}\n")
+            # ===== 7) ローカルフォルダ掃除 =====
             if os.path.exists(folder):
                 shutil.rmtree(folder)
             try:
                 login(token=self.config.HUGGINGFACE_API_KEY, add_to_git_credential=True)
+                # model_list.log & civitai_backup.log を取得
                 model_list_path = hf_hub_download(repo_id=self.repo_ids['model_list'], filename=self.config.LIST_FILE)
                 shutil.copyfile(model_list_path, f"./{self.config.LIST_FILE}")
                 local_file_path = hf_hub_download(repo_id=self.repo_ids["log"], filename=self.config.LOG_FILE)
                 shutil.copyfile(local_file_path, f"./{self.config.LOG_FILE}")
                     old_models = json.loads(lines[0]) if len(lines) > 0 else []
                     self.repo_ids["current"] = lines[1] if len(lines) > 1 else ""
+                # 新着モデル確認
                 response = requests.get(self.config.URLS["latest"], headers=self.config.HEADERS)
                 response.raise_for_status()
                 latest_models = response.json().get("items", [])
                 latest_model_ids = [item.get("id") for item in latest_models if "id" in item]
+                # 増分チェック
                 new_models = list(set(latest_model_ids) - set(old_models))
                 if new_models:
                     logger.info(f"New models found: {new_models}")
                     model_id = new_models[0]
                     for attempt in range(1, 6):
                         try:
                             self.process_model(f"{self.config.URLS['modelId']}{model_id}")
                             else:
                                 await asyncio.sleep(2)
                 else:
+                    # 新モデルなし
                     with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                         f.write(json.dumps(latest_model_ids) + "\n")
                         f.write(f"{self.repo_ids['current']}\n")
                     logger.info(f"Updated log file: {self.config.LOG_FILE}")
                     self.upload_file_raw(
                         file_path=self.config.LOG_FILE,
                         repo_id=self.repo_ids["log"],
                     await asyncio.sleep(60)
                     continue
+                # 追加したモデルIDを old_models に追加
                 old_models.append(model_id)
+                # ログファイル更新
                 with open(self.config.LOG_FILE, "w", encoding="utf-8") as f:
                     f.write(json.dumps(old_models) + "\n")
                     f.write(f"{self.repo_ids['current']}\n")
                 logger.info(f"Updated log file with new model ID: {model_id}")
+                # ログとmodel_list.logをアップロード
                 self.upload_file_raw(
                     file_path=self.config.LOG_FILE,
                     repo_id=self.repo_ids["log"],
                 await asyncio.sleep(300)
+# 実行
 config = Config()
 crawler = CivitAICrawler(config)
 app = crawler.app