Spaces:

poltextlab
/

babel_machine

Running

App Files Files Community

kovacsvi committed on Jun 11

Commit

0c08f54

1 Parent(s): 55f07e5

pep8

Browse files

Files changed (1) hide show

utils.py +50 -36

utils.py CHANGED Viewed

@@ -18,7 +18,9 @@ from interfaces.illframes import domains as domains_illframes
 from interfaces.cap import build_huggingface_path as hf_cap_path
 from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
 from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
-from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
 from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
 from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
 from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
@@ -35,14 +37,21 @@ JIT_DIR = "/data/jit_models"
 HF_TOKEN = os.environ["hf_read"]
 # should be a temporary solution
-models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_cap_minor_path("", "social"), hf_ontolisst_path("")]
 # it gets more difficult with cap
 domains_cap = list(domains_cap.values())
 for language in languages_cap:
     for domain in domains_cap:
         models.append(hf_cap_path(language, domain))
 # cap media
 models.append(hf_cap_media_path("", ""))
@@ -51,36 +60,35 @@ models.append(hf_cap_media2_path("", ""))
 # cap minor media
 models.append(hf_cap_minor_media_path("", "", False))
 # emotion9
 for language in languages_emotion9:
     models.append(hf_emotion9_path(language))
 # illframes (domains is a dict for some reason?)
 for domain in domains_illframes.values():
     models.append(hf_illframes_path(domain))
 tokenizers = ["xlm-roberta-large"]
 def download_hf_models():
     os.makedirs(JIT_DIR, exist_ok=True)
     for model_id in models:
         print(f"Downloading + JIT tracing model: {model_id}")
         safe_model_name = model_id.replace("/", "_")
         traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
         if os.path.exists(traced_model_path):
             delete_unused_bin_files(model_id)
             print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
         else:
             print(f"⚙️  Tracing and saving: {traced_model_path}")
             model = AutoModelForSequenceClassification.from_pretrained(
-                model_id,
-                token=HF_TOKEN,
-                device_map="auto"
             )
             tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
@@ -92,36 +100,39 @@ def download_hf_models():
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
-                max_length=64
             )
             # JIT trace
             traced_model = torch.jit.trace(
                 model,
                 (dummy_input["input_ids"], dummy_input["attention_mask"]),
-                strict=False
             )
             # Save traced model
             traced_model.save(traced_model_path)
             print(f"✔️ Saved JIT model to: {traced_model_path}")
 def df_h():
     # Print a two-part disk report to stdout: the output of `df -H`, followed
     # by the output of `du -h --max-depth=2` for /data/.
     # Both commands run with captured text output; only stdout is printed, and
     # neither the exit status nor stderr is inspected.
     df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
     print("=== Disk Free Space (df -H) ===")
     print(df_result.stdout)
-    du_result = subprocess.run(["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True)
     print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
     print(du_result.stdout)
 def delete_unused_bin_files(model_id: str):
     target_path = f"/data/models--poltextlab--{model_id}"
     # delete files in blobs/
     blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
     # delete .bin files in snapshots/, except config.json
     snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)
@@ -136,16 +147,16 @@ def delete_unused_bin_files(model_id: str):
         elif os.path.isdir(path):
             print(f"Deleting directory: {path}")
             shutil.rmtree(path)
 def delete_http_folders():
     """Remove every directory matching /data/http*, logging each one."""
     # Matches that are not directories are left in place.
     for candidate in filter(os.path.isdir, glob.glob("/data/http*")):
         print(f"Deleting: {candidate}")
         shutil.rmtree(candidate)
 @contextmanager
 def hf_cleanup():
     delete_http_folders()
@@ -153,13 +164,15 @@ def hf_cleanup():
         yield
     finally:
         delete_http_folders()
 def scan_cache():
     # Scan Hugging Face model cache
-    cache_dir = os.environ.get("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers"))
     scan_result = scan_cache_dir(cache_dir)
     print("=== 🤗 Hugging Face Model Cache ===")
     print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
     print(f"Number of repos: {len(scan_result.repos)}")
@@ -178,16 +191,17 @@ def scan_cache():
             size = os.path.getsize(path)
             total_size += size
             print(f"- {filename}: {size / 1e6:.2f} MB")
     print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")
-def set_hf_cache_dir(path:str):
     # Store `path` in four environment variables of the current process:
     # TRANSFORMERS_CACHE, HF_HOME, HF_DATASETS_CACHE and TORCH_HOME.
     # NOTE(review): presumably this has to run before the libraries that read
     # these variables are initialised — confirm against the call site.
-    os.environ['TRANSFORMERS_CACHE'] = path
-    os.environ['HF_HOME'] = path
-    os.environ['HF_DATASETS_CACHE'] = path
-    os.environ['TORCH_HOME'] = path
 def set_torch_threads():
     torch.set_num_threads(1)
     os.environ["OMP_NUM_THREADS"] = "1"
@@ -196,8 +210,8 @@ def set_torch_threads():
 def is_disk_full(min_free_space_in_GB=10):
     total, used, free = shutil.disk_usage("/")
-    free_gb = free / (1024 ** 3)
     if free_gb >= min_free_space_in_GB:
         return False
     else:

 from interfaces.cap import build_huggingface_path as hf_cap_path
 from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
 from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
+from interfaces.cap_media_demo import (
+    build_huggingface_path as hf_cap_media_path,
+)  # why... just follow the name template the next time pls
 from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
 from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
 from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
 HF_TOKEN = os.environ["hf_read"]
 # should be a temporary solution
+models = [
+    hf_manifesto_path(""),
+    hf_sentiment_path(""),
+    hf_emotion_path(""),
+    hf_cap_minor_path("", ""),
+    hf_cap_minor_path("", "social"),
+    hf_ontolisst_path(""),
+]
 # it gets more difficult with cap
 domains_cap = list(domains_cap.values())
 for language in languages_cap:
     for domain in domains_cap:
         models.append(hf_cap_path(language, domain))
 # cap media
 models.append(hf_cap_media_path("", ""))
 # cap minor media
 models.append(hf_cap_minor_media_path("", "", False))
 # emotion9
 for language in languages_emotion9:
     models.append(hf_emotion9_path(language))
 # illframes (domains is a dict for some reason?)
 for domain in domains_illframes.values():
     models.append(hf_illframes_path(domain))
 tokenizers = ["xlm-roberta-large"]
 def download_hf_models():
     os.makedirs(JIT_DIR, exist_ok=True)
     for model_id in models:
         print(f"Downloading + JIT tracing model: {model_id}")
         safe_model_name = model_id.replace("/", "_")
         traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
         if os.path.exists(traced_model_path):
             delete_unused_bin_files(model_id)
             print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
         else:
             print(f"⚙️  Tracing and saving: {traced_model_path}")
             model = AutoModelForSequenceClassification.from_pretrained(
+                model_id, token=HF_TOKEN, device_map="auto"
             )
             tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
                 return_tensors="pt",
                 padding=True,
                 truncation=True,
+                max_length=64,
             )
             # JIT trace
             traced_model = torch.jit.trace(
                 model,
                 (dummy_input["input_ids"], dummy_input["attention_mask"]),
+                strict=False,
             )
             # Save traced model
             traced_model.save(traced_model_path)
             print(f"✔️ Saved JIT model to: {traced_model_path}")
 def df_h():
     # Print a two-part disk report to stdout: the output of `df -H`, followed
     # by the output of `du -h --max-depth=2` for /data/.
     # Both commands run with captured text output; only stdout is printed, and
     # neither the exit status nor stderr is inspected.
     df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
     print("=== Disk Free Space (df -H) ===")
     print(df_result.stdout)
+    du_result = subprocess.run(
+        ["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True
+    )
     print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
     print(du_result.stdout)
 def delete_unused_bin_files(model_id: str):
     target_path = f"/data/models--poltextlab--{model_id}"
     # delete files in blobs/
     blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
     # delete .bin files in snapshots/, except config.json
     snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)
         elif os.path.isdir(path):
             print(f"Deleting directory: {path}")
             shutil.rmtree(path)
 def delete_http_folders():
     """Remove every directory matching /data/http*, logging each one."""
     # Matches that are not directories are left in place.
     for candidate in filter(os.path.isdir, glob.glob("/data/http*")):
         print(f"Deleting: {candidate}")
         shutil.rmtree(candidate)
 @contextmanager
 def hf_cleanup():
     delete_http_folders()
         yield
     finally:
         delete_http_folders()
 def scan_cache():
     # Scan Hugging Face model cache
+    cache_dir = os.environ.get(
+        "TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers")
+    )
     scan_result = scan_cache_dir(cache_dir)
     print("=== 🤗 Hugging Face Model Cache ===")
     print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
     print(f"Number of repos: {len(scan_result.repos)}")
             size = os.path.getsize(path)
             total_size += size
             print(f"- {filename}: {size / 1e6:.2f} MB")
     print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")
+def set_hf_cache_dir(path: str):
     # Store `path` in four environment variables of the current process:
     # TRANSFORMERS_CACHE, HF_HOME, HF_DATASETS_CACHE and TORCH_HOME.
     # NOTE(review): presumably this has to run before the libraries that read
     # these variables are initialised — confirm against the call site.
+    os.environ["TRANSFORMERS_CACHE"] = path
+    os.environ["HF_HOME"] = path
+    os.environ["HF_DATASETS_CACHE"] = path
+    os.environ["TORCH_HOME"] = path
 def set_torch_threads():
     torch.set_num_threads(1)
     os.environ["OMP_NUM_THREADS"] = "1"
 def is_disk_full(min_free_space_in_GB=10):
     total, used, free = shutil.disk_usage("/")
+    free_gb = free / (1024**3)
     if free_gb >= min_free_space_in_GB:
         return False
     else: