kovacsvi committed on
Commit
0c08f54
·
1 Parent(s): 55f07e5
Files changed (1) hide show
  1. utils.py +50 -36
utils.py CHANGED
@@ -18,7 +18,9 @@ from interfaces.illframes import domains as domains_illframes
18
  from interfaces.cap import build_huggingface_path as hf_cap_path
19
  from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
20
  from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
21
- from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
 
 
22
  from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
23
  from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
24
  from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
@@ -35,14 +37,21 @@ JIT_DIR = "/data/jit_models"
35
  HF_TOKEN = os.environ["hf_read"]
36
 
37
  # should be a temporary solution
38
- models = [hf_manifesto_path(""), hf_sentiment_path(""), hf_emotion_path(""), hf_cap_minor_path("", ""), hf_cap_minor_path("", "social"), hf_ontolisst_path("")]
 
 
 
 
 
 
 
39
 
40
  # it gets more difficult with cap
41
  domains_cap = list(domains_cap.values())
42
  for language in languages_cap:
43
  for domain in domains_cap:
44
  models.append(hf_cap_path(language, domain))
45
-
46
  # cap media
47
  models.append(hf_cap_media_path("", ""))
48
 
@@ -51,36 +60,35 @@ models.append(hf_cap_media2_path("", ""))
51
 
52
  # cap minor media
53
  models.append(hf_cap_minor_media_path("", "", False))
54
-
55
  # emotion9
56
  for language in languages_emotion9:
57
  models.append(hf_emotion9_path(language))
58
-
59
  # illframes (domains is a dict for some reason?)
60
  for domain in domains_illframes.values():
61
  models.append(hf_illframes_path(domain))
62
 
63
  tokenizers = ["xlm-roberta-large"]
64
 
 
65
  def download_hf_models():
66
  os.makedirs(JIT_DIR, exist_ok=True)
67
 
68
  for model_id in models:
69
  print(f"Downloading + JIT tracing model: {model_id}")
70
-
71
  safe_model_name = model_id.replace("/", "_")
72
  traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
73
-
74
  if os.path.exists(traced_model_path):
75
  delete_unused_bin_files(model_id)
76
  print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
77
  else:
78
  print(f"⚙️ Tracing and saving: {traced_model_path}")
79
-
80
  model = AutoModelForSequenceClassification.from_pretrained(
81
- model_id,
82
- token=HF_TOKEN,
83
- device_map="auto"
84
  )
85
  tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
86
 
@@ -92,36 +100,39 @@ def download_hf_models():
92
  return_tensors="pt",
93
  padding=True,
94
  truncation=True,
95
- max_length=64
96
  )
97
 
98
  # JIT trace
99
  traced_model = torch.jit.trace(
100
  model,
101
  (dummy_input["input_ids"], dummy_input["attention_mask"]),
102
- strict=False
103
  )
104
 
105
  # Save traced model
106
  traced_model.save(traced_model_path)
107
  print(f"✔️ Saved JIT model to: {traced_model_path}")
108
-
 
109
  def df_h():
110
  df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
111
  print("=== Disk Free Space (df -H) ===")
112
  print(df_result.stdout)
113
 
114
- du_result = subprocess.run(["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True)
 
 
115
  print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
116
  print(du_result.stdout)
117
-
118
 
119
  def delete_unused_bin_files(model_id: str):
120
  target_path = f"/data/models--poltextlab--{model_id}"
121
 
122
  # delete files in blobs/
123
  blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
124
-
125
  # delete .bin files in snapshots/, except config.json
126
  snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)
127
 
@@ -136,16 +147,16 @@ def delete_unused_bin_files(model_id: str):
136
  elif os.path.isdir(path):
137
  print(f"Deleting directory: {path}")
138
  shutil.rmtree(path)
139
-
140
-
141
  def delete_http_folders():
142
  http_folders = glob.glob("/data/http*")
143
  for folder in http_folders:
144
  if os.path.isdir(folder):
145
  print(f"Deleting: {folder}")
146
  shutil.rmtree(folder)
147
-
148
-
149
  @contextmanager
150
  def hf_cleanup():
151
  delete_http_folders()
@@ -153,13 +164,15 @@ def hf_cleanup():
153
  yield
154
  finally:
155
  delete_http_folders()
156
-
157
-
158
  def scan_cache():
159
  # Scan Hugging Face model cache
160
- cache_dir = os.environ.get("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers"))
 
 
161
  scan_result = scan_cache_dir(cache_dir)
162
-
163
  print("=== 🤗 Hugging Face Model Cache ===")
164
  print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
165
  print(f"Number of repos: {len(scan_result.repos)}")
@@ -178,16 +191,17 @@ def scan_cache():
178
  size = os.path.getsize(path)
179
  total_size += size
180
  print(f"- {filename}: {size / 1e6:.2f} MB")
181
-
182
  print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")
183
-
184
- def set_hf_cache_dir(path:str):
185
- os.environ['TRANSFORMERS_CACHE'] = path
186
- os.environ['HF_HOME'] = path
187
- os.environ['HF_DATASETS_CACHE'] = path
188
- os.environ['TORCH_HOME'] = path
189
-
190
-
 
191
  def set_torch_threads():
192
  torch.set_num_threads(1)
193
  os.environ["OMP_NUM_THREADS"] = "1"
@@ -196,8 +210,8 @@ def set_torch_threads():
196
 
197
  def is_disk_full(min_free_space_in_GB=10):
198
  total, used, free = shutil.disk_usage("/")
199
- free_gb = free / (1024 ** 3)
200
-
201
  if free_gb >= min_free_space_in_GB:
202
  return False
203
  else:
 
18
  from interfaces.cap import build_huggingface_path as hf_cap_path
19
  from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
20
  from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
21
+ from interfaces.cap_media_demo import (
22
+ build_huggingface_path as hf_cap_media_path,
23
+ ) # why... just follow the name template the next time pls
24
  from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
25
  from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
26
  from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
 
37
  HF_TOKEN = os.environ["hf_read"]
38
 
39
  # should be a temporary solution
40
+ models = [
41
+ hf_manifesto_path(""),
42
+ hf_sentiment_path(""),
43
+ hf_emotion_path(""),
44
+ hf_cap_minor_path("", ""),
45
+ hf_cap_minor_path("", "social"),
46
+ hf_ontolisst_path(""),
47
+ ]
48
 
49
  # it gets more difficult with cap
50
  domains_cap = list(domains_cap.values())
51
  for language in languages_cap:
52
  for domain in domains_cap:
53
  models.append(hf_cap_path(language, domain))
54
+
55
  # cap media
56
  models.append(hf_cap_media_path("", ""))
57
 
 
60
 
61
  # cap minor media
62
  models.append(hf_cap_minor_media_path("", "", False))
63
+
64
  # emotion9
65
  for language in languages_emotion9:
66
  models.append(hf_emotion9_path(language))
67
+
68
  # illframes (domains is a dict for some reason?)
69
  for domain in domains_illframes.values():
70
  models.append(hf_illframes_path(domain))
71
 
72
  tokenizers = ["xlm-roberta-large"]
73
 
74
+
75
  def download_hf_models():
76
  os.makedirs(JIT_DIR, exist_ok=True)
77
 
78
  for model_id in models:
79
  print(f"Downloading + JIT tracing model: {model_id}")
80
+
81
  safe_model_name = model_id.replace("/", "_")
82
  traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")
83
+
84
  if os.path.exists(traced_model_path):
85
  delete_unused_bin_files(model_id)
86
  print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
87
  else:
88
  print(f"⚙️ Tracing and saving: {traced_model_path}")
89
+
90
  model = AutoModelForSequenceClassification.from_pretrained(
91
+ model_id, token=HF_TOKEN, device_map="auto"
 
 
92
  )
93
  tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
94
 
 
100
  return_tensors="pt",
101
  padding=True,
102
  truncation=True,
103
+ max_length=64,
104
  )
105
 
106
  # JIT trace
107
  traced_model = torch.jit.trace(
108
  model,
109
  (dummy_input["input_ids"], dummy_input["attention_mask"]),
110
+ strict=False,
111
  )
112
 
113
  # Save traced model
114
  traced_model.save(traced_model_path)
115
  print(f"✔️ Saved JIT model to: {traced_model_path}")
116
+
117
+
118
  def df_h():
119
  df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
120
  print("=== Disk Free Space (df -H) ===")
121
  print(df_result.stdout)
122
 
123
+ du_result = subprocess.run(
124
+ ["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True
125
+ )
126
  print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
127
  print(du_result.stdout)
128
+
129
 
130
  def delete_unused_bin_files(model_id: str):
131
  target_path = f"/data/models--poltextlab--{model_id}"
132
 
133
  # delete files in blobs/
134
  blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
135
+
136
  # delete .bin files in snapshots/, except config.json
137
  snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)
138
 
 
147
  elif os.path.isdir(path):
148
  print(f"Deleting directory: {path}")
149
  shutil.rmtree(path)
150
+
151
+
152
  def delete_http_folders():
153
  http_folders = glob.glob("/data/http*")
154
  for folder in http_folders:
155
  if os.path.isdir(folder):
156
  print(f"Deleting: {folder}")
157
  shutil.rmtree(folder)
158
+
159
+
160
  @contextmanager
161
  def hf_cleanup():
162
  delete_http_folders()
 
164
  yield
165
  finally:
166
  delete_http_folders()
167
+
168
+
169
  def scan_cache():
170
  # Scan Hugging Face model cache
171
+ cache_dir = os.environ.get(
172
+ "TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers")
173
+ )
174
  scan_result = scan_cache_dir(cache_dir)
175
+
176
  print("=== 🤗 Hugging Face Model Cache ===")
177
  print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
178
  print(f"Number of repos: {len(scan_result.repos)}")
 
191
  size = os.path.getsize(path)
192
  total_size += size
193
  print(f"- {filename}: {size / 1e6:.2f} MB")
194
+
195
  print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")
196
+
197
+
198
+ def set_hf_cache_dir(path: str):
199
+ os.environ["TRANSFORMERS_CACHE"] = path
200
+ os.environ["HF_HOME"] = path
201
+ os.environ["HF_DATASETS_CACHE"] = path
202
+ os.environ["TORCH_HOME"] = path
203
+
204
+
205
  def set_torch_threads():
206
  torch.set_num_threads(1)
207
  os.environ["OMP_NUM_THREADS"] = "1"
 
210
 
211
  def is_disk_full(min_free_space_in_GB=10):
212
  total, used, free = shutil.disk_usage("/")
213
+ free_gb = free / (1024**3)
214
+
215
  if free_gb >= min_free_space_in_GB:
216
  return False
217
  else: