Molbap HF Staff committed on
Commit
4fa1ace
·
1 Parent(s): b702ae2

attempt checkpointing

Browse files
Files changed (1) hide show
  1. modular_graph_and_candidates.py +52 -22
modular_graph_and_candidates.py CHANGED
@@ -95,7 +95,6 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
95
  out[(m1, m2)] = s
96
  return out
97
 
98
-
99
  @spaces.GPU
100
  def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
101
  model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
@@ -113,11 +112,10 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
113
 
114
  texts = {}
115
  for name in tqdm(missing, desc="Reading modeling files"):
116
- # Skip models that cause GPU task aborts
117
  if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
118
  print(f"Skipping {name} (causes GPU abort)")
119
  continue
120
-
121
  code = ""
122
  for py in (models_root / name).rglob("modeling_*.py"):
123
  try:
@@ -130,29 +128,54 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
130
  all_embeddings = []
131
 
132
  print(f"Encoding embeddings for {len(names)} models...")
133
- batch_size = 4 # Reduced to be more conservative
134
-
135
- for i in tqdm(range(0, len(names), batch_size), desc="Batches", leave=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  batch_names = names[i:i+batch_size]
137
  batch_texts = [texts[name] for name in batch_names]
138
-
139
  try:
140
  print(f"Processing batch: {batch_names}")
141
  emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
142
- all_embeddings.append(emb)
143
- print(f"✓ Completed batch of {len(batch_names)} models")
144
-
145
- # Clear GPU cache every 3 batches to prevent memory accumulation
146
- if i % (3 * batch_size) == 0 and torch.cuda.is_available():
147
- torch.cuda.empty_cache()
148
- torch.cuda.synchronize() # Force GPU sync
149
- print(f"🧹 Cleared GPU cache after batch {i//batch_size + 1}")
150
-
151
  except Exception as e:
152
  print(f"⚠️ GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
153
- # Create zero embeddings for all models in failed batch
154
- zero_emb = np.zeros((len(batch_names), model.get_sentence_embedding_dimension()), dtype=np.float32)
155
- all_embeddings.append(zero_emb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  embeddings = np.vstack(all_embeddings).astype(np.float32)
158
  norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
@@ -162,19 +185,26 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
162
  sims_mat = embeddings @ embeddings.T
163
 
164
  out = {}
165
- matrix_size = embeddings.shape[0] # Actual number of embeddings we have
166
- processed_names = names[:matrix_size] # Only use names that have embeddings
167
-
168
  for i in range(matrix_size):
169
  for j in range(i + 1, matrix_size):
170
  s = float(sims_mat[i, j])
171
  if s >= thr:
172
  out[(processed_names[i], processed_names[j])] = s
 
 
 
 
 
 
 
173
  return out
174
 
175
 
176
 
177
 
 
178
  # ────────────────────────────────────────────────────────────────────────────────
179
  # 2) Scan *modular_*.py* files to build an import‑dependency graph
180
  # – only **modeling_*** imports are considered (skip configuration / processing)
 
95
  out[(m1, m2)] = s
96
  return out
97
 
 
98
  @spaces.GPU
99
  def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
100
  model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
 
112
 
113
  texts = {}
114
  for name in tqdm(missing, desc="Reading modeling files"):
 
115
  if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
116
  print(f"Skipping {name} (causes GPU abort)")
117
  continue
118
+
119
  code = ""
120
  for py in (models_root / name).rglob("modeling_*.py"):
121
  try:
 
128
  all_embeddings = []
129
 
130
  print(f"Encoding embeddings for {len(names)} models...")
131
+ batch_size = 4 # keep your default
132
+
133
+ # ── checkpoint / resume ────────────────────────────────────────────────────
134
+ ckpt_path = models_root / "__emb_ckpt.npz"
135
+ start_idx = 0
136
+ emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
137
+
138
+ if ckpt_path.exists():
139
+ try:
140
+ ckpt = np.load(ckpt_path, allow_pickle=True)
141
+ ckpt_names = list(ckpt["names"])
142
+ if names[:len(ckpt_names)] == ckpt_names:
143
+ loaded = ckpt["embeddings"].astype(np.float32)
144
+ all_embeddings.append(loaded)
145
+ start_idx = len(ckpt_names)
146
+ print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
147
+ except Exception as e:
148
+ print(f"⚠️ Failed to load checkpoint: {type(e).__name__}: {e}")
149
+ # ───────────────────────────────────────────────────────────────────────────
150
+
151
+ for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
152
  batch_names = names[i:i+batch_size]
153
  batch_texts = [texts[name] for name in batch_names]
154
+
155
  try:
156
  print(f"Processing batch: {batch_names}")
157
  emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
 
 
 
 
 
 
 
 
 
158
  except Exception as e:
159
  print(f"⚠️ GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
160
+ emb = np.zeros((len(batch_names), emb_dim), dtype=np.float32)
161
+
162
+ all_embeddings.append(emb)
163
+
164
+ # save checkpoint after each batch
165
+ try:
166
+ cur = np.vstack(all_embeddings).astype(np.float32)
167
+ np.savez(
168
+ ckpt_path,
169
+ embeddings=cur,
170
+ names=np.array(names[:i+len(batch_names)], dtype=object),
171
+ )
172
+ except Exception as e:
173
+ print(f"⚠️ Failed to write checkpoint: {type(e).__name__}: {e}")
174
+
175
+ if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
176
+ torch.cuda.empty_cache()
177
+ torch.cuda.synchronize()
178
+ print(f"🧹 Cleared GPU cache after batch {(i - start_idx)//batch_size + 1}")
179
 
180
  embeddings = np.vstack(all_embeddings).astype(np.float32)
181
  norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
 
185
  sims_mat = embeddings @ embeddings.T
186
 
187
  out = {}
188
+ matrix_size = embeddings.shape[0]
189
+ processed_names = names[:matrix_size]
 
190
  for i in range(matrix_size):
191
  for j in range(i + 1, matrix_size):
192
  s = float(sims_mat[i, j])
193
  if s >= thr:
194
  out[(processed_names[i], processed_names[j])] = s
195
+
196
+ # best-effort cleanup
197
+ try:
198
+ ckpt_path.unlink()
199
+ except Exception:
200
+ pass
201
+
202
  return out
203
 
204
 
205
 
206
 
207
+
208
  # ────────────────────────────────────────────────────────────────────────────────
209
  # 2) Scan *modular_*.py* files to build an import‑dependency graph
210
  # – only **modeling_*** imports are considered (skip configuration / processing)