Spaces: Running on Zero

Commit: attempt checkpointing
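
The change adds batch-level checkpointing to the GPU embedding pass: the accumulated embeddings are written to __emb_ckpt.npz under models_root after every batch, a compatible checkpoint is reloaded on startup so encoding resumes where it left off, batches that raise are filled with zero vectors instead of being dropped, and the checkpoint file is deleted once the similarity pairs have been computed.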
modular_graph_and_candidates.py  CHANGED  (+52 -22)
@@ -95,7 +95,6 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
             out[(m1, m2)] = s
     return out

-
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
@@ -113,11 +112,10 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl

     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
-        # Skip models that cause GPU task aborts
         if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
             print(f"Skipping {name} (causes GPU abort)")
             continue
-
+
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
             try:
@@ -130,29 +128,54 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []

     print(f"Encoding embeddings for {len(names)} models...")
-    batch_size = 4 #
-
-    for i in range(0, len(names), batch_size):
+    batch_size = 4 # keep your default
+
+    # ── checkpoint / resume ────────────────────────────────────────────────────
+    ckpt_path = models_root / "__emb_ckpt.npz"
+    start_idx = 0
+    emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
+
+    if ckpt_path.exists():
+        try:
+            ckpt = np.load(ckpt_path, allow_pickle=True)
+            ckpt_names = list(ckpt["names"])
+            if names[:len(ckpt_names)] == ckpt_names:
+                loaded = ckpt["embeddings"].astype(np.float32)
+                all_embeddings.append(loaded)
+                start_idx = len(ckpt_names)
+                print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
+        except Exception as e:
+            print(f"⚠️ Failed to load checkpoint: {type(e).__name__}: {e}")
+    # ───────────────────────────────────────────────────────────────────────────
+
+    for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
         batch_names = names[i:i+batch_size]
         batch_texts = [texts[name] for name in batch_names]
-
+
         try:
             print(f"Processing batch: {batch_names}")
             emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
-            all_embeddings.append(emb)
-            print(f"✅ Completed batch of {len(batch_names)} models")
-
-            # Clear GPU cache every 3 batches to prevent memory accumulation
-            if i % (3 * batch_size) == 0 and torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize() # Force GPU sync
-                print(f"🧹 Cleared GPU cache after batch {i//batch_size + 1}")
-
         except Exception as e:
             print(f"⚠️ GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
-
-
-
+            emb = np.zeros((len(batch_names), emb_dim), dtype=np.float32)
+
+        all_embeddings.append(emb)
+
+        # save checkpoint after each batch
+        try:
+            cur = np.vstack(all_embeddings).astype(np.float32)
+            np.savez(
+                ckpt_path,
+                embeddings=cur,
+                names=np.array(names[:i+len(batch_names)], dtype=object),
+            )
+        except Exception as e:
+            print(f"⚠️ Failed to write checkpoint: {type(e).__name__}: {e}")
+
+        if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            print(f"🧹 Cleared GPU cache after batch {(i - start_idx)//batch_size + 1}")

     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
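
For orientation, a minimal self-contained sketch of the resume protocol the hunk above introduces: load the .npz, resume only when the saved names are a prefix of the current list, and overwrite the whole checkpoint after each batch. Here fake_encode and the demo path are hypothetical stand-ins for model.encode and the real checkpoint file, not part of the commit:

import numpy as np
from pathlib import Path

def fake_encode(texts, dim=8):
    # deterministic stand-in for model.encode
    return np.stack([np.full(dim, float(len(t)), dtype=np.float32) for t in texts])

def encode_with_checkpoint(names, texts, ckpt_path: Path, batch_size=4, dim=8):
    all_embeddings, start_idx = [], 0
    if ckpt_path.exists():
        ckpt = np.load(ckpt_path, allow_pickle=True)
        ckpt_names = list(ckpt["names"])
        if names[:len(ckpt_names)] == ckpt_names:   # resume only on a prefix match
            all_embeddings.append(ckpt["embeddings"].astype(np.float32))
            start_idx = len(ckpt_names)
    for i in range(start_idx, len(names), batch_size):
        batch = names[i:i + batch_size]
        all_embeddings.append(fake_encode([texts[n] for n in batch], dim))
        # overwrite the checkpoint with everything encoded so far
        np.savez(ckpt_path,
                 embeddings=np.vstack(all_embeddings).astype(np.float32),
                 names=np.array(names[:i + len(batch)], dtype=object))
    return np.vstack(all_embeddings)

names = [f"model_{k}" for k in range(10)]
texts = {n: "x" * (k + 1) for k, n in enumerate(names)}
ckpt = Path("__emb_ckpt_demo.npz")
full = encode_with_checkpoint(names, texts, ckpt)      # first pass writes checkpoints
resumed = encode_with_checkpoint(names, texts, ckpt)   # second pass resumes, re-encodes nothing
assert np.allclose(full, resumed)
ckpt.unlink()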
@@ -162,19 +185,26 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     sims_mat = embeddings @ embeddings.T

     out = {}
-    matrix_size = embeddings.shape[0]
-    processed_names = names[:matrix_size]
-
+    matrix_size = embeddings.shape[0]
+    processed_names = names[:matrix_size]
     for i in range(matrix_size):
         for j in range(i + 1, matrix_size):
             s = float(sims_mat[i, j])
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
+
+    # best-effort cleanup
+    try:
+        ckpt_path.unlink()
+    except Exception:
+        pass
+
     return out




+
 # ────────────────────────────────────────────────────────────────────────────────
 # 2) Scan *modular_*.py* files to build an import-dependency graph
 #    → only **modeling_*** imports are considered (skip configuration / processing)
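
A side effect of the zero-vector fallback in the previous hunk, relevant to this one: after the norms normalization a zero row stays numerically zero, so a model whose batch failed has ~0 cosine similarity to everything and can never reach thr; its pairs are simply absent from out. A quick check, using the same normalization as the diff:

import numpy as np
emb = np.vstack([np.ones(4, dtype=np.float32), np.zeros(4, dtype=np.float32)])
norms = np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12
emb = emb / norms
print((emb @ emb.T)[0, 1])   # 0.0: a failed (zeroed) model never crosses the threshold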
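
One caveat on the save path, not addressed in this commit: np.savez is not atomic, so a task abort mid-write can leave a truncated __emb_ckpt.npz (the try/except around np.load absorbs that on the read side by starting over from scratch). A common hardening, sketched here under the same file layout with a hypothetical atomic_savez helper, is to write to a temporary file and rename:

import os
import numpy as np
from pathlib import Path

def atomic_savez(path: Path, **arrays):
    tmp = path.with_suffix(".tmp.npz")
    np.savez(tmp, **arrays)
    os.replace(tmp, path)  # rename is atomic on POSIX: readers see the old or the new file, never a partial one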