Update modeling_fastesm.py
modeling_fastesm.py +16 -16
```diff
@@ -442,22 +442,22 @@ class FastEsmPreTrainedModel(PreTrainedModel):
         to_embed = [seq for seq in sequences if seq not in already_embedded]
         print(f"Found {len(already_embedded)} already embedded sequences in {sql_db_path}")
         print(f"Embedding {len(to_embed)} new sequences")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if len(to_embed) > 0:
+            with torch.no_grad():
+                for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc='Embedding batches'):
+                    seqs = sequences[i * batch_size:(i + 1) * batch_size]
+                    input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
+                    residue_embeddings = self.forward(input_ids, attention_mask, output_hidden_states=True).hidden_states[-1].detach().float()  # required for sql
+                    embeddings = get_embeddings(residue_embeddings, attention_mask).cpu()
+
+                    for seq, emb in zip(seqs, embeddings):
+                        c.execute("INSERT OR REPLACE INTO embeddings VALUES (?, ?)",
+                                  (seq, emb.cpu().numpy().tobytes()))
+
+                    if (i + 1) % 100 == 0:
+                        conn.commit()
+
+            conn.commit()
         conn.close()
         return None
```
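The hunk calls a `get_embeddings(residue_embeddings, attention_mask)` helper that is defined elsewhere in modeling_fastesm.py and not visible in this diff. A common choice for collapsing per-residue hidden states into one vector per sequence is attention-masked mean pooling; the sketch below assumes that and is not necessarily the repo's actual implementation.

```python
import torch

def get_embeddings(residue_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Hypothetical masked mean pool over the residue dimension.
    # residue_embeddings: (batch, seq_len, hidden); attention_mask: (batch, seq_len)
    mask = attention_mask.unsqueeze(-1).to(residue_embeddings.dtype)  # (batch, seq_len, 1)
    summed = (residue_embeddings * mask).sum(dim=1)                   # (batch, hidden)
    counts = mask.sum(dim=1).clamp(min=1)                             # guard against all-padding rows
    return summed / counts
```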
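The loop caches one float32 blob per sequence via `INSERT OR REPLACE INTO embeddings VALUES (?, ?)`. A minimal sketch of reading the cache back, assuming only the two-column schema implied by that statement (sequence text, embedding bytes) and the float32 dtype forced by the `.float()` cast:

```python
import sqlite3
import numpy as np

def load_cached_embeddings(sql_db_path: str) -> dict[str, np.ndarray]:
    # Iterate over the two-column embeddings table written by the loop above.
    conn = sqlite3.connect(sql_db_path)
    embeddings = {}
    for seq, blob in conn.execute("SELECT * FROM embeddings"):
        # Blobs were written with tensor.numpy().tobytes() on float32 tensors.
        embeddings[seq] = np.frombuffer(blob, dtype=np.float32)
    conn.close()
    return embeddings
```

Note that `np.frombuffer` returns a read-only view over the blob; call `.copy()` on the result if the arrays need to be writable.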