AbstractPhil committed
Commit 5e20a2a · verified · 1 Parent(s): 0a14990

Update app.py

Files changed (1)
  1. app.py +112 -88
app.py CHANGED
@@ -1,6 +1,7 @@
-# app.py – encoder-only + masking accuracy demo for bert-beatrix-2048
-# -----------------------------------------------------------------
-# launch: python app.py   (UI at http://localhost:7860)
 
 import json, re, sys
 from pathlib import Path, PurePosixPath
@@ -9,37 +10,48 @@ import gradio as gr
 import spaces
 import torch
 from huggingface_hub import snapshot_download
 from bert_handler import create_handler_from_checkpoint
 
 # ------------------------------------------------------------------
-# 0. download repo + patch auto_map --------------------------------
-REPO_ID = "AbstractPhil/bert-beatrix-2048"
-LOCAL_CK = "bert-beatrix-2048"
-snapshot_download(repo_id=REPO_ID, local_dir=LOCAL_CK, local_dir_use_symlinks=False)
-
-cfg_p = Path(LOCAL_CK) / "config.json"
-with cfg_p.open() as f:
-    cfg = json.load(f)
-for k, v in cfg.get("auto_map", {}).items():
     if "--" in v:
-        cfg["auto_map"][k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
-with cfg_p.open("w") as f:
-    json.dump(cfg, f, indent=2)
 
 # ------------------------------------------------------------------
-# 1. load model / tokenizer ---------------------------------------
-handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CK)
 full_model = full_model.eval().cuda()
 
-encoder = full_model.bert.encoder
-embeddings = full_model.bert.embeddings
-emb_ln = full_model.bert.emb_ln
-emb_drop = full_model.bert.emb_drop
 
-MASK = tokenizer.mask_token or "[MASK]"
 
 # ------------------------------------------------------------------
-# 2. symbolic role list -------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<surface>", "<lighting>", "<material>", "<accessory>", "<footwear>",
@@ -48,96 +60,108 @@ SYMBOLIC_ROLES = [
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
-miss = [t for t in SYMBOLIC_ROLES
-        if tokenizer.convert_tokens_to_ids(t) == tokenizer.unk_token_id]
-if miss:
-    sys.exit(f"❌ Tokenizer missing {miss}")
 
 # ------------------------------------------------------------------
-# 3. inference util ----------------------------------------------
 @spaces.GPU
-def encode_and_trace(text: str, selected_roles: list[str]):
-    # ----- 3-A. build masked version & encode original --------------
-    sel_ids = {tokenizer.convert_tokens_to_ids(t) for t in selected_roles}
 
-    # tokenised “plain” text
-    plain = tokenizer(text, return_tensors="pt").to("cuda")
-    ids_plain = plain.input_ids
 
-    # make masked string (regex to avoid partial hits)
-    masked_txt = text
-    for tok in selected_roles:
-        masked_txt = re.sub(re.escape(tok), MASK, masked_txt)
 
-    masked = tokenizer(masked_txt, return_tensors="pt").to("cuda")
-    ids_masked = masked.input_ids
 
-    # ----- 3-B. run model on masked text ----------------------------
-    with torch.no_grad():
-        logits = full_model(**masked).logits[0]   # (S, V)
-    preds = logits.argmax(-1)                     # (S,)
-
-    # ----- 3-C. gather stats per masked role ------------------------
-    found_tokens, correct = [], 0
-    role_flags = []
-    for i, (orig_id, pred_id) in enumerate(zip(ids_plain[0], preds)):
-        if orig_id.item() in sel_ids and ids_masked[0, i].item() == tokenizer.mask_token_id:
-            found_tokens.append(tokenizer.convert_ids_to_tokens([orig_id])[0])
-            correct += int(orig_id.item() == pred_id.item())
-            role_flags.append(i)
-
-    total = len(role_flags)
-    acc = correct / total if total else 0.0
-
-    # ----- 3-D. encoder rep pooling for *all* selected roles --------
-    with torch.no_grad():
-        # embeddings -> normed reps
-        x = emb_drop(emb_ln(embeddings(ids_plain)))
-        attn = full_model.bert.get_extended_attention_mask(
-            plain.attention_mask, x.shape[:-1]
-        )
-        enc = encoder(x, attention_mask=attn)     # (1,S,H)
-        mask_vec = torch.tensor(
-            [tid in sel_ids for tid in ids_plain[0].tolist()], device=enc.device
-        )
-        if mask_vec.any():
-            pooled = enc[0][mask_vec].mean(0)
-            norm = f"{pooled.norm().item():.4f}"
         else:
-            norm = "0.0000"
 
-    tokens_str = ", ".join(found_tokens) or "(none)"
-    return tokens_str, norm, f"{acc*100:.1f}%"
 
 # ------------------------------------------------------------------
-# 4. gradio UI ----------------------------------------------------
-def app():
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
-            "## 🧠 Symbolic Encoder Inspector \n"
-            "1. Model side: we *mask* every chosen role token, run the LM, and report how often it recovers the original. \n"
-            "2. Encoder side: we also pool hidden-state vectors for those roles and give their mean L2-norm."
         )
         with gr.Row():
             with gr.Column():
                 txt = gr.Textbox(
                     label="Input with Symbolic Tokens",
-                    lines=3,
                     placeholder="Example: A <subject> wearing <upper_body_clothing> …",
                 )
                 roles = gr.CheckboxGroup(
                     choices=SYMBOLIC_ROLES,
-                    value=SYMBOLIC_ROLES,   # <- all pre-selected
-                    label="Roles to mask & trace",
                 )
-                run = gr.Button("Run")
             with gr.Column():
-                o_tok = gr.Textbox(label="Masked-role tokens found")
-                o_norm = gr.Textbox(label="Mean hidden-state L2-norm")
-                o_acc = gr.Textbox(label="Recovery accuracy")
 
-        run.click(encode_and_trace, [txt, roles], [o_tok, o_norm, o_acc])
     return demo
 
 if __name__ == "__main__":
-    app().launch()
 
+# app.py – encoder-only demo for bert-beatrix-2048
+# ------------------------------------------------------------------
+# launch: python app.py
+# ------------------------------------------------------------------
 
 import json, re, sys
 from pathlib import Path, PurePosixPath
 
 import spaces
 import torch
 from huggingface_hub import snapshot_download
+
 from bert_handler import create_handler_from_checkpoint
 
+
 # ------------------------------------------------------------------
+# 0. Download & patch config.json --------------------------------
+# ------------------------------------------------------------------
+REPO_ID = "AbstractPhil/bert-beatrix-2048"
+LOCAL_DIR = "bert-beatrix-2048"
+
+snapshot_download(REPO_ID, revision="main",
+                  local_dir=LOCAL_DIR, local_dir_use_symlinks=False)
+
+cfg_path = Path(LOCAL_DIR) / "config.json"
+cfg = json.loads(cfg_path.read_text())
+
+auto_map, changed = cfg.get("auto_map", {}), False
+for k, v in auto_map.items():
     if "--" in v:
+        auto_map[k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
+        changed = True
+if changed:
+    cfg["auto_map"] = auto_map
+    cfg_path.write_text(json.dumps(cfg, indent=2))
+    print("🛠️ Patched config.json → auto_map now points at local modules")
 
+
+# ------------------------------------------------------------------
+# 1. Model / tokenizer -------------------------------------------
 # ------------------------------------------------------------------
+handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_DIR)
 full_model = full_model.eval().cuda()
 
+encoder = full_model.bert.encoder
+embeddings = full_model.bert.embeddings
+emb_ln = full_model.bert.emb_ln
+emb_drop = full_model.bert.emb_drop
 
 
 # ------------------------------------------------------------------
+# 2. Symbolic token set ------------------------------------------
+# ------------------------------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<surface>", "<lighting>", "<material>", "<accessory>", "<footwear>",
 
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
+
+missing = [t for t in SYMBOLIC_ROLES
+           if tokenizer.convert_tokens_to_ids(t) == tokenizer.unk_token_id]
+if missing:
+    sys.exit(f"❌ Tokenizer is missing {missing}")
+
 
 # ------------------------------------------------------------------
+# 3. Encoder + *mask-inference* util ------------------------------
+# ------------------------------------------------------------------
+MASK = tokenizer.mask_token or "[MASK]"
+
 @spaces.GPU
+def encode_and_trace(text: str, _ignored):   # all roles auto-selected
+    """
+    1. run encoder pass → cosine report (as before)
+    2. mask **every** symbolic token one-at-a-time
+       and ask the full model to predict it back.
+       Accuracy over those positions is returned.
+    """
+    if not text.strip():
+        return "(empty)", "0.0000", 0, "0 / 0 (0.0%)"
 
+    with torch.no_grad():
+        # -------- ENCODER PROBE (unchanged) ------------------
+        batch = tokenizer(text, return_tensors="pt").to("cuda")
+        ids, mask = batch.input_ids, batch.attention_mask
 
+        x = emb_drop(emb_ln(embeddings(ids)))
+        am = full_model.bert.get_extended_attention_mask(mask, x.shape[:-1])
+        enc = encoder(x, attention_mask=am)   # (1,S,H)
 
+        sel_ids = {tokenizer.convert_tokens_to_ids(t) for t in SYMBOLIC_ROLES}
+        flags = torch.tensor([tid in sel_ids for tid in ids[0].tolist()],
+                             device=enc.device)
 
+        found = [tokenizer.convert_ids_to_tokens([tid])[0]
+                 for tid in ids[0].tolist() if tid in sel_ids]
+        tokens_str = ", ".join(found) if found else "(none)"
+
+        if flags.any():
+            vec = enc[0][flags].mean(0)
+            norm = f"{vec.norm().item():.4f}"
         else:
+            norm = "0.0000"
+
+        # -------- MASK-AND-PREDICT ACCURACY ------------------
+        correct, total = 0, 0
+        for pos, tid in enumerate(ids[0].tolist()):
+            if tid in sel_ids:   # symbolic
+                total += 1
+                masked_ids = ids.clone()
+                masked_ids[0, pos] = tokenizer.mask_token_id
+                out = full_model(input_ids=masked_ids,
+                                 attention_mask=mask).logits   # (1,S,V)
+                pred = out[0, pos].argmax(-1).item()
+                if pred == tid:
+                    correct += 1
+
+    acc_str = f"{correct} / {total} ({(correct/total*100 if total else 0):.1f}%)"
+
+    return tokens_str, norm, len(found), acc_str
 
 
 # ------------------------------------------------------------------
+# 4. Gradio UI ----------------------------------------------------
+# ------------------------------------------------------------------
+def build_interface():
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
+            "## 🧠 Symbolic Encoder Inspector\n"
+            "Enter text containing the `<role>` tokens.\n"
+            "Cosine probe **and** real mask-prediction accuracy are shown."
         )
+
         with gr.Row():
             with gr.Column():
                 txt = gr.Textbox(
                     label="Input with Symbolic Tokens",
                     placeholder="Example: A <subject> wearing <upper_body_clothing> …",
+                    lines=3,
                 )
+                # checkbox group kept (pre-checked, disabled)
                 roles = gr.CheckboxGroup(
                     choices=SYMBOLIC_ROLES,
+                    label="(all roles auto-selected)",
+                    value=SYMBOLIC_ROLES,
+                    interactive=False,
                 )
+                btn = gr.Button("Run probe + MLM check")
             with gr.Column():
+                out_tok = gr.Textbox(label="Symbolic Tokens Found")
+                out_norm = gr.Textbox(label="Vector-norm (mean)")
+                out_cnt = gr.Textbox(label="Token Count")
+                out_acc = gr.Textbox(label="Mask-prediction accuracy")
+
+        btn.click(encode_and_trace,
+                  inputs=[txt, roles],
+                  outputs=[out_tok, out_norm, out_cnt, out_acc])
 
     return demo
 
+
 if __name__ == "__main__":
+    build_interface().launch()
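Note: the mask-and-predict accuracy introduced in this commit is the standard one-position-at-a-time MLM probe: clone the input ids, replace a single position with the mask token, and check whether the model's top-1 prediction restores the original id. The sketch below shows that loop in isolation; it is illustrative only, using a stock "bert-base-uncased" masked-LM from transformers as a stand-in for the project's bert_handler checkpoint (custom <role> tokens and CUDA placement omitted).

# Illustrative sketch (not part of the commit): same mask-and-predict loop,
# run against a stock Hugging Face masked-LM instead of bert-beatrix-2048.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
mlm = AutoModelForMaskedLM.from_pretrained("bert-base-uncased").eval()

def mask_and_predict_accuracy(text: str, target_ids: set[int]) -> tuple[int, int]:
    """Mask each target token one at a time; count exact top-1 recoveries."""
    batch = tok(text, return_tensors="pt")
    ids = batch.input_ids
    correct = total = 0
    with torch.no_grad():
        for pos, tid in enumerate(ids[0].tolist()):
            if tid not in target_ids:
                continue
            total += 1
            masked = ids.clone()
            masked[0, pos] = tok.mask_token_id        # hide only this position
            logits = mlm(input_ids=masked,
                         attention_mask=batch.attention_mask).logits
            correct += int(logits[0, pos].argmax(-1).item() == tid)
    return correct, total

# Example: how often does the LM recover "paris" when it is masked?
print(mask_and_predict_accuracy("paris is the capital of france.",
                                {tok.convert_tokens_to_ids("paris")}))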