Spaces: Running on Zero
Commit · 593a8e7
1 Parent(s): 0226e6c
separate bitnet from generic pipeline
utils/models.py CHANGED (+26 -2)
@@ -11,7 +11,7 @@ from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     StoppingCriteria,
-
+    BitNetForCausalLM
 )
 from .prompts import format_rag_prompt
 from .shared import generation_interrupt
@@ -156,7 +156,14 @@ def run_inference(model_name, context, question):
 
     print("REACHED HERE BEFORE pipe")
     print(f"Loading model {model_name}...")
-    if "icecream" not in model_name.lower():
+    if "bitnet" in model_name.lower():
+        bitnet_model = BitNetForCausalLM.from_pretrained(
+            model_name,
+            device_map="cuda",
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+        )
+    elif "icecream" not in model_name.lower():
         pipe = pipeline(
             "text-generation",
             model=model_name,
@@ -226,7 +233,24 @@ def run_inference(model_name, context, question):
 
         generated_token_ids = output_sequences[0][prompt_tokens_length:]
         result = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
+    elif "bitnet" in model_name.lower():
+        formatted = tokenizer.apply_chat_template(
+            text_input,
+            tokenize=True,
+            return_tensors="pt",
+            return_dict=True,
+            **tokenizer_kwargs,
+        ).to(device)
+        with torch.inference_mode():
+            # Check interrupt before generation
+            if generation_interrupt.is_set():
+                return ""
+            output_sequences = bitnet_model.generate(
+                **formatted,
+                max_new_tokens=512,
+            )
 
+        result = tokenizer.decode(output_sequences[0][formatted['input_ids'].shape[-1]:], skip_special_tokens=True)
     else: # For other models
         formatted = pipe.tokenizer.apply_chat_template(
             text_input,