oliver-aizip committed
Commit c0fdd5a · 1 Parent(s): e2b5d99

remove bitnet handling completely

Files changed (1)
  utils/models.py  +2 -46
utils/models.py CHANGED
@@ -11,7 +11,6 @@ from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     StoppingCriteria,
-    BitNetForCausalLM
 )
 from .prompts import format_rag_prompt
 from .shared import generation_interrupt
@@ -156,25 +155,7 @@ def run_inference(model_name, context, question):
 
     print("REACHED HERE BEFORE pipe")
     print(f"Loading model {model_name}...")
-    if "bitnet" in model_name.lower():
-        bitnet_model = BitNetForCausalLM.from_pretrained(
-            model_name,
-            #device_map="auto",
-            torch_dtype=torch.bfloat16,
-            #trust_remote_code=True,
-        )
-        pipe = pipeline(
-            "text-generation",
-            model=bitnet_model,
-            tokenizer=tokenizer,
-            #device_map="auto",
-            #trust_remote_code=True,
-            torch_dtype=torch.bfloat16,
-            model_kwargs={
-                "attn_implementation": "eager",
-            },
-        )
-    elif "icecream" not in model_name.lower():
+    if "icecream" not in model_name.lower():
         pipe = pipeline(
             "text-generation",
             model=model_name,
@@ -221,12 +202,8 @@ def run_inference(model_name, context, question):
             **tokenizer_kwargs,
         )
 
-
         model_inputs = model_inputs.to(model.device)
-
         input_ids = model_inputs.input_ids
-        attention_mask = model_inputs.attention_mask
-
         prompt_tokens_length = input_ids.shape[1]
 
         with torch.inference_mode():
@@ -235,33 +212,12 @@ def run_inference(model_name, context, question):
                 return ""
 
             output_sequences = model.generate(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
+                **model_inputs,
                 max_new_tokens=512,
-                eos_token_id=tokenizer.eos_token_id,
-                pad_token_id=tokenizer.pad_token_id # Addresses the warning
             )
 
             generated_token_ids = output_sequences[0][prompt_tokens_length:]
             result = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
-    # elif "bitnet" in model_name.lower():
-    #     formatted = tokenizer.apply_chat_template(
-    #         text_input,
-    #         tokenize=True,
-    #         return_tensors="pt",
-    #         return_dict=True,
-    #         **tokenizer_kwargs,
-    #     ).to(bitnet_model.device)
-    #     with torch.inference_mode():
-    #         # Check interrupt before generation
-    #         if generation_interrupt.is_set():
-    #             return ""
-    #         output_sequences = bitnet_model.generate(
-    #             **formatted,
-    #             max_new_tokens=512,
-    #         )
-
-    #     result = tokenizer.decode(output_sequences[0][formatted['input_ids'].shape[-1]:], skip_special_tokens=True)
     else: # For other models
         formatted = pipe.tokenizer.apply_chat_template(
             text_input,
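
For reference, a minimal sketch of the direct-generation path this commit keeps (the bitnet branch is gone, and generate() now receives **model_inputs instead of separate input_ids/attention_mask arguments). The checkpoint id and chat message below are placeholders, not from the repo, and the repo's own prompt formatting (format_rag_prompt) and interrupt handling are omitted.

# Minimal sketch, assuming any Hugging Face chat model stands in for the
# repo's checkpoints; model id and message are illustrative placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "Answer using the given context: ..."}]
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

prompt_tokens_length = model_inputs.input_ids.shape[1]

with torch.inference_mode():
    # **model_inputs forwards input_ids and attention_mask together, which is
    # what the diff switches to instead of passing them as named arguments.
    output_sequences = model.generate(**model_inputs, max_new_tokens=512)

result = tokenizer.decode(
    output_sequences[0][prompt_tokens_length:], skip_special_tokens=True
)
print(result)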