import os
import json
import subprocess
from threading import Thread

import gradio as gr
import spaces
import torch
from huggingface_hub import InferenceClient
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# flash-attn is installed at runtime on the Space; skip its CUDA build step.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

token = os.getenv('token')
print('token = ', token)

# Models tried during development: mistralai/Mistral-7B-v0.3,
# microsoft/Phi-3-medium-4k-instruct, microsoft/phi-4, Qwen/Qwen2-7B-Instruct.
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
CHAT_TEMPLATE = "Auto"
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = 16000

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the checkpoint in 4-bit (bitsandbytes) so the 14B model fits on a single GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
)


def str_to_json(str_obj):
    """Parse a JSON string into a Python object."""
    return json.loads(str_obj)


@spaces.GPU(duration=60)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    stop_tokens = ["<|endoftext|>", "<|im_end|>"]
    # Build a ChatML-style prompt from the system message, the chat history,
    # and the new user message.
    instruction = '<|im_start|>system\n' + system_message + '\n<|im_end|>\n'
    for user, assistant in history:
        instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
    instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
    print(instruction)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
    input_ids, attention_mask = enc.input_ids, enc.attention_mask

    # Keep only the most recent CONTEXT_LENGTH tokens so the prompt fits the context window.
    if input_ids.shape[1] > CONTEXT_LENGTH:
        input_ids = input_ids[:, -CONTEXT_LENGTH:]
        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

    generate_kwargs = dict(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_tokens,
        top_k=40,
        repetition_penalty=1.1,
        top_p=top_p,
    )
    # Run generation in a background thread and stream partial output back to the UI.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        if new_token in stop_tokens:
            break
        yield "".join(outputs)
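
# Minimal smoke test for respond() (an illustrative sketch, not part of the deployed app):
# the generator yields progressively longer strings, so printing each value shows the
# response as it streams, e.g.
#
#   for partial in respond("Hello!", [], "You are a friendly Chatbot.", 64, 0.7, 0.95):
#       print(partial)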

"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
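
# The running app can also be queried programmatically with the official gradio_client
# package. A hedged sketch (the URL and the "/chat" endpoint name are assumptions that
# depend on how and where this Space is deployed):
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   reply = client.predict(
#       "Write a haiku about GPUs.",    # message
#       "You are a friendly Chatbot.",  # system message
#       512,                            # max new tokens
#       0.7,                            # temperature
#       0.95,                           # top-p (nucleus sampling)
#       api_name="/chat",
#   )
#   print(reply)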