import gradio as gr
import huggingface_hub
import os
import spaces
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration
from datasets import load_dataset
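
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable (Space secret).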
huggingface_hub.login(os.getenv('HF_TOKEN'))
#peft_model_id = "debisoft/DeepSeek-R1-Distill-Qwen-7B-thinking-function_calling-quant-V0"
#peft_model_id = "debisoft/Qwen2.5-VL-7B-Instruct-thinking-function_calling-quant-V0"
peft_model_id = "debisoft/Qwen2.5-VL-3B-Instruct-thinking-function_calling-V0"
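
# Load the base model in 4-bit NF4 via bitsandbytes to keep GPU memory usage low.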
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
device = "auto"  # unused; placement is handled by device_map="auto" below
cuda_device = torch.device("cuda")
cpu_device = torch.device("cpu")
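
# The PEFT adapter config records which base model the adapter was trained on.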
config = PeftConfig.from_pretrained(peft_model_id)
#model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
)
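
# The tokenizer ships with the adapter repo; resize embeddings in case fine-tuning added tokens.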
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model.resize_token_embeddings(len(tokenizer))
#tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
#model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
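
# spaces.GPU allocates a GPU for the duration of this call on a ZeroGPU Space.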
@spaces.GPU
def sentience_check():
    # Attach the PEFT adapter to the quantized base model on the GPU.
    peft_model = PeftModel.from_pretrained(
        model, peft_model_id, device_map="cuda",
        #offload_folder="offload/",
    )
    #peft_model.to(torch.bfloat16)
    peft_model.eval()
    #peft_model.to(cuda_device)
    inputs = tokenizer("Are you sentient?", return_tensors="pt").to(cuda_device)
    with torch.no_grad():
        outputs = peft_model.generate(
            **inputs, max_new_tokens=1024, pad_token_id=tokenizer.eos_token_id
        )
    #peft_model.to(cpu_device)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
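
# Gradio UI with no inputs: triggering it calls sentience_check and shows the decoded reply.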
demo = gr.Interface(fn=sentience_check, inputs=None, outputs=gr.Text())
demo.launch()