from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr
import spaces

# Chat-template markers (SeaLLM-style ChatML). The turn terminator is assumed
# to be "</s>"; adjust if the base model uses a different end-of-turn token.
TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>\n"
TURN_PREFIX = "<|im_start|>{role}\n"
start_token = "<|im_start|>"
end_token = "</s>"

zero = torch.Tensor([0]).cuda()
print(zero.device)  # prints 'cpu' outside a @spaces.GPU context 🤔

# Load the fine-tuned model and tokenizer
surMuy_model_id = "AingHongsin/SurMuy_v1_512512201"
model = AutoModelForCausalLM.from_pretrained(
    surMuy_model_id,
    device_map={'': 0},
    revision="main",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(surMuy_model_id)
model.eval()
model.to(zero.device)


def deFormat(data):
    # Split a decoded ChatML-style transcript into {'role', 'content'} turns.
    # A turn starts at start_token and ends at end_token when present;
    # otherwise (e.g. when skip_special_tokens stripped the terminator) the
    # turn ends at the next start_token or at the end of the string.
    turn_indices = []
    start_index = data.find(start_token)
    while start_index != -1:
        next_start = data.find(start_token, start_index + len(start_token))
        end_index = data.find(end_token, start_index)
        if end_index != -1 and (next_start == -1 or end_index < next_start):
            turn_indices.append((start_index, end_index + len(end_token)))
        elif next_start != -1:
            turn_indices.append((start_index, next_start))
        else:
            turn_indices.append((start_index, len(data)))
        start_index = next_start

    # Extract role and content from each turn ("<|im_start|>{role}\n{content}")
    turns = []
    for turn_start, turn_end in turn_indices:
        turn_data = data[turn_start:turn_end].strip()
        role_start = len(start_token)
        role_end = turn_data.find("\n", role_start)
        role = turn_data[role_start:role_end]
        content = turn_data[role_end + 1:]
        if content.endswith(end_token):
            content = content[:-len(end_token)]
        turns.append({'role': role, 'content': content.strip()})
    return turns


@spaces.GPU
def generate(text):
    device = zero.device
    messages = [
        {"role": "user", "content": text}
    ]
    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
    # print(tokenizer.convert_ids_to_tokens(encodeds[0]))
    model_inputs = encodeds.to(device)
    model.to(device)
    generated_ids = model.generate(model_inputs, max_new_tokens=512, do_sample=True, pad_token_id=tokenizer.pad_token_id)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    predict_answer = deFormat(decoded[0])
    return predict_answer


@spaces.GPU
def beam_search(model, start_token_id, beam_width=3, max_length=10):
    # Reference beam search over token ids, kept for illustration; the Gradio
    # handler below uses model.generate(num_beams=...) instead. Hypotheses are
    # (token_ids, cost) pairs, where cost is the cumulative negative log-prob.
    sequences = [([start_token_id], 0.0)]
    while len(sequences[0][0]) < max_length:
        all_candidates = []
        for seq, score in sequences:
            if seq[-1] == tokenizer.eos_token_id:  # finished hypotheses pass through
                all_candidates.append((seq, score))
                continue
            input_ids = torch.tensor([seq], device=model.device)
            with torch.no_grad():
                logits = model(input_ids).logits[0, -1]
            log_probs = torch.log_softmax(logits, dim=-1)
            # Only the beam_width best continuations of each hypothesis can
            # survive the next pruning step, so expanding more is wasted work.
            top_log_probs, top_ids = log_probs.topk(beam_width)
            for log_p, token_id in zip(top_log_probs.tolist(), top_ids.tolist()):
                all_candidates.append((seq + [token_id], score - log_p))
        # Order all candidates by score and keep the k best
        sequences = sorted(all_candidates, key=lambda tup: tup[1])[:beam_width]
    return sequences
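# Hypothetical sanity check for the toy beam search above (commented out so it
# does not run at import time): start from the BOS token id and decode the
# lowest-cost hypothesis. beam_search returns (token_ids, cost) pairs sorted
# by cost, so index 0 is the current best.
# hypotheses = beam_search(model, tokenizer.bos_token_id, beam_width=3, max_length=10)
# best_ids, best_cost = hypotheses[0]
# print(tokenizer.decode(best_ids), best_cost)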
@spaces.GPU
def beam_search_generate(text, beam_width=8, max_length=512):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    messages = [
        {
            "role": "user",
            "content": text
        }
    ]
    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
    model_inputs = encodeds.to(device)
    model.to(device)
    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_length,
        num_beams=beam_width,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    predict_object = deFormat(decoded[0])
    # Turn 0 is the user prompt; turn 1 is the assistant reply.
    answer = predict_object[1]['content']
    messages.append(
        {
            "role": "assistant",
            "content": answer
        }
    )
    return answer


def yes_man(message, history):
    return beam_search_generate(message)


gr.ChatInterface(
    yes_man,
    chatbot=gr.Chatbot(height=650),
    textbox=gr.Textbox(placeholder="Write your message here", container=False, scale=7),
    # slider=gr.Slider(minimum=6, maximum=8, step=1, label="Beam Width"),
    title="Sur Muy",
    description="I am your assistant",
    # examples=["Hello", "Am I cool?", "Are tomatoes vegetables?"],
    # cache_examples=True,  # only valid when examples are provided
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch()
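# Minimal local smoke test (hypothetical; run from a REPL rather than here,
# since .launch() above blocks): exercises the full chat-template ->
# generate -> deFormat path without going through the Gradio UI.
# print(beam_search_generate("Hello! Who are you?", beam_width=4, max_length=128))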