# --- Earlier version 1 (kept for reference): LoRA adapter via PEFT, generation on GPU ---
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # make sure PEFT is installed
#
# # Specify the base model and the LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # base model as shown in the image
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"
#
# # Load the tokenizer and the base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#
# # Load the LoRA weights as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)
#
# # Function to generate a response
# def generate_response(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
#     outputs = model.generate(
#         input_ids=inputs["input_ids"],
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=1.5,
#         top_p=0.1
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response
#
# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response."
# )
# iface.launch()
# --- Earlier version 2 (kept for reference): CPU generation with attention_mask and pad_token_id ---
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # make sure PEFT is installed
#
# # Specify the base model and the LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # base model from unsloth
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"
#
# # Load the tokenizer and the base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#
# # Load the LoRA weights as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)
#
# # Set pad_token_id if it has not been set yet
# if model.config.pad_token_id is None:
#     model.config.pad_token_id = model.config.eos_token_id
#
# # Function to generate a response
# def generate_response(prompt):
#     # Tokenize the input, including the attention_mask
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
#     input_ids = inputs["input_ids"].to("cpu")
#     attention_mask = inputs["attention_mask"].to("cpu")
#     # Stream the generated text
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
#     # Generate using the attention_mask and pad_token_id
#     outputs = model.generate(
#         input_ids=input_ids,
#         attention_mask=attention_mask,
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=1.5,
#         top_p=0.1
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response
#
# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response."
# )
# iface.launch()
# --- Earlier version 3 (kept for reference): near-greedy decoding (temperature=0.01, top_k=1) ---
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # make sure PEFT is installed
#
# # Specify the base model and the LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # base model from unsloth
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"
#
# # Load the tokenizer and the base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#
# # Load the LoRA weights as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)
#
# # Set pad_token_id if it has not been set yet
# if model.config.pad_token_id is None:
#     model.config.pad_token_id = model.config.eos_token_id
#
# # Function to generate a response
# def generate_response(prompt):
#     # Tokenize the input manually, without apply_chat_template
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
#     input_ids = inputs["input_ids"].to("cpu")  # run on CPU
#     attention_mask = inputs["attention_mask"].to("cpu")  # attention mask for the input
#     # Stream the generated text
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
#     # Generation settings following the reference
#     outputs = model.generate(
#         input_ids=input_ids,
#         attention_mask=attention_mask,
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=0.01,  # keep the answer as consistent as possible
#         top_p=1.0,         # no cumulative-probability cutoff
#         top_k=1            # pick only the highest-probability token
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response
#
# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response."
# )
# iface.launch()
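# The active app below loads a merged fp16 checkpoint instead of the base model plus a
# separate adapter. A minimal, hypothetical sketch of how such a checkpoint could be
# produced from the LoRA adapter above (the local output directory name is assumed; left
# commented out so it never runs as part of the app):
#
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModel
#
# base = AutoModelForCausalLM.from_pretrained(
#     "unsloth/Llama-3.2-3B-Instruct", torch_dtype=torch.float16
# )
# tok = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-3B-Instruct")
# merged = PeftModel.from_pretrained(base, "nicolaakmal/llama32-lora-finetuned-v3").merge_and_unload()
# merged.save_pretrained("llama32-lora-finetuned-v3-fp16")  # assumed local output directory
# tok.save_pretrained("llama32-lora-finetuned-v3-fp16")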
from transformers import pipeline
import gradio as gr

# Current version: load the merged fine-tuned model once at startup
# (re-creating the pipeline inside the handler would reload the model on every request).
pipe = pipeline("text-generation", model="nicolaakmal/llama32-lora-finetuned-v3-fp16")

def generate_response(inputs):
    messages = [
        {"role": "user", "content": inputs},
    ]
    outputs = pipe(messages, max_new_tokens=128)
    # The pipeline returns the whole conversation; return only the assistant's reply.
    return outputs[0]["generated_text"][-1]["content"]

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Chatbot LLaMA Finetuned"
)

# Run the Gradio app
iface.launch()
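
# Optional quick check from a separate process once the app is running locally
# (a sketch using gradio_client; "/predict" is gr.Interface's default endpoint and
# 7860 is Gradio's default port):
#
# from gradio_client import Client
# client = Client("http://127.0.0.1:7860")
# print(client.predict("Hello, who are you?", api_name="/predict"))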