# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # Make sure PEFT is installed

# # Define the base model and the LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # Base model as shown in the image
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"

# # Load the tokenizer and the base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# # Load the LoRA adapter as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)

# # Function to generate a response
# def generate_response(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
#     outputs = model.generate(
#         input_ids=inputs["input_ids"],
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=1.5,
#         top_p=0.1
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response

# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response."
# )

# iface.launch()
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # Make sure PEFT is installed

# # Define the base model and the LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # Base model from unsloth
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"

# # Load the tokenizer and the base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# # Load the LoRA adapter as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)

# # Set pad_token_id if it is not already set
# if model.config.pad_token_id is None:
#     model.config.pad_token_id = model.config.eos_token_id

# # Function to generate a response
# def generate_response(prompt):
#     # Tokenize the input and keep the attention_mask
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
#     input_ids = inputs["input_ids"].to("cpu")
#     attention_mask = inputs["attention_mask"].to("cpu")

#     # Stream the generated text
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)

#     # Generate using the attention_mask and pad_token_id
#     outputs = model.generate(
#         input_ids=input_ids,
#         attention_mask=attention_mask,
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=1.5,
#         top_p=0.1
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response

# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response."
# )

# iface.launch()
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # Make sure PEFT is installed

# # Define the base model and the LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # Base model from unsloth
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"

# # Load the tokenizer and the base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# # Load the LoRA adapter as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)

# # Set pad_token_id if it is not already set
# if model.config.pad_token_id is None:
#     model.config.pad_token_id = model.config.eos_token_id

# # Function to generate a response
# def generate_response(prompt):
#     # Tokenize the input manually, without apply_chat_template
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
#     input_ids = inputs["input_ids"].to("cpu")            # Run on CPU
#     attention_mask = inputs["attention_mask"].to("cpu")  # Attention mask for the input

#     # Stream the generated text
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)

#     # Generation config following the reference
#     outputs = model.generate(
#         input_ids=input_ids,
#         attention_mask=attention_mask,
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=0.01,  # Keep answers locked to a consistent response
#         top_p=1.0,         # Do not restrict tokens by cumulative probability
#         top_k=1            # Pick only the highest-probability token
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response

# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response."
# )

# iface.launch()
from transformers import pipeline
import gradio as gr

# Load the fine-tuned model once at startup instead of rebuilding the pipeline on every request
pipe = pipeline("text-generation", model="nicolaakmal/llama32-lora-finetuned-v3-fp16")

def generate_response(inputs):
    messages = [
        {"role": "user", "content": inputs},
    ]
    # The chat-style pipeline returns the whole conversation; keep only the assistant's reply
    result = pipe(messages)
    return result[0]["generated_text"][-1]["content"]
# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Chatbot LLaMA Finetuned"
)

# Launch the Gradio app
iface.launch()
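
# A minimal sketch (my assumption, not part of the original app): the generation settings used
# in the commented-out versions above (max_new_tokens, greedy decoding) can also be forwarded
# through the pipeline call. generate_response_greedy is a hypothetical helper name.
#
# def generate_response_greedy(inputs):
#     messages = [{"role": "user", "content": inputs}]
#     result = pipe(messages, max_new_tokens=128, do_sample=False)
#     return result[0]["generated_text"][-1]["content"]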