# Previous version of this app, kept commented out for reference: load the base model and
# apply the LoRA adapter with PEFT. (Earlier iterations of this same code ran on CUDA and
# sampled with temperature=1.5, top_p=0.1; the last iteration below runs on CPU with
# near-greedy decoding.)
#
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # Make sure PEFT is installed
#
# # Base model and LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # Base model from unsloth
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"
#
# # Load tokenizer and base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#
# # Load the LoRA weights as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)
#
# # Set pad_token_id if it has not been set yet
# if model.config.pad_token_id is None:
#     model.config.pad_token_id = model.config.eos_token_id
#
# # Function to generate a response
# def generate_response(prompt):
#     # Tokenize the input manually, without apply_chat_template
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
#     input_ids = inputs["input_ids"].to("cpu")             # Run on CPU
#     attention_mask = inputs["attention_mask"].to("cpu")   # Attention mask for the input
#
#     # Stream the generated text
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
#
#     # Generation settings
#     outputs = model.generate(
#         input_ids=input_ids,
#         attention_mask=attention_mask,
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=0.01,  # Lock the answer to a consistent response
#         top_p=1.0,         # Do not restrict tokens by cumulative probability
#         top_k=1,           # Pick only the highest-probability token
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response
#
# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response.",
# )
#
# iface.launch()
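# A minimal sketch (not part of the original app) of how the prompt could instead be built
# with the tokenizer's chat template, since Llama 3.2 Instruct checkpoints expect
# chat-formatted prompts. It assumes the `tokenizer` and `model` objects from the
# commented-out PEFT version above.
#
# messages = [{"role": "user", "content": prompt}]
# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,  # append the assistant header so the model answers next
#     return_tensors="pt",
# )
# outputs = model.generate(input_ids=input_ids, max_new_tokens=128)
# # Decode only the newly generated tokens, skipping the prompt
# response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)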
from transformers import pipeline
import gradio as gr

# Load the pipeline once, outside of the function
pipe = pipeline("text-generation", model="nicolaakmal/llama32-lora-finetuned-v3-fp16")

def generate_response(inputs):
    messages = [
        {"role": "user", "content": inputs},
    ]

    # Generate the response using the pipeline (chat-style input)
    response = pipe(messages)
    if not response:
        return "No response generated"

    # With chat-format input, "generated_text" holds the full conversation
    # (a list of messages); the assistant's reply is the last entry.
    generated_text = response[0]["generated_text"]
    if isinstance(generated_text, list):
        generated_text = generated_text[-1]["content"]

    return generated_text.strip()  # Remove any leading/trailing whitespace

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Chatbot LLaMA Finetuned",
    description="A fine-tuned LLaMA model. Enter your questions or prompts below.",
    theme="default",  # Optional: name of a built-in Gradio theme
)

# Launch the Gradio app
iface.launch()
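# A minimal sketch (assumption, not used above): the generation controls from the earlier
# PEFT version can be passed directly to the pipeline call, and the fp16 checkpoint can be
# loaded in half precision on an available GPU via standard pipeline arguments.
#
# import torch
# pipe = pipeline(
#     "text-generation",
#     model="nicolaakmal/llama32-lora-finetuned-v3-fp16",
#     torch_dtype=torch.float16,  # load weights in fp16
#     device_map="auto",          # place the model on a GPU if one is available
# )
# response = pipe(messages, max_new_tokens=128, do_sample=False)  # greedy decoding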