import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Base model (for the tokenizer) and fine-tuned model identifiers
base_model_name = "unsloth/Llama-3.2-3B-Instruct"
finetuned_model_name = "nicolaakmal/llama32-lora-finetuned-v3"

# Use a GPU if one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(finetuned_model_name)
model = model.to(device)
model.eval()  # inference mode (unsloth's FastLanguageModel.for_inference is intended for models loaded via unsloth, not AutoModelForCausalLM)

# Generate a response for a user message
def generate_response(input_text):
    # Wrap the user input in the chat format expected by the model
    messages = [
        {"role": "user", "content": input_text}
    ]

    # Apply the model's chat template; return_dict=True yields input_ids and attention_mask
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(device)

    # Stream generated tokens to the console as they are produced
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response (settings chosen to keep answers consistent)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        streamer=text_streamer,
        max_new_tokens=128,
        temperature=0.01,
        top_p=1.0,
        top_k=1,
        use_cache=True
    )

    # Decode only the newly generated tokens, skipping the prompt
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response.strip()

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Chatbot LLaMA Finetuned"
)

# Launch the Gradio app
iface.launch()
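
# Alternative loading path (a minimal sketch, not executed by the app above): if the
# "nicolaakmal/llama32-lora-finetuned-v3" repo hosts only LoRA adapter weights rather
# than merged weights, the model can instead be assembled from the base checkpoint
# plus the adapter via PEFT, assuming the peft package is installed:
#
#     from peft import PeftModel
#
#     base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#     model = PeftModel.from_pretrained(base_model, finetuned_model_name).to(device)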