# Previous version of this app, kept commented out for reference: load the base model and
# apply the LoRA adapter with PEFT. (Earlier iterations of this same code ran on CUDA and
# sampled with temperature=1.5, top_p=0.1; the last iteration below runs on CPU with
# near-greedy decoding.)
#
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
# from peft import PeftModel  # Make sure PEFT is installed
#
# # Base model and LoRA weights
# base_model_name = "unsloth/Llama-3.2-3B-Instruct"  # Base model from unsloth
# lora_model_name = "nicolaakmal/llama32-lora-finetuned-v3"
#
# # Load tokenizer and base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
#
# # Load the LoRA weights as a PeftModel
# model = PeftModel.from_pretrained(base_model, lora_model_name)
#
# # Set pad_token_id if it has not been set yet
# if model.config.pad_token_id is None:
#     model.config.pad_token_id = model.config.eos_token_id
#
# # Function to generate a response
# def generate_response(prompt):
#     # Tokenize the input manually, without apply_chat_template
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True)
#     input_ids = inputs["input_ids"].to("cpu")             # Run on CPU
#     attention_mask = inputs["attention_mask"].to("cpu")   # Attention mask for the input
#
#     # Stream the generated text
#     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
#
#     # Generation settings
#     outputs = model.generate(
#         input_ids=input_ids,
#         attention_mask=attention_mask,
#         streamer=text_streamer,
#         max_new_tokens=128,
#         use_cache=True,
#         temperature=0.01,  # Lock the answer to a consistent response
#         top_p=1.0,         # Do not restrict tokens by cumulative probability
#         top_k=1,           # Pick only the highest-probability token
#     )
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response
#
# # Gradio interface
# iface = gr.Interface(
#     fn=generate_response,
#     inputs="text",
#     outputs="text",
#     title="LLAMA-32 Fine-Tuned Model Demo",
#     description="Enter a prompt and the model will generate a response.",
# )
#
# iface.launch()
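# A minimal sketch (not part of the original app) of how the prompt could instead be built
# with the tokenizer's chat template, since Llama 3.2 Instruct checkpoints expect
# chat-formatted prompts. It assumes the `tokenizer` and `model` objects from the
# commented-out PEFT version above.
#
# messages = [{"role": "user", "content": prompt}]
# input_ids = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,  # append the assistant header so the model answers next
#     return_tensors="pt",
# )
# outputs = model.generate(input_ids=input_ids, max_new_tokens=128)
# # Decode only the newly generated tokens, skipping the prompt
# response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)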
from transformers import pipeline
import gradio as gr

# Load the pipeline once, outside of the function
pipe = pipeline("text-generation", model="nicolaakmal/llama32-lora-finetuned-v3-fp16")

def generate_response(inputs):
    messages = [
        {"role": "user", "content": inputs},
    ]

    # Generate the response using the pipeline (chat-style input)
    response = pipe(messages)
    if not response:
        return "No response generated"

    # With chat-format input, "generated_text" holds the full conversation
    # (a list of messages); the assistant's reply is the last entry.
    generated_text = response[0]["generated_text"]
    if isinstance(generated_text, list):
        generated_text = generated_text[-1]["content"]

    return generated_text.strip()  # Remove any leading/trailing whitespace

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Chatbot LLaMA Finetuned",
    description="A fine-tuned LLaMA model. Enter your questions or prompts below.",
    theme="default",  # Optional: name of a built-in Gradio theme
)

# Launch the Gradio app
iface.launch()
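# A minimal sketch (assumption, not used above): the generation controls from the earlier
# PEFT version can be passed directly to the pipeline call, and the fp16 checkpoint can be
# loaded in half precision on an available GPU via standard pipeline arguments.
#
# import torch
# pipe = pipeline(
#     "text-generation",
#     model="nicolaakmal/llama32-lora-finetuned-v3-fp16",
#     torch_dtype=torch.float16,  # load weights in fp16
#     device_map="auto",          # place the model on a GPU if one is available
# )
# response = pipe(messages, max_new_tokens=128, do_sample=False)  # greedy decoding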