import torch
import gradio as gr
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the QLoRA adapter config, the base model it was trained on, and the tokenizer.
peft_model_id = "kimmeoungjun/qlora-koalpaca2"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)


def generate(q):
    # Wrap the user question in the KoAlpaca prompt format
    # ("### 질문:" = question, "### 답변:" = answer).
    prompt = f"### 질문: {q}\n\n### 답변:"
    inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False)
    outputs = model.generate(
        **{k: v.to(device) for k, v in inputs.items()},
        max_new_tokens=256,
        do_sample=True,
        eos_token_id=2,  # stop at the end-of-sequence token
    )
    # Decode the full sequence and keep only the text after the answer marker.
    result = tokenizer.decode(outputs[0])
    marker = "### 답변:"
    answer = result[result.find(marker) + len(marker):].strip()
    return answer


gr.Interface(generate, "text", "text").launch(share=True)
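
# Optional: since "qlora-koalpaca2" is a QLoRA adapter, the base model can also be
# loaded in 4-bit to reduce GPU memory use. This is a minimal sketch, assuming the
# bitsandbytes package is installed and a CUDA GPU is available; it is not part of
# the original script.
#
# from transformers import BitsAndBytesConfig
#
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
# base = AutoModelForCausalLM.from_pretrained(
#     config.base_model_name_or_path,
#     quantization_config=bnb_config,
#     device_map="auto",
# )
# # With device_map="auto" the model is already placed on the GPU, so no .to(device).
# model_4bit = PeftModel.from_pretrained(base, peft_model_id)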