# AIML_QA_Demo / app.py
import gradio as gr
from transformers import AutoTokenizer, pipeline
from peft import AutoPeftModelForCausalLM

MODEL_ID = "GSridhar1982/AIML_QA_Llama31_FineTuned_UsingLora"  # the model used for training

max_seq_length = 2048  # maximum sequence length; RoPE scaling is handled automatically (not used below)
dtype = None           # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+ (not used below)
load_in_4bit = True    # use 4-bit quantization to reduce memory usage; can be False
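
# A minimal sketch (not used by this app) of the equivalent explicit 4-bit setup via
# transformers' BitsAndBytesConfig, assuming the bitsandbytes package is installed;
# such a config can be passed as quantization_config= to from_pretrained() instead of
# the load_in_4bit flag above. The values shown are illustrative assumptions.
#
# import torch
# from transformers import BitsAndBytesConfig
#
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,                      # quantize weights to 4-bit on load
#     bnb_4bit_compute_dtype=torch.float16,   # compute dtype for the 4-bit matmuls
# )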

# Alternative inference path (kept for reference): load the GGUF-quantized model
# with llama-cpp-python instead of transformers + PEFT.
#
# from llama_cpp import Llama
#
# llm = Llama.from_pretrained(
#     repo_id="GSridhar1982/QA_Llama31_Quantized_GGUF",
#     filename="QA_llama31_unsloth.Q4_K_M.gguf",
# )
#
# def generate_response(user_input):
#     # Perform inference
#     response = llm.create_chat_completion(
#         messages=[{"role": "user", "content": user_input}]
#     )
#     # Extract the model's reply
#     model_reply = response['choices'][0]['message']['content']
#     return model_reply

# Load the fine-tuned LoRA model and tokenizer once at startup so they are not
# reloaded on every request.
model = AutoPeftModelForCausalLM.from_pretrained(
    MODEL_ID,
    load_in_4bit=load_in_4bit,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Create a text-generation pipeline around the loaded model
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

def generate_answer(user_input):
    # Run the pipeline on the user's question and return the generated text
    predictions = generator(user_input)[0]['generated_text']
    return predictions
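
# A hedged sketch of how decoding could be tuned if needed. These are standard
# text-generation pipeline keyword arguments (forwarded to generate()); the
# values are illustrative assumptions, not settings used by this app.
#
# predictions = generator(
#     user_input,
#     max_new_tokens=256,   # cap the length of the generated answer
#     do_sample=True,       # sample instead of greedy decoding
#     temperature=0.7,      # softer distribution for more varied answers
# )[0]['generated_text']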
# Create a Gradio interface
iface = gr.Interface(
    fn=generate_answer,
    inputs="textbox",
    outputs="text",
    title="AIML Q&A Chatbot",
    description="Ask questions related to AIML and get answers from the fine-tuned Llama model.",
)
# Launch the app
iface.launch()
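
# Optional launch settings (illustrative assumptions, not used here): Gradio can
# also create a temporary public share link or bind to a specific host/port.
#
# iface.launch(share=True, server_name="0.0.0.0", server_port=7860)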