import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import spaces
import os
import json
from huggingface_hub import login
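# Note: beyond the imports above, this Space is assumed to have `accelerate`
# installed (needed for device_map="auto") and the `spaces` package, which
# Hugging Face provides on ZeroGPU hardware. A minimal requirements.txt sketch:
#   gradio
#   torch
#   transformers
#   accelerate
#   huggingface_hub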

# Hugging Face authentication
# Space secrets are exposed as plain environment variables, so the secret named
# HF_TOKEN is read with os.getenv("HF_TOKEN") (not "Secrets.HF_TOKEN").
HF_TOKEN = os.getenv("HF_TOKEN")
try:
    if HF_TOKEN:
        login(token=HF_TOKEN)
except Exception as e:
    print(f"Error logging in to Hugging Face: {str(e)}")

# File to store model links
MODEL_FILE = "model_links.txt"


def load_model_links():
    """Load model links from file"""
    if not os.path.exists(MODEL_FILE):
        # Create default file with some example models
        with open(MODEL_FILE, "w") as f:
            f.write("meta-llama/Llama-2-7b-chat-hf\n")
    with open(MODEL_FILE, "r") as f:
        return [line.strip() for line in f.readlines() if line.strip()]
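
# model_links.txt is assumed to hold one Hub repo id per line, e.g.
# (illustrative contents only, not shipped with the Space):
#   meta-llama/Llama-2-7b-chat-hf
#   mistralai/Mistral-7B-Instruct-v0.2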


class ModelManager:
    def __init__(self):
        self.current_model = None
        self.current_tokenizer = None
        self.current_model_name = None
        # Don't initialize CUDA in __init__
        self.device = None

    def load_model(self, model_name):
        """Load model and free previous model's memory"""
        if self.current_model is not None:
            # Drop the references (rather than del) so the attributes stay
            # defined even if the next load fails, and the old weights can
            # be garbage collected.
            self.current_model = None
            self.current_tokenizer = None
            torch.cuda.empty_cache()
        try:
            self.current_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.current_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto"  # Let accelerate decide device placement
            )
            self.current_model_name = model_name
            return f"Successfully loaded model: {model_name}"
        except Exception as e:
            return f"Error loading model: {str(e)}"
    def generate(self, prompt):
        """Tokenize the prompt and move tensors to the model's device"""
        inputs = self.current_tokenizer(prompt, return_tensors="pt")
        # With device_map="auto" the embedding layer may live on GPU, so the
        # input tensors are moved to the model's primary device explicitly.
        return inputs.to(self.current_model.device)


# Initialize model manager
model_manager = ModelManager()

# Default system message for JSON output
default_system_message = """You are a helpful AI assistant. You must ALWAYS return your response in valid JSON format.
Each response should be formatted as follows:
{
    "response": {
        "main_answer": "Your primary response here",
        "additional_details": "Any additional information or context",
        "confidence": 0.0 to 1.0,
        "tags": ["relevant", "tags", "here"]
    },
    "metadata": {
        "response_type": "type of response",
        "source": "basis of response if applicable"
    }
}
Ensure EVERY response strictly follows this JSON structure."""


# This decorator handles the GPU allocation on ZeroGPU Spaces
@spaces.GPU
def generate_response(model_name, system_instruction, user_input):
    """Generate response with GPU support and JSON formatting"""
    if model_manager.current_model_name != model_name:
        return json.dumps({"error": "Please load the model first using the 'Load Selected Model' button."}, indent=2)
    if model_manager.current_model is None:
        return json.dumps({"error": "No model loaded. Please load a model first."}, indent=2)

    prompt = f"""### Instruction:
{system_instruction}
Remember to ALWAYS format your response as valid JSON.
### Input:
{user_input}
### Response:
{{"""
    try:
        inputs = model_manager.generate(prompt)
        # Greedy decoding (do_sample=False), so no temperature is set
        meta_config = {
            "do_sample": False,
            "max_new_tokens": 512,
            "repetition_penalty": 1.1,
            "use_cache": True,
            "pad_token_id": model_manager.current_tokenizer.eos_token_id,
            "eos_token_id": model_manager.current_tokenizer.eos_token_id
        }
        generation_config = GenerationConfig(**meta_config)

        with torch.no_grad():
            outputs = model_manager.current_model.generate(
                **inputs,
                generation_config=generation_config
            )

        decoded_output = model_manager.current_tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True
        )[0]

        # Keep only the text after the response marker
        assistant_response = decoded_output.split("### Response:")[-1].strip()

        try:
            # Trim anything after the final closing brace before parsing
            last_brace = assistant_response.rindex('}')
            assistant_response = assistant_response[:last_brace + 1]
            json_response = json.loads(assistant_response)
            return json.dumps(json_response, indent=2)
        except (json.JSONDecodeError, ValueError):
            return json.dumps({
                "error": "Failed to generate valid JSON",
                "raw_response": assistant_response
            }, indent=2)
    except Exception as e:
        return json.dumps({
            "error": f"Error generating response: {str(e)}",
            "details": "An unexpected error occurred during generation"
        }, indent=2)
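
# A minimal local sanity check (hypothetical model id and question, for
# illustration only; on the Space this flow is driven by the Gradio UI):
#   print(model_manager.load_model("meta-llama/Llama-2-7b-chat-hf"))
#   print(generate_response("meta-llama/Llama-2-7b-chat-hf",
#                           default_system_message,
#                           "What is the capital of France?"))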

# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Chat Interface with Model Selection (JSON Output)")

    with gr.Row():
        # Left column for inputs
        with gr.Column():
            model_dropdown = gr.Dropdown(
                choices=load_model_links(),
                label="Select Model",
                info="Choose a model from the list"
            )
            load_button = gr.Button("Load Selected Model")
            model_status = gr.Textbox(label="Model Status")
            system_instruction = gr.Textbox(
                value=default_system_message,
                placeholder="Enter system instruction here...",
                label="System Instruction",
                lines=3
            )
            user_input = gr.Textbox(
                placeholder="Type your message here...",
                label="Your Message",
                lines=3
            )
            submit_btn = gr.Button("Submit")

        # Right column for bot response
        with gr.Column():
            response_display = gr.Textbox(
                label="Bot Response (JSON)",
                interactive=False,
                placeholder="Response will appear here in JSON format.",
                lines=10
            )
    # Event handlers
    load_button.click(
        fn=model_manager.load_model,
        inputs=[model_dropdown],
        outputs=[model_status]
    )
    submit_btn.click(
        fn=generate_response,
        inputs=[model_dropdown, system_instruction, user_input],
        outputs=[response_display]
    )

# Launch the app
demo.launch()