import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import spaces
import os
import json
from huggingface_hub import login
# Hugging Face authentication
# Space secrets are exposed as plain environment variables, so the key is just "HF_TOKEN"
HF_TOKEN = os.getenv("HF_TOKEN")
try:
    login(token=HF_TOKEN)
except Exception as e:
    print(f"Error logging in to Hugging Face: {str(e)}")
# File to store model links (one repo ID per line)
MODEL_FILE = "model_links.txt"

def load_model_links():
    """Load model links from file"""
    if not os.path.exists(MODEL_FILE):
        # Create a default file with an example model
        with open(MODEL_FILE, "w") as f:
            f.write("meta-llama/Llama-2-7b-chat-hf\n")
    with open(MODEL_FILE, "r") as f:
        return [line.strip() for line in f.readlines() if line.strip()]
class ModelManager:
    """Keeps at most one model/tokenizer pair in memory at a time."""

    def __init__(self):
        self.current_model = None
        self.current_tokenizer = None
        self.current_model_name = None
        # Don't initialize CUDA in __init__; the GPU is only attached inside @spaces.GPU calls
        self.device = None

    def load_model(self, model_name):
        """Load model and free previous model's memory"""
        if self.current_model is not None:
            del self.current_model
            del self.current_tokenizer
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        try:
            self.current_tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.current_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto"  # Let accelerate decide the device mapping
            )
            self.current_model_name = model_name
            return f"Successfully loaded model: {model_name}"
        except Exception as e:
            return f"Error loading model: {str(e)}"
    def generate(self, prompt):
        """Tokenize the prompt and place it on the model's device"""
        inputs = self.current_tokenizer(prompt, return_tensors="pt")
        # Inputs are not moved automatically, so put them on the model's first device
        return inputs.to(self.current_model.device)
# Initialize model manager
model_manager = ModelManager()
# Default system message for JSON output
default_system_message = """You are a helpful AI assistant. You must ALWAYS return your response in valid JSON format.
Each response should be formatted as follows:
{
"response": {
"main_answer": "Your primary response here",
"additional_details": "Any additional information or context",
"confidence": 0.0 to 1.0,
"tags": ["relevant", "tags", "here"]
},
"metadata": {
"response_type": "type of response",
"source": "basis of response if applicable"
}
}
Ensure EVERY response strictly follows this JSON structure."""
@spaces.GPU  # This decorator handles the GPU allocation
def generate_response(model_name, system_instruction, user_input):
    """Generate response with GPU support and JSON formatting"""
    if model_manager.current_model_name != model_name:
        return json.dumps({"error": "Please load the model first using the 'Load Selected Model' button."}, indent=2)
    if model_manager.current_model is None:
        return json.dumps({"error": "No model loaded. Please load a model first."}, indent=2)

    prompt = f"""### Instruction:
{system_instruction}
Remember to ALWAYS format your response as valid JSON.
### Input:
{user_input}
### Response:
{{"""
    try:
        inputs = model_manager.generate(prompt)

        meta_config = {
            "do_sample": False,  # greedy decoding, so no temperature setting is needed
            "max_new_tokens": 512,
            "repetition_penalty": 1.1,
            "use_cache": True,
            "pad_token_id": model_manager.current_tokenizer.eos_token_id,
            "eos_token_id": model_manager.current_tokenizer.eos_token_id
        }
        generation_config = GenerationConfig(**meta_config)

        with torch.no_grad():
            outputs = model_manager.current_model.generate(
                **inputs,
                generation_config=generation_config
            )

        decoded_output = model_manager.current_tokenizer.batch_decode(
            outputs,
            skip_special_tokens=True
        )[0]
        assistant_response = decoded_output.split("### Response:")[-1].strip()

        try:
            last_brace = assistant_response.rindex('}')
            assistant_response = assistant_response[:last_brace + 1]
            json_response = json.loads(assistant_response)
            return json.dumps(json_response, indent=2)
        except (json.JSONDecodeError, ValueError):
            return json.dumps({
                "error": "Failed to generate valid JSON",
                "raw_response": assistant_response
            }, indent=2)

    except Exception as e:
        return json.dumps({
            "error": f"Error generating response: {str(e)}",
            "details": "An unexpected error occurred during generation"
        }, indent=2)
# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Chat Interface with Model Selection (JSON Output)")

    with gr.Row():
        # Left column for inputs
        with gr.Column():
            model_dropdown = gr.Dropdown(
                choices=load_model_links(),
                label="Select Model",
                info="Choose a model from the list"
            )
            load_button = gr.Button("Load Selected Model")
            model_status = gr.Textbox(label="Model Status")
            system_instruction = gr.Textbox(
                value=default_system_message,
                placeholder="Enter system instruction here...",
                label="System Instruction",
                lines=3
            )
            user_input = gr.Textbox(
                placeholder="Type your message here...",
                label="Your Message",
                lines=3
            )
            submit_btn = gr.Button("Submit")

        # Right column for bot response
        with gr.Column():
            response_display = gr.Textbox(
                label="Bot Response (JSON)",
                interactive=False,
                placeholder="Response will appear here in JSON format.",
                lines=10
            )

    # Event handlers
    load_button.click(
        fn=model_manager.load_model,
        inputs=[model_dropdown],
        outputs=[model_status]
    )
    submit_btn.click(
        fn=generate_response,
        inputs=[model_dropdown, system_instruction, user_input],
        outputs=[response_display]
    )
# Launch the app
demo.launch()