Spaces:

hackergeek
/

CBT

Runtime error

CBT

File size: 1,577 Bytes

f1ff7a7
dc27180
 
f1ff7a7
4098b12
dc27180
 
4098b12
 
 
dc27180
 
 
f1ff7a7
dc27180
 
 
 
 
 
 
 
 
 
f1ff7a7
dc27180
 
f086418
dc27180
 
4098b12
f086418
4098b12
dc27180
f086418
dc27180
4098b12
dc27180
 
 
 
f1ff7a7
4098b12

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the fine-tuned checkpoint and its tokenizer for CPU-only inference.
_MODEL_ID = "hackergeek/gemma-finetuned"

_MODEL_LOAD_OPTIONS = {
    "torch_dtype": torch.float32,  # float32 weights
    "device_map": "cpu",  # place the whole model on the CPU
    "low_cpu_mem_usage": True,  # relies on Accelerate being installed
}

model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, **_MODEL_LOAD_OPTIONS)

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def format_prompt(message, history):
    """Build the tagged prompt string for the next assistant turn.

    The prompt starts with the system instruction wrapped in
    ``<system>`` tags, followed by one ``<user>``/``<assistant>`` pair
    per past exchange, and ends with the new user message and an
    opening ``<assistant>`` tag that is left unclosed.

    Args:
        message: The new user message to append.
        history: Iterable of two-item ``(user_msg, bot_msg)`` entries,
            oldest first.

    Returns:
        The complete prompt as a single string.
    """
    system_prompt = "You are a knowledgeable space expert assistant. Answer questions about astronomy, space exploration, and related topics in a clear and engaging manner."

    # Collect the pieces first and concatenate once at the end.
    segments = [f"<system>{system_prompt}</system>\n"]
    segments.extend(
        f"<user>{asked}</user>\n<assistant>{answered}</assistant>\n"
        for asked, answered in history
    )
    # The trailing <assistant> tag stays open.
    segments.append(f"<user>{message}</user>\n<assistant>")
    return "".join(segments)

def respond(message, history):
    """Generate the assistant's reply to ``message``.

    The prompt is built with ``format_prompt``, tokenized without adding
    special tokens, and passed to ``model.generate`` with sampling
    enabled. Only the tokens produced after the prompt are decoded.

    Args:
        message: The new user message.
        history: Prior conversation, forwarded unchanged to
            ``format_prompt``.

    Returns:
        The decoded reply text with special tokens removed.
    """
    encoded = tokenizer(
        format_prompt(message, history),
        return_tensors="pt",
        add_special_tokens=False,
    )
    prompt_ids = encoded.input_ids

    sampling_settings = {
        "max_new_tokens": 256,  # Reduced for CPU safety
        "temperature": 0.7,
        "top_p": 0.85,
        "repetition_penalty": 1.1,
        "do_sample": True,
    }
    generated = model.generate(
        prompt_ids,
        attention_mask=encoded.attention_mask,
        **sampling_settings,
    )

    # Slice off the prompt so only the new tokens are decoded.
    prompt_length = prompt_ids.shape[1]
    new_tokens = generated[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# NOTE: the rest of the Gradio interface code (unchanged from the previous version) is omitted here.