# Import necessary libraries
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from openai import OpenAI
import os
import torch
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import login
# Initialize the OpenAI client
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1",
    api_key=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
)
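# The OpenAI-style client above points at the Hugging Face Inference API endpoint; the replies in this
# app are generated locally with a transformers pipeline, so this client is not called below.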
api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if api_token:
    login(token=api_token)
else:
    print("API token is not set in the environment variables.")
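# Logging in authenticates this session with the Hub so gated or private model repos can be downloaded.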
# Define model links
model_links = {
"HAH-2024-v0.1": "drmasad/HAH-2024-v0.11"
}
# Set selected model
selected_model = "HAH-2024-v0.1"
# Sidebar setup
temp_values = st.sidebar.slider("Select a temperature value", 0.0, 1.0, 0.5)
def reset_conversation():
    st.session_state.conversation = []
    st.session_state.messages = []

st.sidebar.button("Reset Chat", on_click=reset_conversation)
st.sidebar.write(f"You're now chatting with **{selected_model}**")
st.sidebar.image("https://www.hmgaihub.com/untitled.png")
@st.cache_resource  # cache the loaded model across Streamlit reruns so it is not re-initialized on every interaction
def load_model(selected_model_name):
    model_name = model_links[selected_model_name]

    # Set a specific device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # 4-bit quantization settings: NF4 weights, bfloat16 compute, fp32 CPU offload enabled for any modules mapped to CPU
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=False,
        llm_int8_enable_fp32_cpu_offload=True,
    )
device_map = {"": device} # Default device for all components
# Load model with proper device mapping
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map=device_map, # Assign device
trust_remote_code=True,
)
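    # The next two lines are QLoRA fine-tuning preparations (disable the KV cache, cast norms/embeddings
    # for k-bit training); they are not strictly required for inference.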
    model.config.use_cache = False
    model = prepare_model_for_kbit_training(model)

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
    )
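    # get_peft_model attaches freshly initialized LoRA adapters; their B matrices start at zero,
    # so the wrapped model behaves like the quantized base model until the adapters are trained.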
    model = get_peft_model(model, peft_config)

    tokenizer = AutoTokenizer.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.2", trust_remote_code=True
    )

    return model, tokenizer
# Load model and tokenizer
model, tokenizer = load_model(selected_model)
# Chat application logic
if "messages" not in st.session_state:
st.session_state.messages = []
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
if prompt := st.chat_input("Ask me anything about diabetes"):
    with st.chat_message("user"):
        st.markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("assistant"):
        result = pipeline(
            task="text-generation",
            model=model,
            tokenizer=tokenizer,
            max_length=1024,
            temperature=temp_values,
            do_sample=temp_values > 0,  # sampling must be enabled for the temperature slider to take effect
            return_full_text=False,  # return only the generated continuation, not the echoed prompt
        )(prompt)
        response = result[0]['generated_text']
        st.markdown(response)
    st.session_state.messages.append({"role": "assistant", "content": response})