import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import gradio as gr
from google.colab import userdata

# Read the Hugging Face token from the Colab secrets store and expose it to the libraries
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.environ['HF_TOKEN']

# Set up the model and tokenizer
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Configure 4-bit quantization (bitsandbytes needs a CUDA GPU; it does not run on CPU)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
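# Optional sketch (my assumption, not part of the original app): skip quantization when no
# CUDA GPU is available, since the bitsandbytes 4-bit kernels cannot execute on CPU.
# if not torch.cuda.is_available():
#     quantization_config = None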
# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config
)
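# Optional check: print the model's memory footprint to confirm quantization took effect
# (roughly a quarter of the bfloat16 size is expected).
# print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")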
# device_map="auto" already places the quantized model, so no manual model.to(device) call is needed
# Build the text-generation pipeline once at startup (recreating it on every request is wasteful)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Define the function for the Gradio interface
def chat_with_phi(message):
    conversation = [{"role": "user", "content": message}]
    response = pipe(conversation, max_new_tokens=256)
    # The pipeline returns the whole conversation; the assistant's reply is the last message
    return response[0]['generated_text'][-1]['content']
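# Quick sanity check before wiring up the UI (optional):
# print(chat_with_phi("Summarize the Phi-3.5 model family in one sentence."))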
# Set up the Gradio interface
app = gr.Interface(
    fn=chat_with_phi,
    inputs=gr.Textbox(label="Type your message:"),
    outputs=gr.Textbox(label="Phi 3.5 Responds:"),
    title="Phi 3.5 Text Chat",
    description="Chat with the Phi 3.5 model. Ask anything!",
    theme="huggingface"
)
# Launch the app
app.launch(debug=True)
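# If you need a temporary public URL (e.g. when running from Colab), Gradio also supports:
# app.launch(debug=True, share=True)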