import os

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from google.colab import userdata

# Model to load from the Hugging Face Hub
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"

# Read the Hugging Face token from Colab secrets and expose it to the libraries
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.environ['HF_TOKEN']

# Set up the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure 4-bit quantization (bitsandbytes requires a CUDA GPU)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# Load the model with quantization; device_map="auto" places it on the available
# device, so no manual model.to(device) call is needed
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
)

# Build the text-generation pipeline once and reuse it for every request
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,  # cap the length of each reply
)

# Define the function for the Gradio interface
def chat_with_phi(message):
    conversation = [{"role": "user", "content": message}]
    response = pipe(conversation)
    # The pipeline returns the whole conversation; the last turn is the assistant's reply
    return response[0]['generated_text'][-1]['content']

# Set up the Gradio interface
app = gr.Interface(
    fn=chat_with_phi,
    inputs=gr.Textbox(label="Type your message:"),
    outputs=gr.Textbox(label="Phi 3.5 Responds:"),
    title="Phi 3.5 Text Chat",
    description="Chat with Phi 3.5 model. Ask anything!",
    theme="huggingface",
)

# Launch the app
app.launch(debug=True)
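
# Running on a CPU-only runtime instead: bitsandbytes 4-bit quantization needs a
# CUDA GPU, so a minimal fallback sketch (an assumption, not part of the original
# setup) is to load the model without the quantization config:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_ID,
#       torch_dtype=torch.float32,  # float32 is the safe dtype on CPU
#       device_map="cpu",
#   )
#
# and then build the pipeline from this unquantized model before launching the app.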