from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch
import gradio as gr

max_seq_length = 2048   # Choose any! We auto support RoPE Scaling internally!
dtype = None            # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True     # Use 4-bit quantization to reduce memory usage. Can be False.

# 4-bit pre-quantized models we support, for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # New Google model trained on 6 trillion tokens, 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

# Load the fine-tuned model ONCE at startup. The original code reloaded it inside
# the request handler, which re-reads a 7B model from disk on every single query.
# To serve a base model instead, set model_name to any checkpoint, e.g.
# "unsloth/mistral-7b-v0.3" or "teknium/OpenHermes-2.5-Mistral-7B".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/drive/MyDrive/Colab Notebooks/lora_model",  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)
FastLanguageModel.for_inference(model)  # Enable Unsloth's faster inference mode

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Standing instruction sent with every query (a Category,Instruction table in CSV form).
system_instruction = """Category,Instruction
General Response,'Answer the user's query thoroughly and accurately, ensuring no details or points are omitted. Always recognize that 'AEC' refers to 'Assam Engineering College,' and vice versa, and use this understanding to provide clear, context-aware responses.'
Formatting,'Structure the output to be attractive, engaging, and professional, using proper formatting. Break the response into multiple paragraphs or sections if necessary to improve readability and organization.'
Use of Lists,'For queries that involve enumerations, options, or multiple steps, use bullet points or numbered lists to present the information clearly and concisely. For example:
- When listing departments or facilities.
- When explaining procedures or step-by-step guides.
- When summarizing key features or FAQs.'
Tone,'Maintain a friendly, informative tone, and deliver complete, standard answers to meet the user's expectations.'"""

# Handle a user query and return the model's response.
def chatbot_response(user_query):
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                system_instruction,  # instruction
                user_query,          # input
                "",                  # output - leave this blank for generation!
            )
        ],
        return_tensors = "pt",
    ).to("cuda")

    text_streamer = TextStreamer(tokenizer)

    # Generate the response (streamed to stdout as it is produced).
    response = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

    # Decode it, skipping special tokens so </s> does not leak into the output.
    decoded_output = tokenizer.batch_decode(response, skip_special_tokens = True)[0]

    # Keep only the text after the "### Response:" marker.
    response_start = decoded_output.find("### Response:") + len("### Response:")
    final_response = decoded_output[response_start:].strip()

    print("User Query:", user_query)          # Just for debugging, can be removed
    print("Model Response:", final_response)  # Just for debugging, can be removed
    return final_response

# Gradio interface
interface = gr.Interface(
    fn = chatbot_response,  # Function that processes the user input
    inputs = gr.Textbox(
        label = "Enter your query:",        # Label for the input box
        placeholder = "Type something...",  # Placeholder text
    ),
    outputs = gr.Textbox(label = "Response:"),  # Output box for the model's response
    title = "Simple Chatbot",
    description = "This is a simple chatbot interface. Type your query and get a response.",
)

# Launch the Gradio app
interface.launch()
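# --- Optional Colab setup (a sketch, not part of the original script) ---
# The adapter path above lives on Google Drive, which suggests this runs in a
# Colab notebook. If so, Drive must be mounted BEFORE the from_pretrained call
# at the top, and launching with share=True yields a temporary public URL for
# the app instead of a localhost-only server:
#
#     from google.colab import drive
#     drive.mount("/content/drive")
#     ...
#     interface.launch(share=True)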