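"""Gradio app that answers AIML questions with a LoRA fine-tuned Llama 3.1 model.

The model is pulled from the Hugging Face Hub, loaded in 4-bit, and served through
a transformers text-generation pipeline behind a simple Gradio interface.
"""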
import gradio as gr
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

max_seq_length = 2048  # Maximum sequence length (training-time setting; not used at inference here)
dtype = None           # None = auto-detect. float16 for Tesla T4/V100, bfloat16 for Ampere+ GPUs
load_in_4bit = True    # Load the model with 4-bit quantization to reduce memory usage; set False for full precision
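# Optional sketch (not wired in below): newer transformers releases prefer passing a
# BitsAndBytesConfig instead of the bare load_in_4bit flag. This assumes bitsandbytes
# is installed; adapt before swapping it into the from_pretrained() call further down.
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(load_in_4bit=True)
# model = AutoPeftModelForCausalLM.from_pretrained(
#     "GSridhar1982/AIML_QA_Llama31_FineTuned_UsingLora",
#     quantization_config=bnb_config,
# )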
# Alternative path (commented out): serve the quantized GGUF model with llama-cpp-python instead.
# from llama_cpp import Llama
#
# # Load the Llama model
# llm = Llama.from_pretrained(
#     repo_id="GSridhar1982/QA_Llama31_Quantized_GGUF",
#     filename="QA_llama31_unsloth.Q4_K_M.gguf",
# )
#
# def generate_response(user_input):
#     # Perform inference
#     response = llm.create_chat_completion(
#         messages=[
#             {
#                 "role": "user",
#                 "content": user_input
#             }
#         ]
#     )
#
#     # Extract the model's reply
#     model_reply = response['choices'][0]['message']['content']
#     return model_reply
# Load the LoRA fine-tuned model and its tokenizer once at startup,
# rather than on every request
model = AutoPeftModelForCausalLM.from_pretrained(
    "GSridhar1982/AIML_QA_Llama31_FineTuned_UsingLora",
    load_in_4bit=load_in_4bit,
)
tokenizer = AutoTokenizer.from_pretrained("GSridhar1982/AIML_QA_Llama31_FineTuned_UsingLora")

# Create a text-generation pipeline around the loaded model
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)


def generate_answer(user_input):
    # Run the pipeline on the user's question and return the generated text
    prediction = generator(user_input)[0]['generated_text']
    return prediction
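# Optional sketch: the pipeline call above uses default generation settings. If answers
# come back truncated or too verbose, the standard text-generation arguments can be
# passed through, e.g. (values here are illustrative, not tuned):
# prediction = generator(user_input, max_new_tokens=256, do_sample=True, temperature=0.7)[0]['generated_text']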

# Create a Gradio interface  
iface = gr.Interface(  
    fn=generate_answer,  
    inputs="textbox",  
    outputs="text",  
    title="AIML Q&A Chatbot",  
    description="Ask questions related to AIML and get answers from the fine-tuned Llama model."  
)  
  
# Launch the app  
iface.launch()
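# Note: on Hugging Face Spaces this plain launch() call is enough; when running locally,
# iface.launch(share=True) would additionally expose a temporary public URL.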