from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr

# Run on the GPU when one is available, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'


# Initialize the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("deepapaikar/katzbot-phi2",
                                             torch_dtype=torch.float16, 
                                             device_map="auto",
                                             trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("deepapaikar/katzbot-phi2", trust_remote_code=True)
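
# Assumption: like the base phi-2 tokenizer, this checkpoint may ship without a pad token.
# Reusing the EOS token as padding keeps tokenizer(..., padding=True) below from raising an error.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token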


def predict_answer(question, token=25):
    messages = [{"role": "user", "content": f"{question}"}]

    # Build the prompt text with the tokenizer's chat template
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Tokenize the prompt; device_map="auto" already placed the model (in eval mode),
    # so only the input tensors need to be moved to the target device
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate a completion; max_new_tokens caps only the newly generated text
    output_sequences = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=token,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
    )

    # Decode only the generated tokens so the prompt is not echoed back in the answer
    generated_tokens = output_sequences[0][inputs['input_ids'].shape[-1]:]
    output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return output_text



def gradio_predict(question, token):
    answer = predict_answer(question, token)
    return answer
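
# Optional sanity check, kept commented out so it does not run on import;
# the exact wording of the answer depends on the fine-tuned checkpoint.
# print(predict_answer("Where is Yeshiva University located?", token=25))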

# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[gr.Textbox(label="Question", placeholder="e.g. Where is Yeshiva University located?", scale=4),
            gr.Slider(2, 100, value=25, label="Token Count", info="Choose between 2 and 100")],
    outputs=gr.TextArea(label="Answer"),
    title="KatzBot",
    description="Phi2-trial1",
)

# Launch the app
iface.queue().launch(debug=True)
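
# A minimal sketch of how the running app could be queried from another process,
# assuming the default local URL http://127.0.0.1:7860 and Gradio's standard
# "/predict" endpoint for an Interface. Illustrative only, hence commented out.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860")
# answer = client.predict("Where is Yeshiva University located?", 25, api_name="/predict")
# print(answer)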