Rafay17 committed on
Commit b75c2ab · verified · 1 Parent(s): cea2d18

Update app.py

Files changed (1):
  1. app.py +41 -42
app.py CHANGED
@@ -1,57 +1,56 @@
-from transformers import AutoTokenizer, TextStreamer
+import gradio as gr
 from unsloth import FastLanguageModel
-import torch
+from transformers import AutoTokenizer, TextStreamer

 # Load the model and tokenizer
-model_name = "Rafay17/Llama3.2_1b_customModle2"  # Your model name
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-model = FastLanguageModel.from_pretrained(
-    model_name=model_name,
-    max_seq_length=512,  # Adjust as needed
-    dtype="float16",  # Adjust as needed
-    load_in_4bit=True  # Adjust based on your needs
-)
-
-FastLanguageModel.for_inference(model)  # Call this immediately after loading the model
+model_name = "Rafay17/Llama3.2_1b_customModel2"  # Your custom model
+model, tokenizer = FastLanguageModel.from_pretrained(model_name)
+FastLanguageModel.for_inference(model)  # Enable the model for inference

 # Function to generate a response
-def generate_response(input_text):
-    # Prepare the labeled prompt for the model
-    labeled_prompt = f"User Input: {input_text}\nResponse:"
+def generate_response(message, history, max_tokens, temperature, top_p):
+    # Prepare the labeled prompt for response generation
+    labeled_prompt = f"User Input: {message}\nResponse:"

-    # Prepare the input for the model
+    # Tokenize the input
     inputs = tokenizer(
         [labeled_prompt],
         return_tensors="pt",
         padding=True,
         truncation=True,
-        max_length=512,  # Ensure this matches your model's max length
+        max_length=512,
     ).to("cuda")

-    # Set up the text streamer to stream the generated response
+    # Generate the response
     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+    response = ""
+    for token in model.generate(
+        input_ids=inputs.input_ids,
+        attention_mask=inputs.attention_mask,
+        streamer=text_streamer,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        pad_token_id=tokenizer.eos_token_id,
+    ):
+        response += token
+
+    return response
+
+
+# Define the Gradio interface
+demo = gr.Interface(
+    fn=generate_response,
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter your message here..."),
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+        gr.Slider(minimum=1, maximum=512, value=64, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p (nucleus sampling)"),
+    ],
+    outputs=gr.Textbox(label="Chatbot Response"),
+    live=True
+)

-    # Generate the response
-    with torch.no_grad():  # Disable gradient calculation for inference
-        model.generate(
-            input_ids=inputs.input_ids,
-            attention_mask=inputs.attention_mask,
-            streamer=text_streamer,
-            max_new_tokens=100,  # Adjust this value as needed
-            pad_token_id=tokenizer.eos_token_id,
-        )
-
-# Function to take user input and generate output
-def user_interaction():
-    print("Welcome to the Chatbot! Type 'exit' to quit.")
-    while True:
-        user_input = input("You: ")
-        if user_input.lower() == 'exit':
-            print("Exiting the chatbot. Goodbye!")
-            break
-        print("Chatbot is generating a response...")
-        generate_response(user_input)
-
-# Start the user interaction
-user_interaction()
+if __name__ == "__main__":
+    demo.launch()
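
Editor's note: the committed version iterates over model.generate(), but generate() returns a tensor of token IDs rather than an iterable of strings, so "response += token" would fail and the TextStreamer only prints to stdout rather than the Gradio textbox. Below is a minimal sketch of how the new app.py could be wired so the callback actually returns text: it decodes only the newly generated tokens instead of looping, drops live=True so generation runs on submit rather than on every keystroke, and keeps the system-message box unused in the prompt, as in the commit. The parameter name system_message, the do_sample=True flag, and the assumption that a CUDA device is available are editorial additions, not part of the commit.

import gradio as gr
from unsloth import FastLanguageModel

# Assumption: same model repo as in the commit; a CUDA device is available.
model_name = "Rafay17/Llama3.2_1b_customModel2"
model, tokenizer = FastLanguageModel.from_pretrained(model_name)
FastLanguageModel.for_inference(model)  # enable fast inference mode


def generate_response(message, system_message, max_tokens, temperature, top_p):
    # Same labeled prompt as the committed app; the system message is accepted
    # from the UI but, as in the commit, not folded into the prompt.
    labeled_prompt = f"User Input: {message}\nResponse:"

    inputs = tokenizer(
        [labeled_prompt],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to("cuda")

    # generate() returns a tensor of token IDs; decode only the tokens that
    # come after the prompt to get the model's reply as text.
    output_ids = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
        do_sample=True,  # assumption: sample so temperature/top_p take effect
        pad_token_id=tokenizer.eos_token_id,
    )
    new_tokens = output_ids[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your message here..."),
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=512, value=64, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p (nucleus sampling)"),
    ],
    outputs=gr.Textbox(label="Chatbot Response"),
)

if __name__ == "__main__":
    demo.launch()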