redael committed on
Commit 6765159 · verified · 1 Parent(s): a1e1bf3

Update app.py

Files changed (1)
  1. app.py +35 -33
app.py CHANGED
@@ -1,48 +1,37 @@

- import os
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch

- # Load your model and tokenizer from Hugging Face
- model_name = 'redael/model_udc'
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

- # Function to generate response
- def generate_response(message, history, system_message, max_tokens, temperature, top_p):
-     # Prepare the conversation history
-     messages = [{"role": "system", "content": system_message}]
-
-     for user_msg, bot_msg in history:
-         if user_msg:
-             messages.append({"role": "user", "content": user_msg})
-         if bot_msg:
-             messages.append({"role": "assistant", "content": bot_msg})
-
-     messages.append({"role": "user", "content": message})
-
-     # Tokenize and prepare the input
-     prompt = "\n".join([f"{msg['role'].capitalize()}: {msg['content']}" for msg in messages])
      inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
-
-     # Generate the response
      outputs = model.generate(
          inputs['input_ids'],
-         max_length=max_tokens,
          num_return_sequences=1,
          pad_token_id=tokenizer.eos_token_id,
          temperature=temperature,
          top_p=top_p,
-         early_stopping=True,
-         do_sample=True  # Enable sampling
      )
      response = tokenizer.decode(outputs[0], skip_special_tokens=True)

-     # Clean up the response
      response = response.split("Assistant:")[-1].strip()
      response_lines = response.split('\n')
      clean_response = []
@@ -50,15 +39,28 @@ def generate_response(message, history, system_message, max_tokens, temperature, top_p):
          if "User:" not in line and "Assistant:" not in line:
              clean_response.append(line)
      response = ' '.join(clean_response)

-     return [(message, response)]

- # Create the Gradio chat interface
  demo = gr.ChatInterface(
-     fn=generate_response,
-
-     title="Chatbot",
-     description="Ask anything to the chatbot."
  )

  if __name__ == "__main__":
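For reference, this is what the removed generate_response sent to the tokenizer: it folded Gradio's (user, bot) history tuples into a role-prefixed transcript. A minimal, runnable sketch with a toy history (the values below are illustrative, not taken from the Space); the updated version of app.py follows after it.

# Mirrors the deleted prompt-building logic; toy history for illustration only.
system_message = "You are a helpful assistant."
history = [("Hi", "Hello! How can I help?")]
message = "Tell me a joke"

messages = [{"role": "system", "content": system_message}]
for user_msg, bot_msg in history:
    if user_msg:
        messages.append({"role": "user", "content": user_msg})
    if bot_msg:
        messages.append({"role": "assistant", "content": bot_msg})
messages.append({"role": "user", "content": message})

prompt = "\n".join([f"{msg['role'].capitalize()}: {msg['content']}" for msg in messages])
print(prompt)
# System: You are a helpful assistant.
# User: Hi
# Assistant: Hello! How can I help?
# User: Tell me a joke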
 
  import gradio as gr
  import torch
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel

+ # Load the custom model and tokenizer
+ model_path = 'redael/model_udc'
+ tokenizer = GPT2Tokenizer.from_pretrained(model_path)
+ model = GPT2LMHeadModel.from_pretrained(model_path)

+ # Check if CUDA is available and use GPU if possible, enable FP16 precision
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
+ if device.type == 'cuda':
+     model = model.half()  # Use FP16 precision

+ def generate_response(prompt, model, tokenizer, max_length=100, num_beams=1, temperature=0.7, top_p=0.9, repetition_penalty=1.0):
+     # Prepare the prompt
+     prompt = f"User: {prompt}\nAssistant:"
      inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
      outputs = model.generate(
          inputs['input_ids'],
+         max_length=max_length,
          num_return_sequences=1,
          pad_token_id=tokenizer.eos_token_id,
+         num_beams=num_beams,  # Use a lower number of beams
          temperature=temperature,
          top_p=top_p,
+         repetition_penalty=repetition_penalty,
+         early_stopping=True
      )
      response = tokenizer.decode(outputs[0], skip_special_tokens=True)

+     # Post-processing to clean up the response
      response = response.split("Assistant:")[-1].strip()
      response_lines = response.split('\n')
      clean_response = []
      for line in response_lines:
          if "User:" not in line and "Assistant:" not in line:
              clean_response.append(line)
      response = ' '.join(clean_response)
+     return response.strip()
+
+ def respond(message, history: list[tuple[str, str]]):
+     # Prepare the prompt from the history and the new message
+     system_message = "You are a friendly chatbot."
+     conversation = system_message + "\n"
+     for user_message, assistant_response in history:
+         conversation += f"User: {user_message}\nAssistant: {assistant_response}\n"
+     conversation += f"User: {message}\nAssistant:"
+
+     # Fixed values for generation parameters
+     max_tokens = 100  # Reduce max tokens if possible
+     temperature = 0.7
+     top_p = 0.9
+
+     response = generate_response(conversation, model, tokenizer, max_length=max_tokens, temperature=temperature, top_p=top_p)

+     return response

+ # Gradio Chat Interface without customizable inputs
  demo = gr.ChatInterface(
+     respond
  )

  if __name__ == "__main__":
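The hunk ends at the __main__ guard, so the launch call itself is not shown (presumably demo.launch(), though the diff does not include it). Below is a minimal, runnable sketch of the new data flow with toy values: the conversation string respond builds from the history, the extra User/Assistant wrapper generate_response adds before tokenizing, and the post-processing applied to the decoded output. Only the string handling is reproduced; the real Space runs this through model.generate on redael/model_udc.

# Toy values only; mirrors the string handling of the new respond() / generate_response().
history = [("Hi", "Hello! How can I help?")]
message = "Tell me a joke"

# respond(): system line + transcript + new user turn
conversation = "You are a friendly chatbot.\n"
for user_message, assistant_response in history:
    conversation += f"User: {user_message}\nAssistant: {assistant_response}\n"
conversation += f"User: {message}\nAssistant:"

# generate_response() then wraps the whole conversation in one more User/Assistant pair,
# so the tokenized prompt ends with two "Assistant:" tags.
prompt = f"User: {conversation}\nAssistant:"

# Simulated decoded output standing in for model.generate(); same cleanup as the new code.
decoded = prompt + " Why did the chicken cross the road?\nUser: (model rambling)"
response = decoded.split("Assistant:")[-1].strip()
response = ' '.join(line for line in response.split('\n')
                    if "User:" not in line and "Assistant:" not in line)
print(response)  # Why did the chicken cross the road?

One observation on the generation call: with do_sample removed and num_beams=1, recent transformers releases decode greedily, so the temperature and top_p arguments passed through here may have no effect; whether that is intended is not stated in the commit.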