suriya7 committed
Commit 7c06fd6 · verified · 1 Parent(s): d013d1f

Update README.md

Files changed (1):
  1. README.md (+41 -16)
README.md CHANGED
@@ -43,32 +43,57 @@ while True:
     conversation_history = conversation_history[-5:]
 
     # Build the full prompt
-    prompt = prompt + "\n".join(conversation_history)
+    current_prompt = prompt + "\n".join(conversation_history)
 
     # Tokenize the prompt
-    encodeds = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
+    encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids
 
     # Move model and inputs to the appropriate device
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     model.to(device)
     inputs = encodeds.to(device)
 
-    # Generate the model's response
-    generated_ids = model.generate(
-        inputs,
-        max_new_tokens=512,
-        pad_token_id=50259,
-        eos_token_id=50259,
-        num_return_sequences=1,
-    )
-
-    # Decode and process the model's response
-    ans = tokenizer.decode(generated_ids[0])
-    assistant_response = ans.split("<|im_start|>assistant")[-1].replace("<|im_end|>", "").strip()
-    print(f"Assistant: {assistant_response}")
+    # Start from the prompt tokens; generated tokens are appended to this tensor
+    generated_ids = inputs
+
+    # Generate tokens one at a time so the response can be streamed
+    assistant_response = ""
+    # print("Assistant: ", end="", flush=True)  # Print "Assistant:" once before streaming starts
+    for _ in range(512):  # Maximum number of streamed tokens
+        # Generate the next token in the sequence
+        next_token = model.generate(
+            generated_ids,
+            max_new_tokens=1,
+            pad_token_id=50259,
+            eos_token_id=50259,
+            num_return_sequences=1,
+            do_sample=True,   # Use sampling for more diverse responses
+            top_k=50,         # Limit sampling to the top-k tokens
+            temperature=0.7,  # Adjust temperature for randomness
+            top_p=0.90,
+        )
+
+        # Append the new token to the running sequence
+        generated_ids = torch.cat([generated_ids, next_token[:, -1:]], dim=1)
+
+        # Decode the generated token
+        token_id = next_token[0, -1].item()  # Extract the last token as an integer
+        token = tokenizer.decode([token_id], skip_special_tokens=True)
+
+
+        # Append the token to the ongoing response
+        assistant_response += token
+        print(token, end="", flush=True)  # Stream the token in real time
+
+        # If the EOS token is encountered, stop generating
+        if token_id == 50259:  # EOS token
+            break
+
+    print()  # Print a newline after streaming is complete
 
     # Add the assistant's response to the conversation history
-    conversation_history.append(f"<|im_start|>assistant\n{assistant_response}<|im_end|>")
+    conversation_history.append(f"<|im_start|>assistant\n{assistant_response.strip()}<|im_end|>")
+
 ```
 
 
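
For context, this change swaps the single `model.generate(...)` call for a manual one-token-at-a-time loop so the reply can be printed as it is produced. A roughly equivalent alternative is the `TextStreamer` helper that ships with the `transformers` library: it prints tokens to stdout from within a single `generate` call. The sketch below is a minimal illustration, not part of this commit; it assumes the same `model`, `tokenizer`, `inputs`, special-token id 50259, and sampling settings used elsewhere in this README.

```python
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated,
# skipping the prompt and special tokens
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

generated_ids = model.generate(
    inputs,
    streamer=streamer,       # tokens are printed as they are produced
    max_new_tokens=512,
    pad_token_id=50259,
    eos_token_id=50259,
    do_sample=True,
    top_k=50,
    top_p=0.90,
    temperature=0.7,
)

# The full reply is still available afterwards for the conversation history
assistant_response = tokenizer.decode(
    generated_ids[0, inputs.shape[1]:], skip_special_tokens=True
).strip()
```

Because the loop in the diff calls `generate` once per token, each call re-processes the whole sequence from scratch; a single `generate` call with a streamer (or `TextIteratorStreamer` when the output is consumed from another thread) reuses the key/value cache internally, so it is typically faster for long replies.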