suriya7
/

conversational-gpt-V1

Text Generation

Transformers

Safetensors

gpt2

text-generation-inference

Model card Files Files and versions Community

suriya7 commited on Nov 18, 2024

Commit

7c06fd6

verified ·

1 Parent(s): d013d1f

Update README.md

Browse files

Files changed (1) hide show

README.md +41 -16

README.md CHANGED Viewed

@@ -43,32 +43,57 @@ while True:
     conversation_history = conversation_history[-5:]
     # Build the full prompt
-    prompt = prompt + "\n".join(conversation_history)
     # Tokenize the prompt
-    encodeds = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
     # Move model and inputs to the appropriate device
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     model.to(device)
     inputs = encodeds.to(device)
-    # Generate the model's response
-    generated_ids = model.generate(
-        inputs,
-        max_new_tokens=512,
-        pad_token_id=50259,
-        eos_token_id=50259,
-        num_return_sequences=1,
-    )
-    # Decode and process the model's response
-    ans = tokenizer.decode(generated_ids[0])
-    assistant_response = ans.split("<|im_start|>assistant")[-1].replace("<|im_end|>", "").strip()
-    print(f"Assistant: {assistant_response}")
     # Add the assistant's response to the conversation history
-    conversation_history.append(f"<|im_start|>assistant\n{assistant_response}<|im_end|>")
 ```

     conversation_history = conversation_history[-5:]
     # Build the full prompt
+    current_prompt = prompt + "\n".join(conversation_history)
     # Tokenize the prompt
+    encodeds = tokenizer(current_prompt, return_tensors="pt", truncation=True).input_ids
     # Move model and inputs to the appropriate device
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     model.to(device)
     inputs = encodeds.to(device)
+    # Create an empty list to store generated tokens
+    generated_ids = inputs
+    # Start generating tokens one by one
+    assistant_response = ""
+    # print("Assistant: ", end="", flush=True)  # Print "Assistant:" once before streaming starts
+    for _ in range(512):  # Specify a max token limit for streaming
+        # Generate the next token in the sequence
+        next_token = model.generate(
+            generated_ids,
+            max_new_tokens=1,
+            pad_token_id=50259,
+            eos_token_id=50259,
+            num_return_sequences=1,
+            do_sample=True,  # Use sampling for more diverse responses
+            top_k=50,        # Limit to the top-k tokens to sample from
+            temperature=0.7, # Adjust temperature for randomness
+            top_p =0.90
+        )
+        # Add the generated token to the list
+        generated_ids = torch.cat([generated_ids, next_token[:, -1:]], dim=1)
+        # Decode the generated token (flatten it to a list of IDs)
+        token_id = next_token[0, -1].item()  # Extract the last token as an integer
+        token = tokenizer.decode([token_id], skip_special_tokens=True)
+        # Append the token to the ongoing response
+        assistant_response += token
+        print(token, end="", flush=True)  # Stream the token in real time
+        # If EOS token is encountered, stop generating
+        if token_id == 50259:  # EOS token
+            break
+    print()  # Print a newline after streaming is complete
     # Add the assistant's response to the conversation history
+    conversation_history.append(f"<|im_start|>{assistant_response.strip()}<|im_end|>")
 ```