Spaces:

AIRRC
/

ELN-Llama-1B-base-chat

Running

App Files Community

diabolic6045 committed on Jan 21

Commit

3611d45

verified ·

1 Parent(s): 92db476

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -28

app.py CHANGED Viewed

@@ -9,31 +9,33 @@ model = AutoModelForCausalLM.from_pretrained("diabolic6045/ELN-Llama-1B-base")
 def generate_response(message, temperature, max_length):
     # Tokenize input
     inputs = tokenizer(message, return_tensors="pt", truncation=True, max_length=512)
-    # Initialize the generated text with the input message
-    generated_text = message
     # Generate response token by token
-    with torch.no_grad():
-        generated_ids = model.generate(
-            inputs["input_ids"],
-            max_length=max_length,
-            temperature=temperature,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,
-            num_return_sequences=1,
-            return_dict_in_generate=True,
-            output_scores=True,
-        )
-        # Get the generated token ids (excluding the input prompt)
-        new_tokens = generated_ids.sequences[0][inputs["input_ids"].shape[1]:]
-        # Decode and yield tokens one by one
-        for i in range(len(new_tokens)):
-            next_token = tokenizer.decode(new_tokens[:i+1], skip_special_tokens=True)
-            generated_text += next_token
-            yield generated_text
 # Create the Gradio interface
 demo = gr.Interface(
@@ -47,17 +49,17 @@ demo = gr.Interface(
     title="LLaMA Text Completion",
     description="Generate text completions using the ELN-Llama-1B model. Enter the start of a text, and the model will continue it.",
     examples=[
-        ["Once upon a time in a magical forest", 0.7, 200],
-        ["The recipe for making the perfect chocolate cake requires", 0.7, 200],
-        ["In the year 2150, humanity had finally achieved", 0.7, 200],
-        ["The most important principles of effective programming are", 0.8, 300],
     ],
     article="""
     ## Tips for better completions:
     - Start with a clear and detailed prompt
     - Adjust temperature: Higher for creative writing, lower for factual completion
     - Adjust max length based on how much text you want to generate
-    """,
 )
 if __name__ == "__main__":

 def generate_response(message, temperature, max_length):
     """Stream a sampled continuation of ``message``, one token at a time.

     Args:
         message: Prompt text to continue.
         temperature: Sampling temperature. Values <= 0 select greedy
             (argmax) decoding instead of dividing the logits by zero.
         max_length: Upper bound on the total sequence length in tokens
             (prompt tokens plus generated tokens).

     Yields:
         ``message`` followed by all text generated so far, once per new
         token. If no token is produced (the prompt already fills
         ``max_length`` or the first sampled token is EOS), ``message`` is
         yielded once so the caller always receives an output.
     """
     # Tokenize input
     inputs = tokenizer(message, return_tensors="pt", truncation=True, max_length=512)
     input_ids = inputs["input_ids"]
     # int() guards against a UI widget handing over a float; a non-positive
     # budget simply results in zero loop iterations.
     budget = int(max_length) - input_ids.shape[1]
     generated = []
     step_input = input_ids
     past_key_values = None
     # Generate response token by token
     for _ in range(budget):
         # Keep no_grad scoped to the forward pass only, so the grad-mode
         # change is not left active while the generator is suspended.
         with torch.no_grad():
             outputs = model(step_input, past_key_values=past_key_values, use_cache=True)
         # Reuse cached attention state so each step only processes the
         # newest token instead of the whole sequence again.
         past_key_values = outputs.past_key_values
         next_token_logits = outputs.logits[:, -1, :]
         if temperature > 0:
             # Apply temperature and sample from the distribution
             probs = torch.softmax(next_token_logits / temperature, dim=-1)
             next_token = torch.multinomial(probs, num_samples=1)
         else:
             # Temperature 0 (or negative) means deterministic decoding
             next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
         token_id = next_token.item()
         # Stop if we generate an EOS token
         if token_id == tokenizer.eos_token_id:
             break
         generated.append(token_id)
         step_input = next_token
         # Decode the whole continuation each time: decoding tokens one by
         # one garbles characters that span several tokens.
         yield message + tokenizer.decode(generated, skip_special_tokens=True)
     if not generated:
         yield message
 # Create the Gradio interface
 demo = gr.Interface(
     title="LLaMA Text Completion",
     description="Generate text completions using the ELN-Llama-1B model. Enter the start of a text, and the model will continue it.",
     examples=[
+        ["Once upon a time in a magical forest", 0.7, 50],
+        ["The recipe for making the perfect chocolate cake requires", 0.7, 50],
+        ["In the year 2150, humanity had finally achieved", 0.7, 50],
+        ["The most important principles of effective programming are", 0.8, 50],
     ],
     article="""
     ## Tips for better completions:
     - Start with a clear and detailed prompt
     - Adjust temperature: Higher for creative writing, lower for factual completion
     - Adjust max length based on how much text you want to generate
+    """
 )
 if __name__ == "__main__":