Update app.py
app.py CHANGED
@@ -36,10 +36,10 @@ if torch.cuda.is_available():
 )
 model_id = "meta-llama/Llama-2-7b-chat-hf"
 base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto",quantization_config=bnb_config)
-model = PeftModel.from_pretrained(
-    base_model,"ranamhamoud/storytell")
+model = PeftModel.from_pretrained(model,"ranamhamoud/storytell")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-tokenizer.
+tokenizer.pad_token = tokenizer.eos_token
+
 
 def make_prompt(entry):
     return f"### Human: YOUR INSTRUCTION HERE: {entry} ### Assistant:"
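One caveat with the new adapter line: it passes `model` as the base, but `model` is only being defined on that same line, so as written it would raise a NameError; the old code passed `base_model`, which is presumably what is intended. A minimal sketch of the loading block under that assumption, including the pad-token fix from this hunk (the 4-bit `bnb_config` shown here is an assumption standing in for the config defined just above the hunk):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Assumed quantization config; the Space defines its own bnb_config above this hunk.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

model_id = "meta-llama/Llama-2-7b-chat-hf"
base_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=bnb_config
)
# Attach the LoRA adapter to the already-loaded base model, not to the name
# being defined on this line.
model = PeftModel.from_pretrained(base_model, "ranamhamoud/storytell")

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Llama 2's tokenizer ships without a pad token; reuse EOS so padding works.
tokenizer.pad_token = tokenizer.eos_token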
@@ -49,17 +49,20 @@ def generate(
     message: str,
     chat_history: list[tuple[str, str]],
     max_new_tokens: int = 1024,
-
-
-
-    repetition_penalty: float = 1.
+    temperature: float = 0.1, # Lower -> less random
+    top_p: float = 0.1, # Lower -> less random, considering only the top 10% of tokens at each step
+    top_k: int = 1, # Least random, only the most likely next token is considered
+    repetition_penalty: float = 1.0, # No repetition penalty
 ) -> Iterator[str]:
     conversation = []
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": make_prompt(message)})
 
-
+    enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
+
+
+    input_ids = enc.input_ids
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
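Two things stand out in this hunk. First, with top_k=1 the sampler can only ever pick the single most likely token, so despite do_sample=True the decoding is effectively greedy and the temperature and top_p settings have no remaining effect. Second, the new body still assembles `conversation` from the chat history but then encodes only make_prompt(message), so earlier turns never reach the model. A hypothetical variant that keeps the history in the prompt, using the tokenizer's built-in chat template (all names as in the surrounding code):

# Hypothetical: encode the whole conversation rather than just the latest message.
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    # Keep only the most recent tokens, as the existing trimming branch does.
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
input_ids = input_ids.to(model.device)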
@@ -71,9 +74,9 @@ def generate(
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-
-
-
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
         num_beams=1,
         repetition_penalty=repetition_penalty,
     )
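These kwargs are the usual shape for streaming generation in a Gradio chat demo: generate() blocks, so it runs on a background thread and the generator yields partial text through a TextIteratorStreamer. A sketch of how the tail of generate() is typically wired up; the streamer construction and its arguments are assumptions, since only streamer=streamer is visible in the diff:

from threading import Thread
from transformers import TextIteratorStreamer

# Assumed streamer setup; skip_prompt drops the echoed input from the stream.
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    top_p=top_p,
    top_k=top_k,
    temperature=temperature,
    num_beams=1,
    repetition_penalty=repetition_penalty,
)
# Run generation off the main thread and stream partial output to the UI.
Thread(target=model.generate, kwargs=generate_kwargs).start()
outputs = []
for text in streamer:  # inside generate(), hence the yield below
    outputs.append(text)
    yield "".join(outputs)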
|