IST199655 committed
Commit 344f6f5 · 1 Parent(s): f7bf18e

Update app.py

Files changed (1)
  1. app.py +2 -55
app.py CHANGED
@@ -10,72 +10,19 @@ import torch
  from threading import Thread

  # Load model and tokenizer globally to avoid reloading for every request
+ base_model = "unsloth/Llama-3.2-3B-Instruct"
  model_path = "Heit39/llama_lora_model_1"

  # Load tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)

  # Load the base model (e.g., LLaMA)
- base_model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-3B-Instruct")
+ base_model = AutoModelForCausalLM.from_pretrained(base_model)

  # Load LoRA adapter
  from peft import PeftModel
  model = PeftModel.from_pretrained(base_model, model_path)

-
- # Define the response function
- # def respond(
- #     message: str,
- #     history: list[tuple[str, str]],
- #     system_message: str,
- #     max_tokens: int,
- #     temperature: float,
- #     top_p: float,
- # ):
- #     # Combine system message and history into a single prompt
- #     messages = [{"role": "system", "content": system_message}]
- #     for val in history:
- #         if val[0]:
- #             messages.append({"role": "user", "content": val[0]})
- #         if val[1]:
- #             messages.append({"role": "assistant", "content": val[1]})
- #     messages.append({"role": "user", "content": message})
-
- #     # Create a single text prompt from the messages
- #     prompt = ""
- #     for msg in messages:
- #         if msg["role"] == "system":
- #             prompt += f"[System]: {msg['content']}\n\n"
- #         elif msg["role"] == "user":
- #             prompt += f"[User]: {msg['content']}\n\n"
- #         elif msg["role"] == "assistant":
- #             prompt += f"[Assistant]: {msg['content']}\n\n"
-
- #     # Tokenize the prompt
- #     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
- #     input_ids = inputs.input_ids.to("cpu")  # Ensure input is on the CPU
-
- #     # Generate response
- #     output_ids = model.generate(
- #         input_ids,
- #         max_length=input_ids.shape[1] + max_tokens,
- #         temperature=temperature,
- #         top_p=top_p,
- #         do_sample=True,
- #     )
-
- #     # Decode the generated text
- #     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
- #     # Extract the assistant's response from the generated text
- #     assistant_response = generated_text[len(prompt):].strip()
-
- #     # Yield responses incrementally (simulate streaming)
- #     response = ""
- #     for token in assistant_response.split():  # Split tokens by whitespace
- #         response += token + " "
- #         yield response.strip()
-
  def respond(
      message: str,
      history: list[tuple[str, str]],
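
For context, a minimal sketch of the loading pattern this commit settles on (base-model id kept in a variable, LoRA adapter attached with PeftModel), followed by one generation call to show how the wrapped model would be used. The prompt text, dtype, and generation settings below are illustrative assumptions for the sketch, not part of app.py.

# Sketch: reproduce the model loading from the updated app.py and run one test generation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_id = "unsloth/Llama-3.2-3B-Instruct"   # base checkpoint used in app.py
adapter_id = "Heit39/llama_lora_model_1"          # LoRA adapter repo used in app.py

# Tokenizer comes from the adapter repo, as in app.py
tokenizer = AutoTokenizer.from_pretrained(adapter_id, use_fast=True, legacy=False)

# Load the base model, then wrap it with the LoRA adapter weights
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float32)
model = PeftModel.from_pretrained(base_model, adapter_id)
model.eval()

# One short generation to confirm the adapter-wrapped model works (prompt and settings are arbitrary)
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=True, top_p=0.9)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))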