Update app.py
Browse files
app.py
CHANGED
@@ -30,15 +30,15 @@ from utils import (
|
|
30 |
|
31 |
# Initialize the model and tokenizer.
|
32 |
api_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
|
33 |
-
|
34 |
-
model_name = "google/gemma-3-27b-it"
|
35 |
tokenizer = AutoTokenizer.from_pretrained(model_name, token=api_token)
|
36 |
-
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
37 |
-
|
38 |
-
model = Gemma3ForCausalLM.from_pretrained(model_name, token=api_token, quantization_config=quantization_config, torch_dtype="auto")
|
39 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
40 |
model = model.eval()
|
41 |
-
|
42 |
embedding_model = HuggingFaceBgeEmbeddings(
|
43 |
model_name="BAAI/bge-large-en-v1.5",
|
44 |
model_kwargs={"device": str(device)},
|
@@ -577,14 +577,14 @@ def chat_response_stream(message: str, history: list, state: dict, compression_d
|
|
577 |
streamer=streamer,
|
578 |
use_cache=True,
|
579 |
max_new_tokens=1024,
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
temperature=1.0,
|
585 |
-
top_k=64,
|
586 |
-
top_p=0.95,
|
587 |
-
min_p=0.0
|
588 |
)
|
589 |
t = Thread(target=model.generate, kwargs=generate_kwargs)
|
590 |
t.start()
|
|
|
30 |
|
31 |
# Initialize the model and tokenizer.
|
32 |
api_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
|
33 |
+
model_name = "meta-llama/Llama-3.1-8B-Instruct"
|
34 |
+
# model_name = "google/gemma-3-27b-it"
|
35 |
tokenizer = AutoTokenizer.from_pretrained(model_name, token=api_token)
|
36 |
+
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
37 |
+
model = AutoModelForCausalLM.from_pretrained(model_name, token=api_token, torch_dtype=torch.float16)
|
38 |
+
# model = Gemma3ForCausalLM.from_pretrained(model_name, token=api_token, quantization_config=quantization_config, torch_dtype="auto")
|
39 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
40 |
model = model.eval()
|
41 |
+
model.to(device)
|
42 |
embedding_model = HuggingFaceBgeEmbeddings(
|
43 |
model_name="BAAI/bge-large-en-v1.5",
|
44 |
model_kwargs={"device": str(device)},
|
|
|
577 |
streamer=streamer,
|
578 |
use_cache=True,
|
579 |
max_new_tokens=1024,
|
580 |
+
num_beams=1,
|
581 |
+
do_sample=False,
|
582 |
+
top_p=1.0,
|
583 |
+
top_k=None,
|
584 |
temperature=1.0,
|
585 |
+
# top_k=64,
|
586 |
+
# top_p=0.95,
|
587 |
+
# min_p=0.0
|
588 |
)
|
589 |
t = Thread(target=model.generate, kwargs=generate_kwargs)
|
590 |
t.start()
|