Update app.py
Browse files
app.py
CHANGED
@@ -17,7 +17,7 @@ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
|
17 |
from langchain_docling import DoclingLoader
|
18 |
from langchain_docling.loader import ExportType
|
19 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
20 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, TextIteratorStreamer, BitsAndBytesConfig
|
21 |
from transformers.models.llama.modeling_llama import rotate_half
|
22 |
import threading
|
23 |
import shutil
|
@@ -35,7 +35,7 @@ model_name = "google/gemma-3-27b-it"
|
|
35 |
tokenizer = AutoTokenizer.from_pretrained(model_name, token=api_token)
|
36 |
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
37 |
# model = AutoModelForCausalLM.from_pretrained(model_name, token=api_token, torch_dtype=torch.float16)
|
38 |
-
model =
|
39 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
40 |
model = model.eval()
|
41 |
# model.to(device)
|
|
|
17 |
from langchain_docling import DoclingLoader
|
18 |
from langchain_docling.loader import ExportType
|
19 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
20 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, TextIteratorStreamer, BitsAndBytesConfig, Gemma3ForCausalLM
|
21 |
from transformers.models.llama.modeling_llama import rotate_half
|
22 |
import threading
|
23 |
import shutil
|
|
|
35 |
tokenizer = AutoTokenizer.from_pretrained(model_name, token=api_token)
|
36 |
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
|
37 |
# model = AutoModelForCausalLM.from_pretrained(model_name, token=api_token, torch_dtype=torch.float16)
|
38 |
+
model = Gemma3ForCausalLM.from_pretrained(model_name, token=api_token, quantization_config=quantization_config, torch_dtype="auto")
|
39 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
40 |
model = model.eval()
|
41 |
# model.to(device)
|