quarterturn
committed on
Commit
·
6c65cfa
1
Parent(s):
e5c051b
updated main.py to support the use of a quant molmo
Browse files
README.md
CHANGED
@@ -31,18 +31,19 @@ A 12GB GPU should be fine.
|
|
31 |
3. install the dependencies
|
32 |
``` pip3 install -r requirements.txt ```
|
33 |
4. run the gradio version:
|
34 |
-
``` python3 main.py ```
|
|
|
|
|
35 |
1. create a zip file of images
|
36 |
2. upload it
|
37 |
3. process it
|
38 |
4. click the button to download the caption zip file, the link is at the top of the page
|
39 |
|
40 |
run the command-line version:
|
41 |
-
``` python3 caption.py ``` (use molmo at bf16
|
42 |
-
``` python3 caption.py -q ``` (use
|
43 |
1. make sure your images are in the "images" directory
|
44 |
2. captions will be placed in the "images" directory
|
45 |
|
46 |
Note:
|
47 |
-
- main.py (gradio version does not yet support quant model)
|
48 |
- If torch sees your first GPU supports flash attention and the others do not, it will assume all the cards do and it will throw an exception. A workaround is to limit which GPUs torch can see with CUDA_VISIBLE_DEVICES. For example, "CUDA_VISIBLE_DEVICES=1,2 python3 main.py (or caption.py)" hides GPU 0 (the card supporting flash attention) so that torch uses your other cards without it, while "CUDA_VISIBLE_DEVICES=0 python3 main.py (or caption.py)" excludes the non-flash-attention-supporting GPUs and uses only the first card.
|
|
|
31 |
3. install the dependencies
|
32 |
``` pip3 install -r requirements.txt ```
|
33 |
4. run the gradio version:
|
34 |
+
``` python3 main.py ``` (use original molmo model at bf16)
|
35 |
+
or
|
36 |
+
``` python3 main.py -q ``` (use 4bit quant molmo model)
|
37 |
1. create a zip file of images
|
38 |
2. upload it
|
39 |
3. process it
|
40 |
4. click the button to download the caption zip file, the link is at the top of the page
|
41 |
|
42 |
run the command-line version:
|
43 |
+
``` python3 caption.py ``` (use original molmo model at bf16)
|
44 |
+
``` python3 caption.py -q ``` (use 4bit quant molmo model)
|
45 |
1. make sure your images are in the "images" directory
|
46 |
2. captions will be placed in the "images" directory
|
47 |
|
48 |
Note:
|
|
|
49 |
- If torch sees your first GPU supports flash attention and the others do not, it will assume all the cards do and it will throw an exception. A workaround is to limit which GPUs torch can see with CUDA_VISIBLE_DEVICES. For example, "CUDA_VISIBLE_DEVICES=1,2 python3 main.py (or caption.py)" hides GPU 0 (the card supporting flash attention) so that torch uses your other cards without it, while "CUDA_VISIBLE_DEVICES=0 python3 main.py (or caption.py)" excludes the non-flash-attention-supporting GPUs and uses only the first card.
|
main.py
CHANGED
@@ -24,6 +24,11 @@ def cleanup_temp_files():
|
|
24 |
if os.path.isdir(dir_path):
|
25 |
shutil.rmtree(dir_path)
|
26 |
|
|
|
|
|
|
|
|
|
|
|
27 |
if torch.cuda.is_available():
|
28 |
device = torch.device("cuda")
|
29 |
print("GPU is available. Using CUDA.")
|
@@ -31,9 +36,8 @@ else:
|
|
31 |
device = torch.device("cpu")
|
32 |
print("GPU is not available. Using CPU.")
|
33 |
|
34 |
-
#
|
35 |
local_path = "./model/Molmo-7B-D-0924"
|
36 |
-
#print("Loading processor from local path...")
|
37 |
processor = AutoProcessor.from_pretrained(
|
38 |
local_path,
|
39 |
local_files_only=True,
|
@@ -41,23 +45,26 @@ processor = AutoProcessor.from_pretrained(
|
|
41 |
torch_dtype='auto',
|
42 |
device_map='auto'
|
43 |
)
|
44 |
-
#print("Processor loaded.")
|
45 |
-
|
46 |
-
print("Loading model from local path...")
|
47 |
-
model = AutoModelForCausalLM.from_pretrained(
|
48 |
-
local_path,
|
49 |
-
trust_remote_code=True,
|
50 |
-
torch_dtype='auto',
|
51 |
-
device_map='auto',
|
52 |
-
)
|
53 |
-
#print("Model loaded.")
|
54 |
-
|
55 |
-
generation_config = GenerationConfig(max_new_tokens=300, stop_strings="<|endoftext|>")
|
56 |
-
bits_and_bytes_config = BitsAndBytesConfig()
|
57 |
|
58 |
-
#
|
59 |
-
|
60 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
def unzip_images(zip_file):
|
63 |
# Create a unique directory for extracted images inside the "images" directory
|
|
|
24 |
if os.path.isdir(dir_path):
|
25 |
shutil.rmtree(dir_path)
|
26 |
|
27 |
+
# Parse command-line arguments
|
28 |
+
parser = argparse.ArgumentParser(description="Load and use a quantized model")
|
29 |
+
parser.add_argument("-q", "--use_quant", action="store_true", help="Use quantized model")
|
30 |
+
args = parser.parse_args()
|
31 |
+
|
32 |
if torch.cuda.is_available():
|
33 |
device = torch.device("cuda")
|
34 |
print("GPU is available. Using CUDA.")
|
|
|
36 |
device = torch.device("cpu")
|
37 |
print("GPU is not available. Using CPU.")
|
38 |
|
39 |
+
# Load the processor
|
40 |
local_path = "./model/Molmo-7B-D-0924"
|
|
|
41 |
processor = AutoProcessor.from_pretrained(
|
42 |
local_path,
|
43 |
local_files_only=True,
|
|
|
45 |
torch_dtype='auto',
|
46 |
device_map='auto'
|
47 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
+
# Load the model
|
50 |
+
if args.use_quant:
|
51 |
+
# Load the quantized model
|
52 |
+
quantized_local_path = "./model/molmo-7B-D-bnb-4bit"
|
53 |
+
model = AutoModelForCausalLM.from_pretrained(
|
54 |
+
quantized_local_path,
|
55 |
+
trust_remote_code=True,
|
56 |
+
torch_dtype='auto',
|
57 |
+
device_map='auto',
|
58 |
+
)
|
59 |
+
else:
|
60 |
+
# Load the non-quantized model
|
61 |
+
model = AutoModelForCausalLM.from_pretrained(
|
62 |
+
local_path,
|
63 |
+
trust_remote_code=True,
|
64 |
+
torch_dtype='auto',
|
65 |
+
device_map='auto',
|
66 |
+
)
|
67 |
+
model.to(dtype=torch.bfloat16)
|
68 |
|
69 |
def unzip_images(zip_file):
|
70 |
# Create a unique directory for extracted images inside the "images" directory
|