Upload 2 files

Browse files

Files changed (2) hide show

batch_processing_example.py +78 -0
single_image_example.py +55 -0

batch_processing_example.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import torch
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+from tqdm import tqdm
+from pathlib import Path
+from os.path import join as opj
+from os import listdir
+# Batch captioning example: for every image found in `s_dir`, ask the model for
+# a caption and write it to "<image stem><caption_suffix>" in the same directory.
+# --- Configuration ---
+model_name_or_path="Minthy/ToriiGate-v0.3" #hub id, or change to local path
+s_dir='./images_to_caption' #directory that is scanned for images; outputs are written here too
+caption_suffix='_caption.txt' #suffix for generated captions
+tags_suffix='_tags.txt' #suffix for file with booru tags
+use_tags=True #set to True for using with reference tags; images without a readable tags file are then skipped
+image_extensions=['.jpg','.png','.webp','.jpeg']
+DEVICE = "cuda:0" #change to your device
+# --- Model ---
+processor = AutoProcessor.from_pretrained(model_name_or_path)
+model = AutoModelForVision2Seq.from_pretrained(
+    model_name_or_path,
+    torch_dtype=torch.bfloat16,
+    # _attn_implementation="flash_attention_2", #if installed
+).to(DEVICE)
+# --- Inputs ---
+# Extensions are compared case-insensitively so that e.g. "photo.JPG" is picked up as well.
+_extensions = tuple(ext.lower() for ext in image_extensions)
+filelist = [fn for fn in listdir(s_dir) if fn.lower().endswith(_extensions)]
+# --- Prompt (identical for every image, so it is built once) ---
+###Trained options
+# NOTE(review): the spelling inside the prompt strings is kept verbatim; they are
+# listed as trained options, so confirm against the model card before "fixing" them.
+base_prompt="Describe the picture in structuted json-like format."
+#base_prompt="Give a long and detailed description of the picture."
+#base_prompt="Describe the picture briefly."
+###Any other questions or instructions
+#base_prompt="What color is the ribbon in the character's hair?"
+#...
+system_message = {
+    #Important!
+    "role": "system",
+    "content": [
+        {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask."}
+    ]
+}
+# Text that separates the echoed prompt from the model's reply in the decoded output.
+answer_marker = 'Assistant: '
+# --- Captioning loop ---
+for fn in tqdm(filelist,desc='Captioninig'):
+    user_prompt = base_prompt
+    #Optional, add booru tags
+    if use_tags:
+        # Read the tags before decoding the image so a skipped file costs nothing.
+        try:
+            # Context manager closes the handle; UTF-8 matches the encoding used for the caption file.
+            with open(opj(s_dir,Path(fn).stem+tags_suffix),encoding='utf-8') as tagf:
+                tags = tagf.read().strip()
+        except KeyboardInterrupt:
+            print('Interrupted!')
+            raise SystemExit
+        except (OSError, UnicodeError) as err:
+            # Missing or unreadable tags file: report it and skip this image.
+            print(err)
+            continue
+        user_prompt+=' Also here are booru tags for better understanding of the picture, you can use them as reference.'
+        user_prompt+=f' <tags>\n{tags}\n</tags>'
+    image = load_image(opj(s_dir,fn))
+    messages = [
+        system_message,
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": user_prompt}
+            ]
+        }
+    ]
+    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=[image], return_tensors="pt")
+    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+    # Generate
+    generated_ids = model.generate(**inputs, max_new_tokens=500)
+    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    # Everything after the first marker is the caption. Splitting once keeps a caption
+    # that itself contains the marker intact, and a missing marker no longer raises
+    # IndexError and aborts the whole batch.
+    parts = generated_texts[0].split(answer_marker, 1)
+    if len(parts) == 2:
+        caption = parts[1]
+    else:
+        print(f'No "{answer_marker}" marker in the output for {fn}; saving the full decoded text.')
+        caption = parts[0]
+    with open(opj(s_dir,Path(fn).stem+caption_suffix),'w',encoding='utf-8',errors='ignore') as outf:
+        outf.write(caption)

single_image_example.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import torch
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers.image_utils import load_image
+# Single image example: load one picture, ask the model to describe it and print the answer.
+model_name_or_path="Minthy/ToriiGate-v0.3" #hub id, or change to local path
+DEVICE = "cuda:0" #change to your device
+processor = AutoProcessor.from_pretrained(model_name_or_path)
+model = AutoModelForVision2Seq.from_pretrained(
+    model_name_or_path,
+    torch_dtype=torch.bfloat16,
+    # _attn_implementation="flash_attention_2", #if installed
+).to(DEVICE)
+image = load_image('./image.jpg') #path to your picture
+###Trained options
+# NOTE(review): the spelling inside the prompt strings is kept verbatim; they are
+# listed as trained options, so confirm against the model card before "fixing" them.
+user_prompt="Describe the picture in structuted json-like format."
+#user_prompt="Give a long and detailed description of the picture."
+#user_prompt="Describe the picture briefly."
+###Any other questions or instructions
+#user_prompt="What color is the ribbon in the character's hair?"
+#...
+#Optional, add booru tags
+#tags='1girl, standing, looking at viewer, ...'
+#user_prompt+=' Also here are booru tags for better understanding of the picture, you can use them as reference.'
+#user_prompt+=f' <tags>\n{tags}\n</tags>'
+messages = [
+    {
+    #Important!
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his hask."}
+        ]
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": user_prompt}
+        ]
+    }
+]
+prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+inputs = processor(text=prompt, images=[image], return_tensors="pt")
+inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+# Generate
+generated_ids = model.generate(**inputs, max_new_tokens=500)
+generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+# Everything after the first 'Assistant: ' marker is the caption. Splitting once keeps
+# a caption that itself contains the marker intact; if the marker is absent the full
+# decoded text is printed instead of failing with IndexError.
+answer_marker = 'Assistant: '
+parts = generated_texts[0].split(answer_marker, 1)
+caption = parts[1] if len(parts) == 2 else parts[0]
+print(caption)