Fiqa committed
Commit c6c4d1c · verified · Parent: 2ac23f6

Update app.py

Files changed (1): app.py (+38, -25)
app.py CHANGED

@@ -1,6 +1,7 @@
 import os
 from huggingface_hub import login
 from transformers import BlipProcessor, BlipForConditionalGeneration
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 from PIL import Image
 
 import gradio as gr
@@ -31,11 +32,21 @@ login(token=hf_token)
 
 
 
+
+model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+model = MllamaForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained(model_id)
+
 # Load the processor and model
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
-processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
-model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
+# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+# processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
+# model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
 # pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
 from diffusers import FluxPipeline
 
@@ -58,7 +69,7 @@ def generate_caption_and_image(image, f, p, d):
     img = image.convert("RGB")
     # reader = easyocr.Reader(['en'])
     # # result = reader.readtext(img)
-    import random
+    # import random
 
 
 
@@ -67,33 +78,27 @@ def generate_caption_and_image(image, f, p, d):
 
 
 
-    text = "a picture of "
-    inputs = processor(img, text, return_tensors="pt").to(device)
+    # text = "a picture of "
+    # inputs = processor(img, text, return_tensors="pt").to(device)
 
-    out = model2.generate(**inputs, num_beams = 3)
+    # out = model2.generate(**inputs, num_beams = 3)
 
 
 
-    caption2 = processor.decode(out[0], skip_special_tokens=True)
+    # caption2 = processor.decode(out[0], skip_special_tokens=True)
 
     # Generate caption
-    inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
-    inputs = {key: val.to(device) for key, val in inputs.items()}
-    out = model.generate(**inputs)
-    caption1 = processor.decode(out[0], skip_special_tokens=True)
+    # inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
+    # inputs = {key: val.to(device) for key, val in inputs.items()}
+    # out = model.generate(**inputs)
+    # caption1 = processor.decode(out[0], skip_special_tokens=True)
 
-    prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
-
-    image = pipe(
-        prompt,
-        height=1024,
-        width=1024,
-        guidance_scale=3.5,
-        num_inference_steps=50,
-        max_sequence_length=512,
-        generator=torch.Generator("cpu").manual_seed(0)
-    ).images[0]
-    return image
+    # prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
+
+
+
+
+
 
 
 
@@ -103,6 +108,14 @@ def generate_caption_and_image(image, f, p, d):
     # generated_image1 =pipe(prompt).images[0]
 
     # return generated_image, generated_image1
+    messages = [{"role": "user", "content": [{"type": "image"},{"type": "text", "text": "If I had to write a haiku for this one, it would be: "}]}]
+    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(image,input_text,add_special_tokens=False,return_tensors="pt").to(model.device)
+
+    output = model.generate(**inputs, max_new_tokens=30)
+    caption =processor.decode(output[0])
+    image = pipe(prompt,height=1024,width=1024,guidance_scale=3.5,num_inference_steps=50,max_sequence_length=512,generator=torch.Generator("cpu").manual_seed(0)).images[0]
+    return image
     return None
 # Gradio UI
 iface = gr.Interface(
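
Note on the added captioning lines: processor.decode(output[0]) decodes the whole generated sequence, chat-template prompt and special tokens included, so the resulting caption will begin with <|begin_of_text|> and the user message rather than with the model's reply. A common fix (a sketch, not part of this commit) is to slice off the prompt tokens and skip special tokens when decoding:

    # Decode only the tokens generated after the prompt, dropping special
    # tokens, so `caption` holds just the model's reply.
    prompt_len = inputs["input_ids"].shape[-1]
    caption = processor.decode(output[0][prompt_len:], skip_special_tokens=True)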
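
A larger loose end: in the new code path, `prompt` is only assigned in the line this commit comments out, so the pipe(prompt, ...) call will raise a NameError at runtime, and the freshly decoded `caption` is never used; the trailing `return None` is also unreachable. Below is a minimal sketch of how the function body could be wired up. It assumes the module-level `model`, `processor`, and `pipe` objects from app.py, reuses the commented-out prompt template with the single Llama caption standing in for the two BLIP captions, and swaps the haiku prompt for a plain description; all three are assumptions about the intended behavior, not the committed code.

    import torch

    def generate_caption_and_image(image, f, p, d):
        # Caption the garment with Llama 3.2 Vision; a plain description
        # (assumed here) is more useful than a haiku when the text feeds an
        # image-generation prompt.
        img = image.convert("RGB")
        messages = [{
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this clothing item in one sentence."},
            ],
        }]
        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(img, input_text, add_special_tokens=False,
                           return_tensors="pt").to(model.device)
        output = model.generate(**inputs, max_new_tokens=60)
        prompt_len = inputs["input_ids"].shape[-1]
        caption = processor.decode(output[0][prompt_len:], skip_special_tokens=True)

        # Reuse the commented-out template, with the single caption replacing
        # {caption1} and {caption2}.
        prompt = (
            f"Design a high-quality, stylish clothing item that seamlessly blends "
            f"the essence of {caption}. The design should prominently feature {f}{d} "
            f"and incorporate {p}."
        )

        # FLUX call as committed, except that `prompt` is now actually defined.
        # The fixed CPU seed makes output deterministic for a given prompt.
        generated = pipe(
            prompt,
            height=1024,
            width=1024,
            guidance_scale=3.5,
            num_inference_steps=50,
            max_sequence_length=512,
            generator=torch.Generator("cpu").manual_seed(0),
        ).images[0]
        return generated

With this wiring the Gradio handler returns the FLUX image directly, and the unreachable return None can be dropped.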