Fiqa committed (verified)
Commit 3aefc04 · Parent(s): 19bc041

Update app.py

Files changed (1): app.py +19 -39
app.py CHANGED
@@ -34,21 +34,12 @@ login(token =HUGGINGFACE_TOKEN)
 
 
 
- model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-
- model = MllamaForConditionalGeneration.from_pretrained(
-     model_id,
-     torch_dtype=torch.bfloat16,
-     device_map="auto",
- )
- processor = AutoProcessor.from_pretrained(model_id)
-
- # Load the processor and model
- # processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- # model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
- # processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
- # model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
- # pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
+ Load the processor and model
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+ processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
+ model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
+ pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
  from diffusers import FluxPipeline
 
  pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
@@ -61,6 +52,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
  model.to(device)
  pipe.to(device)
+ model2.to(device)
 
 
 
@@ -79,22 +71,22 @@ def generate_caption_and_image(image, f, p, d):
 
 
 
- # text = "a picture of "
- # inputs = processor(img, text, return_tensors="pt").to(device)
+ text = "a picture of "
+ inputs = processor(img, text, return_tensors="pt").to(device)
 
- # out = model2.generate(**inputs, num_beams = 3)
+ out = model2.generate(**inputs, num_beams = 3)
 
 
 
- # caption2 = processor.decode(out[0], skip_special_tokens=True)
+ caption2 = processor.decode(out[0], skip_special_tokens=True)
 
- # Generate caption
- # inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
- # inputs = {key: val.to(device) for key, val in inputs.items()}
- # out = model.generate(**inputs)
- # caption1 = processor.decode(out[0], skip_special_tokens=True)
+ Generate caption
+ inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
+ inputs = {key: val.to(device) for key, val in inputs.items()}
+ out = model.generate(**inputs)
+ caption1 = processor.decode(out[0], skip_special_tokens=True)
 
- # prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
+ prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
 
 
 
@@ -102,20 +94,8 @@ def generate_caption_and_image(image, f, p, d):
 
 
 
-
-
- # # Generate image based on the caption
- # generated_image = pipe(prompt).images[0]
- # generated_image1 =pipe(prompt).images[0]
-
- # return generated_image, generated_image1
- messages = [{"role": "user", "content": [{"type": "image"},{"type": "text", "text": "If I had to write a haiku for this one, it would be: "}]}]
- input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
- inputs = processor(img,input_text,add_special_tokens=False,return_tensors="pt").to(device)
-
- output = model.generate(**inputs, max_new_tokens=30)
- caption =processor.decode(output[0])
- image = pipe(caption,height=1024,width=1024,guidance_scale=3.5,num_inference_steps=50,max_sequence_length=512,generator=torch.Generator("cpu").manual_seed(0)).images[0]
+
+ image = pipe(prompt,height=1024,width=1024,guidance_scale=3.5,num_inference_steps=50,max_sequence_length=512,generator=torch.Generator("cpu").manual_seed(0)).images[0]
  return image
  return None
  # Gradio UI
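
For reference, below is a minimal, self-contained sketch of the flow app.py switches to with this commit: BLIP captioning (Salesforce/blip-image-captioning-large plus the noamrot/FuseCap variant) feeding a FLUX.1-dev text-to-image generation. It condenses the added lines above, so several details are assumptions rather than the file's exact code: the None guard and the img = image.convert("RGB") step stand in for unchanged parts of app.py, the long design prompt is abbreviated, the stabilityai/stable-diffusion-3.5-medium pipeline (immediately overwritten by the FLUX pipeline in the diff) is omitted, and the two bare added lines "Load the processor and model" and "Generate caption" are restored as comments, since as committed they would raise a SyntaxError.

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import FluxPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and model (BLIP captioner plus the FuseCap variant).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")

# FLUX.1-dev text-to-image pipeline (a gated repo; needs an authorized HF token).
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

model.to(device)
model2.to(device)
pipe.to(device)


def generate_caption_and_image(image, f, p, d):
    if image is None:  # assumption: mirrors the trailing "return None" path in app.py
        return None
    img = image.convert("RGB")  # assumption: the unchanged part of app.py prepares `img` like this

    # FuseCap caption, conditioned on the prefix "a picture of ".
    text = "a picture of "
    inputs = processor(img, text, return_tensors="pt").to(device)
    out = model2.generate(**inputs, num_beams=3)
    caption2 = processor.decode(out[0], skip_special_tokens=True)

    # Unconditional BLIP caption.
    inputs = processor(img, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}
    out = model.generate(**inputs)
    caption1 = processor.decode(out[0], skip_special_tokens=True)

    # Abbreviated version of the design prompt built from both captions and the f/p/d inputs.
    prompt = (
        f"Design a high-quality, stylish clothing item that seamlessly blends the essence of "
        f"{caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}."
    )

    # Generate the final image with FLUX, seeded for reproducibility as in the commit.
    return pipe(
        prompt,
        height=1024,
        width=1024,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]

With this change the FLUX prompt is driven by the BLIP-derived design prompt again, rather than by the Llama-3.2-Vision haiku caption that the removed code generated, and the fixed CPU generator seed means the same prompt reproduces the same image.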