Spaces:

Fiqa/StyleSync

Runtime error

App Files Community

Fiqa committed on Jan 5

Commit

c6c4d1c

verified ·

1 Parent(s): 2ac23f6

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -25

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 from huggingface_hub import login
 from transformers import BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
 import gradio as gr
@@ -31,11 +32,21 @@ login(token=hf_token)
 # Load the processor and model
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
-processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
-model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
 # pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
 from diffusers import FluxPipeline
@@ -58,7 +69,7 @@ def generate_caption_and_image(image, f, p, d):
         img = image.convert("RGB")
         # reader = easyocr.Reader(['en'])
         # # result = reader.readtext(img)
-        import random
@@ -67,33 +78,27 @@ def generate_caption_and_image(image, f, p, d):
-        text = "a picture of "
-        inputs = processor(img, text, return_tensors="pt").to(device)
-        out = model2.generate(**inputs, num_beams = 3)
-        caption2 = processor.decode(out[0], skip_special_tokens=True)
         # Generate caption
-        inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
-        inputs = {key: val.to(device) for key, val in inputs.items()}
-        out = model.generate(**inputs)
-        caption1 = processor.decode(out[0], skip_special_tokens=True)
-        prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
-        image = pipe(
-        prompt,
-        height=1024,
-        width=1024,
-        guidance_scale=3.5,
-        num_inference_steps=50,
-        max_sequence_length=512,
-        generator=torch.Generator("cpu").manual_seed(0)
-        ).images[0]
-        return image
@@ -103,6 +108,14 @@ def generate_caption_and_image(image, f, p, d):
         # generated_image1 =pipe(prompt).images[0]
         # return generated_image, generated_image1
     return None
 # Gradio UI
 iface = gr.Interface(

 import os
 from huggingface_hub import login
 from transformers import BlipProcessor, BlipForConditionalGeneration
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 from PIL import Image
 import gradio as gr
+model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+model = MllamaForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+processor = AutoProcessor.from_pretrained(model_id)
 # Load the processor and model
+# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+# processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
+# model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
 # pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
 from diffusers import FluxPipeline
         img = image.convert("RGB")
         # reader = easyocr.Reader(['en'])
         # # result = reader.readtext(img)
+        # import random
+        # text = "a picture of "
+        # inputs = processor(img, text, return_tensors="pt").to(device)
+        # out = model2.generate(**inputs, num_beams = 3)
+        # caption2 = processor.decode(out[0], skip_special_tokens=True)
         # Generate caption
+        # inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
+        # inputs = {key: val.to(device) for key, val in inputs.items()}
+        # out = model.generate(**inputs)
+        # caption1 = processor.decode(out[0], skip_special_tokens=True)
+        # prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
         # generated_image1 =pipe(prompt).images[0]
         # return generated_image, generated_image1
+        messages = [{"role": "user", "content": [{"type": "image"},{"type": "text", "text": "If I had to write a haiku for this one, it would be: "}]}]
+        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(image,input_text,add_special_tokens=False,return_tensors="pt").to(model.device)
+        output = model.generate(**inputs, max_new_tokens=30)
+        caption =processor.decode(output[0])
+        image = pipe(prompt,height=1024,width=1024,guidance_scale=3.5,num_inference_steps=50,max_sequence_length=512,generator=torch.Generator("cpu").manual_seed(0)).images[0]
+        return image
     return None
 # Gradio UI
 iface = gr.Interface(