Fiqa committed (verified)
Commit 3aefc04 · Parent(s): 19bc041

Update app.py

Files changed (1): app.py +19 -39
app.py CHANGED
@@ -34,21 +34,12 @@ login(token =HUGGINGFACE_TOKEN)
 
 
 
- model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-
- model = MllamaForConditionalGeneration.from_pretrained(
-     model_id,
-     torch_dtype=torch.bfloat16,
-     device_map="auto",
- )
- processor = AutoProcessor.from_pretrained(model_id)
-
- # Load the processor and model
- # processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
- # model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
- # processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
- # model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
- # pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
+ Load the processor and model
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+ processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
+ model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
+ pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
  from diffusers import FluxPipeline
 
  pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
@@ -61,6 +52,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
  model.to(device)
  pipe.to(device)
+ model2.to(device)
 
 
 
@@ -79,22 +71,22 @@ def generate_caption_and_image(image, f, p, d):
 
 
 
- # text = "a picture of "
- # inputs = processor(img, text, return_tensors="pt").to(device)
+ text = "a picture of "
+ inputs = processor(img, text, return_tensors="pt").to(device)
 
- # out = model2.generate(**inputs, num_beams = 3)
+ out = model2.generate(**inputs, num_beams = 3)
 
 
 
- # caption2 = processor.decode(out[0], skip_special_tokens=True)
+ caption2 = processor.decode(out[0], skip_special_tokens=True)
 
- # Generate caption
- # inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
- # inputs = {key: val.to(device) for key, val in inputs.items()}
- # out = model.generate(**inputs)
- # caption1 = processor.decode(out[0], skip_special_tokens=True)
+ Generate caption
+ inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
+ inputs = {key: val.to(device) for key, val in inputs.items()}
+ out = model.generate(**inputs)
+ caption1 = processor.decode(out[0], skip_special_tokens=True)
 
- # prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
+ prompt = f"Design a high-quality, stylish clothing item that seamlessly blends the essence of {caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}. The final piece should exude sophistication and creativity, suitable for modern trends while retaining an element of timeless appeal. Ensure the textures and patterns complement each other harmoniously, creating a visually striking yet wearable garment."
 
 
 
@@ -102,20 +94,8 @@ def generate_caption_and_image(image, f, p, d):
 
 
 
-
-
- # # Generate image based on the caption
- # generated_image = pipe(prompt).images[0]
- # generated_image1 =pipe(prompt).images[0]
-
- # return generated_image, generated_image1
- messages = [{"role": "user", "content": [{"type": "image"},{"type": "text", "text": "If I had to write a haiku for this one, it would be: "}]}]
- input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
- inputs = processor(img,input_text,add_special_tokens=False,return_tensors="pt").to(device)
-
- output = model.generate(**inputs, max_new_tokens=30)
- caption =processor.decode(output[0])
- image = pipe(caption,height=1024,width=1024,guidance_scale=3.5,num_inference_steps=50,max_sequence_length=512,generator=torch.Generator("cpu").manual_seed(0)).images[0]
+
+ image = pipe(prompt,height=1024,width=1024,guidance_scale=3.5,num_inference_steps=50,max_sequence_length=512,generator=torch.Generator("cpu").manual_seed(0)).images[0]
  return image
  return None
  # Gradio UI
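
For reference, below is a minimal, self-contained sketch of the flow app.py switches to with this commit: BLIP captioning (Salesforce/blip-image-captioning-large plus the noamrot/FuseCap variant) feeding a FLUX.1-dev text-to-image generation. It condenses the added lines above, so several details are assumptions rather than the file's exact code: the None guard and the img = image.convert("RGB") step stand in for unchanged parts of app.py, the long design prompt is abbreviated, the stabilityai/stable-diffusion-3.5-medium pipeline (immediately overwritten by the FLUX pipeline in the diff) is omitted, and the two bare added lines "Load the processor and model" and "Generate caption" are restored as comments, since as committed they would raise a SyntaxError.

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import FluxPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and model (BLIP captioner plus the FuseCap variant).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")

# FLUX.1-dev text-to-image pipeline (a gated repo; needs an authorized HF token).
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

model.to(device)
model2.to(device)
pipe.to(device)


def generate_caption_and_image(image, f, p, d):
    if image is None:  # assumption: mirrors the trailing "return None" path in app.py
        return None
    img = image.convert("RGB")  # assumption: the unchanged part of app.py prepares `img` like this

    # FuseCap caption, conditioned on the prefix "a picture of ".
    text = "a picture of "
    inputs = processor(img, text, return_tensors="pt").to(device)
    out = model2.generate(**inputs, num_beams=3)
    caption2 = processor.decode(out[0], skip_special_tokens=True)

    # Unconditional BLIP caption.
    inputs = processor(img, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}
    out = model.generate(**inputs)
    caption1 = processor.decode(out[0], skip_special_tokens=True)

    # Abbreviated version of the design prompt built from both captions and the f/p/d inputs.
    prompt = (
        f"Design a high-quality, stylish clothing item that seamlessly blends the essence of "
        f"{caption1} and {caption2}. The design should prominently feature {f}{d} and incorporate {p}."
    )

    # Generate the final image with FLUX, seeded for reproducibility as in the commit.
    return pipe(
        prompt,
        height=1024,
        width=1024,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]

With this change the FLUX prompt is driven by the BLIP-derived design prompt again, rather than by the Llama-3.2-Vision haiku caption that the removed code generated, and the fixed CPU generator seed means the same prompt reproduces the same image.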