import os

from dotenv import load_dotenv
from PIL import Image

import gradio as gr
import torch
import spaces  # Hugging Face Spaces module (provides the @spaces.GPU decorator)

from huggingface_hub import login
from transformers import MllamaForConditionalGeneration, AutoProcessor
from diffusers import FluxPipeline

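# A minimal sketch of the Python dependencies this script assumes are installed
# (package names inferred from the imports above; pin versions in requirements.txt as needed):
#   torch, transformers, diffusers, accelerate, gradio, spaces, huggingface_hub,
#   python-dotenv, pillow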


fabrics = ['cotton', 'silk', 'denim', 'linen', 'polyester', 'wool', 'velvet']
patterns = ['striped', 'floral', 'geometric', 'abstract', 'solid', 'polka dots']
textile_designs = ['woven texture', 'embroidery', 'printed fabric', 'hand-dyed', 'quilting']




# Load environment variables and authenticate with the Hugging Face Hub.
# The token must come from the HUGGING_FACE_TOKEN environment variable (or a .env file);
# never hard-code access tokens in source.
load_dotenv()
hf_token = os.getenv("HUGGING_FACE_TOKEN")
if not hf_token:
    raise ValueError("Hugging Face token is not set in the environment variables.")
login(token=hf_token)

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
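# Note: meta-llama/Llama-3.2-11B-Vision-Instruct is a gated repository; the authenticated
# account must already have been granted access to it on the Hugging Face Hub.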

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Alternative captioning/diffusion backends, kept for reference:
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
# processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
# model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")
# pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")

# Text-to-image pipeline used to render the final design
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)




device = "cuda" if torch.cuda.is_available() else "cpu"

# The Llama vision model is already placed by device_map="auto"; only the
# diffusion pipeline needs to be moved to the GPU explicitly.
pipe.to(device)

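# A hedged alternative for smaller GPUs (assumes accelerate is installed): instead of
# pipe.to(device), diffusers' CPU offloading reduces peak VRAM use at the cost of
# slower generation.
# pipe.enable_model_cpu_offload()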


@spaces.GPU(duration=150)
def generate_caption_and_image(image, f, p, d):
    if image is None or f is None or p is None or d is None:
        return None

    img = image.convert("RGB")

    # Ask the vision-language model for a caption of the uploaded reference image.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Write a short, detailed caption describing this garment image."},
            ],
        }
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(img, input_text, add_special_tokens=False, return_tensors="pt").to(device)

    output = model.generate(**inputs, max_new_tokens=60)
    # Decode only the newly generated tokens so the prompt text is not echoed into the caption.
    caption = processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    # Combine the image description with the selected fabric, textile design and pattern
    # into the text-to-image prompt.
    prompt = (
        f"Design a high-quality, stylish clothing item inspired by the following description: {caption}. "
        f"The design should prominently feature {f} fabric with {d} and incorporate a {p} pattern. "
        "The final piece should exude sophistication and creativity, suitable for modern trends "
        "while retaining an element of timeless appeal."
    )

    generated_image = pipe(
        prompt,
        height=1024,
        width=1024,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]
    return generated_image
# Gradio UI
iface = gr.Interface(
    fn=generate_caption_and_image,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Radio(fabrics, label="Select Fabric"),
        gr.Radio(patterns, label="Select Pattern"),
        gr.Radio(textile_designs, label="Select Textile Design"),
    ],
    outputs=[gr.Image(label="Generated Design")],
    live=True,
)
iface.launch(share=True)
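# To run locally (a sketch, assuming this file is saved as app.py and a valid token is available):
#   export HUGGING_FACE_TOKEN=<your token>
#   python app.py
# On Hugging Face Spaces, set HUGGING_FACE_TOKEN as a repository secret instead.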