File size: 3,161 Bytes
f667084
 
a04adbd
4d54b56
0372f7c
fa3a39e
 
 
 
a04adbd
264752e
 
 
 
 
0372f7c
 
3891dec
a04adbd
f667084
a04adbd
f667084
 
 
 
 
a04adbd
 
 
264752e
 
 
3891dec
fa3a39e
f418994
 
264752e
f418994
9023169
 
a04adbd
c83e28c
9023169
4d54b56
5ecb8ce
 
0276b97
0372f7c
0276b97
 
 
 
 
 
 
 
 
 
264752e
 
 
 
 
 
0372f7c
264752e
9023169
 
f66faee
9023169
 
01a8184
0372f7c
663d1b9
4878b0f
4d54b56
9023169
 
4d54b56
9023169
45289ed
9023169
 
 
 
 
 
 
 
a81e611
f667084
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
from huggingface_hub import login
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

import gradio as gr
from diffusers import DiffusionPipeline
import torch
import spaces  # Hugging Face Spaces module

import requests









# Authenticate with the Hugging Face Hub using the token found in the
# HF_AUTH_TOKEN environment variable; abort start-up when it is missing/empty.
hf_token = os.environ.get('HF_AUTH_TOKEN')
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("Hugging Face token is not set in the environment variables.")

# Pick the compute device once: CUDA when torch reports it, otherwise CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# BLIP-large captioning checkpoint (processor + model).
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
)

# FuseCap captioning checkpoint (processor + model).
processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")

# Text-to-image pipeline used to render the final design.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium"
)

# Move the pipeline and both captioning models onto the selected device.
for _component in (pipe, model2, model):
    _component.to(device)



@spaces.GPU(duration=150)
def generate_caption_and_image(image):
    """Caption an image with two models and render a clothing design from both.

    The image is captioned once by the FuseCap checkpoint
    (``processor1`` / ``model2``, prompted with ``"a picture of "``) and once
    by the BLIP-large checkpoint (``processor`` / ``model``, unconditional).
    Both captions, together with a randomly chosen fabric, pattern and
    textile design, are formatted into a text prompt that is passed to the
    diffusion pipeline ``pipe``.

    Args:
        image: PIL image delivered by the Gradio input component, or ``None``
            when no image is present.

    Returns:
        A ``(prompt, generated_image)`` tuple: the prompt text sent to the
        diffusion pipeline and the first image it produced. ``("", None)``
        is returned when ``image`` is ``None``.
    """
    import random

    # Gradio hands the callback ``None`` when there is no image (for example
    # after the input is cleared while the interface runs live); return empty
    # outputs instead of failing on ``None.convert``.
    if image is None:
        return "", None

    img = image.convert("RGB")

    # Candidate style attributes; one of each is picked at random per call.
    fabrics = ['cotton', 'silk', 'denim', 'linen', 'polyester', 'wool', 'velvet']
    patterns = ['striped', 'floral', 'geometric', 'abstract', 'solid', 'polka dots']
    textile_designs = ['woven texture', 'embroidery', 'printed fabric', 'hand-dyed', 'quilting']

    selected_fabric = random.choice(fabrics)
    selected_pattern = random.choice(patterns)
    selected_textile_design = random.choice(textile_designs)

    # Caption 2: FuseCap, conditioned on a text prefix and decoded with beam
    # search. Previously this call went through the BLIP-large pair, leaving
    # the loaded FuseCap model unused and both captions from the same model.
    text = "a picture of "
    inputs = processor1(img, text, return_tensors="pt").to(device)
    out = model2.generate(**inputs, num_beams=3)
    caption2 = processor1.decode(out[0], skip_special_tokens=True)

    # Caption 1: BLIP-large, unconditional. Uses the RGB-converted image so
    # both captioners see the same input.
    inputs = processor(img, return_tensors="pt", padding=True, truncation=True, max_length=250)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    out = model.generate(**inputs)
    caption1 = processor.decode(out[0], skip_special_tokens=True)

    prompt = f'''Create a highly realistic clothing item based on the following descriptions: The design should reflect {caption1} and {caption2}, blending both themes into a single, stylish, and modern piece of clothing. Incorporate highly realistic and high-quality textures that exude sophistication, with realistic fabric lighting and fine details. Subtly hint at {selected_fabric}, featuring a {selected_pattern} motif and a {selected_textile_design} style that harmoniously balances the essence of both captions.'''

    # Render the design from the assembled prompt; keep the first image.
    generated_image = pipe(prompt).images[0]

    return prompt, generated_image

# Gradio UI: one uploaded image in; the prompt text and the rendered design out.
image_input = gr.Image(type="pil", label="Upload Image")
caption_output = gr.Textbox(label="Generated Caption")
design_output = gr.Image(label="Generated Design")

iface = gr.Interface(
    fn=generate_caption_and_image,
    inputs=image_input,
    outputs=[caption_output, design_output],
    live=True,
)

# Start the app, requesting a share link.
iface.launch(share=True)