File size: 4,616 Bytes
f667084 a04adbd 4d54b56 0372f7c fa3a39e a04adbd 264752e 50b4bab 264752e 0372f7c 3891dec a04adbd f667084 a04adbd f667084 50b4bab a04adbd 264752e 50b4bab 3891dec fa3a39e f418994 264752e f418994 9023169 a04adbd c83e28c 9023169 4d54b56 5ecb8ce 0276b97 50b4bab 0c3147e 0276b97 264752e 0372f7c 264752e 9023169 f66faee 9023169 01a8184 0372f7c 0c3147e 4878b0f 4d54b56 9023169 4d54b56 9023169 45289ed 9023169 a81e611 f667084 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import os
from huggingface_hub import login
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import gradio as gr
from diffusers import DiffusionPipeline
import torch
import spaces # Hugging Face Spaces module
import requests
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
# Get Hugging Face Token from environment variable
hf_token = os.getenv('HF_AUTH_TOKEN')
if not hf_token:
    raise ValueError("Hugging Face token is not set in the environment variables.")
login(token=hf_token)

device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP captioner: produces the unconditional caption (caption1).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# FuseCap captioner: produces the prefix-conditioned caption (caption2).
processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap")

# Text-to-image pipeline used to render the final design.
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")

# Qwen2-VL OCR model: reads text out of the uploaded image.
# NOTE: this must NOT be bound to `model` as well, otherwise the BLIP
# captioner loaded above is silently replaced by the OCR model.
model3 = Qwen2VLForConditionalGeneration.from_pretrained(
    "prithivMLmods/Qwen2-VL-OCR-2B-Instruct", torch_dtype="auto", device_map="auto"
)
processor2 = AutoProcessor.from_pretrained("prithivMLmods/Qwen2-VL-OCR-2B-Instruct")

# model3 is placed by device_map="auto", so only the remaining models are moved.
pipe.to(device)
model2.to(device)
model.to(device)

# Instruction sent to the OCR model together with the uploaded image.
_OCR_INSTRUCTION = "Extract the text written in this image. Reply with the text only."
# Conditional-generation prefix for the FuseCap captioner.
_FUSECAP_PREFIX = "a picture of "

# Style vocabularies; one entry of each is picked at random per request.
_FABRICS = ('cotton', 'silk', 'denim', 'linen', 'polyester', 'wool', 'velvet')
_PATTERNS = ('striped', 'floral', 'geometric', 'abstract', 'solid', 'polka dots')
_TEXTILE_DESIGNS = ('woven texture', 'embroidery', 'printed fabric', 'hand-dyed', 'quilting')


def _extract_text(img):
    """Return the text the Qwen2-VL OCR model reads from *img* ('' if none)."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": _OCR_INSTRUCTION},
            ],
        }
    ]
    chat_text = processor2.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor2(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow wherever device_map="auto" placed the model instead of assuming CUDA.
    inputs = inputs.to(model3.device)
    generated_ids = model3.generate(**inputs, max_new_tokens=128)
    # generate() returns prompt + completion; keep only the newly generated tokens.
    completion_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor2.batch_decode(
        completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0].strip() if decoded else ""


def _fusecap_caption(img):
    """Return the FuseCap caption for *img*, conditioned on the fixed prefix."""
    inputs = processor1(img, _FUSECAP_PREFIX, return_tensors="pt").to(device)
    out = model2.generate(**inputs, num_beams=3)
    return processor1.decode(out[0], skip_special_tokens=True)


def _blip_caption(img):
    """Return the unconditional BLIP caption for *img*."""
    # No text is passed, so tokenizer options (padding/truncation) are not needed.
    inputs = processor(img, return_tensors="pt").to(device)
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)


def _build_prompt(caption1, caption2, fabric, pattern, textile_design, ocr_text):
    """Assemble the diffusion prompt from the captions, style picks and OCR text."""
    prompt = (
        "Create a highly realistic clothing item based on the following descriptions: "
        f"The design should reflect {caption1} and {caption2}, blending both themes "
        "into a single, stylish, and modern piece of clothing. "
        "Incorporate highly realistic and high-quality textures that exude sophistication, "
        "with realistic fabric lighting and fine details. "
        f"Subtly hint at {fabric}, featuring a {pattern} motif and a {textile_design} style "
        "that harmoniously balances the essence of both captions."
    )
    # Only ask for lettering when the OCR step actually found some text.
    if ocr_text:
        prompt += f"and {ocr_text} should be written on top of it"
    return prompt


@spaces.GPU(duration=150)
def generate_caption_and_image(image):
    """Turn an uploaded image into a clothing-design prompt and a rendered design.

    Steps: read any text in the image with the OCR model, caption the image
    with both BLIP and FuseCap, pick a random fabric/pattern/textile style,
    merge everything into one prompt and run the diffusion pipeline on it.

    Args:
        image: PIL image supplied by the Gradio input, or None when no image
            is present (for example after the input has been cleared).

    Returns:
        A ``(prompt, generated_image)`` tuple; ``("", None)`` when *image*
        is None.
    """
    import random  # local import: `random` is not imported at file level

    if image is None:
        return "", None

    # All three vision models receive the same RGB view of the upload.
    img = image.convert("RGB")

    ocr_text = _extract_text(img)

    # Randomly select one from each category
    selected_fabric = random.choice(_FABRICS)
    selected_pattern = random.choice(_PATTERNS)
    selected_textile_design = random.choice(_TEXTILE_DESIGNS)

    caption2 = _fusecap_caption(img)
    caption1 = _blip_caption(img)

    prompt = _build_prompt(
        caption1,
        caption2,
        selected_fabric,
        selected_pattern,
        selected_textile_design,
        ocr_text,
    )

    # Generate image based on the caption
    generated_image = pipe(prompt).images[0]
    return prompt, generated_image
# Gradio UI: a single image upload feeds generate_caption_and_image; the
# returned prompt and rendered design are shown side by side.
upload_input = gr.Image(type="pil", label="Upload Image")
caption_output = gr.Textbox(label="Generated Caption")
design_output = gr.Image(label="Generated Design")

iface = gr.Interface(
    fn=generate_caption_and_image,
    inputs=upload_input,
    outputs=[caption_output, design_output],
    live=True,  # re-run the callback whenever the input changes
)
iface.launch(share=True)
|