Fiqa committed (verified)
Commit 01a8184 · Parent(s): 45289ed

Update app.py

Files changed (1)
  1. app.py +3 -13
app.py CHANGED
@@ -8,7 +8,6 @@ from diffusers import DiffusionPipeline
 import torch
 import spaces # Hugging Face Spaces module
 
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
 
 
 
@@ -24,15 +23,12 @@ login(token=hf_token)
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
 
-model2 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 
 
 pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 pipe.to(device)
-model2.to(device)
+
 model.to(device)
 
 
@@ -54,21 +50,15 @@ def generate_caption_and_image(image):
     selected_fabric = random.choice(fabrics)
     selected_pattern = random.choice(patterns)
     selected_textile_design = random.choice(textile_designs)
-
-    gen_kwargs = {"max_length": 16, "num_beams": 4}
-
-
-    pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values.to(device)
-    output_ids = model.generate(pixel_values, **gen_kwargs)
-    caption1 = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
 
 
+    caption2 =""
 
     # Generate caption
     inputs = processor(image, return_tensors="pt", padding=True, truncation=True, max_length=250)
     inputs = {key: val.to(device) for key, val in inputs.items()}
     out = model.generate(**inputs)
-    caption2 = processor.decode(out[0], skip_special_tokens=True)
+    caption1 = processor.decode(out[0], skip_special_tokens=True)
 
     prompt = f'''Create a highly realistic clothing item based on the following descriptions: The design should reflect {caption1} and {caption2}, blending both themes into a single, stylish, and modern piece of clothing. Incorporate highly realistic and high-quality textures that exude sophistication, with realistic fabric lighting and fine details. Subtly hint at {selected_fabric}, featuring a {selected_pattern} motif and a {selected_textile_design} style that harmoniously balances the essence of both captions.'''
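For reference, a minimal, self-contained sketch of the captioning path as it stands after this commit: BLIP is the only captioning model left, caption1 comes from its decoded output, and caption2 survives only as an empty stub that is still interpolated into the diffusion prompt. The model names and the generate/decode calls mirror the diff; the caption_image helper name is hypothetical, and the Spaces GPU decorator, Gradio wiring, Stable Diffusion pipeline, and fabric/pattern lists are omitted.

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

# BLIP large is now the sole captioner; the vit-gpt2 model, its feature
# extractor, and its tokenizer from the removed lines are no longer loaded.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(device)

def caption_image(image):
    # Hypothetical helper, not in app.py: it isolates the post-commit logic.
    # caption2 is an empty stub, so the downstream f-string prompt renders
    # as "...should reflect {caption1} and , blending..." after this change.
    caption2 = ""

    # Single BLIP caption, as in the post-commit generate_caption_and_image
    inputs = processor(image, return_tensors="pt", padding=True,
                       truncation=True, max_length=250)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    out = model.generate(**inputs)
    caption1 = processor.decode(out[0], skip_special_tokens=True)
    return caption1, caption2

Called as caption_image(Image.open("photo.jpg")), this returns the single BLIP caption plus the empty placeholder that the prompt template still references.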