Upyaya committed on
Commit
f6f902a
·
1 Parent(s): 9a4177f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -45
app.py CHANGED
@@ -45,65 +45,66 @@ def init_model():
45
 
46
  return processor, model
47
 
48
- def main():
49
 
50
- st.header("Automate Fashion Image Captioning using BLIP-2")
51
- st.caption("The fashion industry is worth trillions of dollars. The goal of any company/seller is to help customer tofind the right product from a huge corpus of products that they are searching for.")
52
- st.caption("So, when customer find the right product they are mostly going to add the item to their cart and which help in company revenue.")
53
- st.caption("Accurate and enchanting descriptions of clothes on shopping websites can help customers without fashion knowledge to better understand the features (attributes, style, functionality, etc.) of the items and increase online sales by enticing more customers.")
54
- st.caption("Also, most of the time when any customer visits shopping websites, they are looking for a certain style or type of clothes that wish to purchase, they search for the item by providing a description of the item and the system finds the relevant items that match the search query by computing the similarity score between the query and the item caption.")
55
- st.caption("Given the clothes image provide a short caption that describes the item. In general, in image captioning datasets (e.g., COCO, Fliker), the descriptions of fashion items have three unique features, which makes the automatic generation of captions a challenging task. First, fashion captioning needs to describe the attributes of an item, while image captioning generally narrates the objects and their relations in the image.")
56
- st.caption("Solution: Used Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models [(BLIP-2)](https://huggingface.co/Salesforce/blip2-opt-2.7b) by Salesforce")
57
- st.write("For more detail: [Github link](https://github.com/SmithaUpadhyaya/fashion_image_caption)")
58
 
59
- processor, model = init_model()
 
 
 
 
60
 
61
- #Select few sample images for the catagory of cloths
62
- st.caption("Select image:")
63
- option = st.selectbox('From sample', ('None', 'dress', 'earrings', 'sweater', 'sunglasses', 'shoe', 'hat', 'heels', 'socks', 'tee', 'bracelet'), index = 0)
64
- st.text("Or")
65
- file_name = st.file_uploader(label = "Upload an image", accept_multiple_files = False)
66
-
67
- btn_click = st.button('Generate')
68
 
69
- if btn_click:
70
 
71
- image = None
72
- if file_name is not None:
73
 
74
- image = Image.open(file_name)
75
 
76
- elif option is not 'None':
77
 
78
- file_name = os.path.join(sample_img_path, map_sampleid_name[option])
79
- image = Image.open(file_name)
80
 
81
- if image is not None:
82
 
83
- with st.spinner('Generating Caption...'):
 
84
 
85
- image_col, caption_text = st.columns(2)
86
- image_col.header("Image")
87
- image_col.image(image, use_column_width = True)
88
 
89
- #Preprocess the image
90
- #Inferance on GPU. When used this on GPU will get errors like: "slow_conv2d_cpu" not implemented for 'Half'" , " Input type (float) and bias type (struct c10::Half)"
91
- #inputs = processor(images = image, return_tensors = "pt").to('cuda', torch.float16)
92
 
93
- #Inferance on CPU
94
- inputs = processor(images = image, return_tensors = "pt")
 
95
 
96
- pixel_values = inputs.pixel_values
 
97
 
98
- #Predict the caption for the imahe
99
- generated_ids = model.generate(pixel_values = pixel_values, max_length = 25)
100
- generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
101
 
102
- #Output the predict text
103
- caption_text.header("Generated Caption")
104
- caption_text.text(generated_caption)
105
-
106
- st.write("Application deployed on CPU basic with 16GB RAM")
107
 
108
- if __name__ == "__main__":
109
- main()
 
 
 
 
 
 
 
45
 
46
  return processor, model
47
 
48
+ #def main():
49
 
50
+ st.header("Automate Fashion Image Captioning using BLIP-2")
51
+ st.caption("The fashion industry is worth trillions of dollars. The goal of any company/seller is to help customer tofind the right product from a huge corpus of products that they are searching for.")
52
+ st.caption("So, when customer find the right product they are mostly going to add the item to their cart and which help in company revenue.")
53
+ st.caption("Accurate and enchanting descriptions of clothes on shopping websites can help customers without fashion knowledge to better understand the features (attributes, style, functionality, etc.) of the items and increase online sales by enticing more customers.")
54
+ st.caption("Also, most of the time when any customer visits shopping websites, they are looking for a certain style or type of clothes that wish to purchase, they search for the item by providing a description of the item and the system finds the relevant items that match the search query by computing the similarity score between the query and the item caption.")
55
+ st.caption("Given the clothes image provide a short caption that describes the item. In general, in image captioning datasets (e.g., COCO, Fliker), the descriptions of fashion items have three unique features, which makes the automatic generation of captions a challenging task. First, fashion captioning needs to describe the attributes of an item, while image captioning generally narrates the objects and their relations in the image.")
56
+ st.caption("Solution: Used Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models [(BLIP-2)](https://huggingface.co/Salesforce/blip2-opt-2.7b) by Salesforce")
57
+ st.caption("For more detail: [Github link](https://github.com/SmithaUpadhyaya/fashion_image_caption)") #write
58
 
59
+ #Select a few sample images for each category of clothes
60
+ st.caption("Select image:")
61
+ option = st.selectbox('From sample', ('None', 'dress', 'earrings', 'sweater', 'sunglasses', 'shoe', 'hat', 'heels', 'socks', 'tee', 'bracelet'), index = 0)
62
+ st.text("Or")
63
+ file_name = st.file_uploader(label = "Upload an image", accept_multiple_files = False)
64
 
65
+ btn_click = st.button('Generate')
 
 
 
 
 
 
66
 
67
+ if btn_click:
68
 
69
+ image = None
70
+ if file_name is not None:
71
 
72
+ image = Image.open(file_name)
73
 
74
+ elif option is not 'None':
75
 
76
+ file_name = os.path.join(sample_img_path, map_sampleid_name[option])
77
+ image = Image.open(file_name)
78
 
79
+ if image is not None:
80
 
81
+ with st.spinner('Initializing model...'):
82
+ processor, model = init_model()
83
 
84
+ with st.spinner('Generating Caption...'):
 
 
85
 
86
+ image_col, caption_text = st.columns(2)
87
+ image_col.header("Image")
88
+ image_col.image(image, use_column_width = True)
89
 
90
+ #Preprocess the image
91
+ #Inference on GPU. When this is used on GPU, it produces errors like: "slow_conv2d_cpu" not implemented for 'Half'" , " Input type (float) and bias type (struct c10::Half)"
92
+ #inputs = processor(images = image, return_tensors = "pt").to('cuda', torch.float16)
93
 
94
+ #Inference on CPU
95
+ inputs = processor(images = image, return_tensors = "pt")
96
 
97
+ pixel_values = inputs.pixel_values
 
 
98
 
99
+ #Predict the caption for the image
100
+ generated_ids = model.generate(pixel_values = pixel_values, max_length = 25)
101
+ generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
102
 
103
+ #Output the predicted text
104
+ caption_text.header("Generated Caption")
105
+ caption_text.text(generated_caption)
106
+
107
+ st.caption("Application deployed on CPU basic with 16GB RAM")
108
+
109
+ #if __name__ == "__main__":
110
+ # main()