forked and merged

Browse files

Files changed (8) hide show

README.md +93 -3
config.json +170 -0
handler.py +46 -0
preprocessor_config.json +25 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +21 -0
vocab.txt +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,93 @@
----
-license: bsd-3-clause
----

+---
+tags:
+- image-to-text
+- image-captioning
+- endpoints-template
+license: bsd-3-clause
+library_name: generic
+---
+# Fork of [ckandemir/blip-image-captioning-large-inference](https://huggingface.co/ckandemir/blip-image-captioning-large-inference), which is a fork of [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large), for an `image-captioning` task on 🤗 Inference Endpoints.
+This repository implements a `custom` task for `image-captioning` for 🤗 Inference Endpoints. The code for the customized pipeline used by this repository is in `handler.py`; see also the [pipeline.py](https://huggingface.co/florentgbelidji/blip_captioning/blob/main/pipeline.py) in `florentgbelidji/blip_captioning`.
+To deploy this model as an Inference Endpoint, you have to select `Custom` as the task so that the `handler.py` file is used. -> _double check that it is selected_
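+### Expected request payload
+`images` is a list of base64-encoded images; `texts` is an optional list of prompts, one per image (`handler.py` defaults each prompt to `"a photography of"`).
+```json
+{
+  "inputs": {
+    "images": ["/9j/4AAQSkZJRgA....."],
+    "texts": ["a photography of a"]
+  }
+}
+```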
+Below is an example of how to run a request using Python and `requests`.
+## Run Request
+1. Use any online image.
+```bash
+!wget https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg
+```
+2. Run the request.
+```python
+import json
+from typing import List
+import requests as r
+import base64
+with open("/content/demo.jpg", "rb") as image_file:
+    encoded_string = base64.b64encode(image_file.read()).decode()
+ENDPOINT_URL = ""
+HF_TOKEN = ""
+def query(payload):
+	response = requests.post(API_URL, headers=headers, json=payload)
+	return response.json()
+output = query({
+    "inputs": {
+        "images": [encoded_string],  # using the base64 encoded string
+        "texts": ["a photography of"]  # Optional, based on your current class logic
+    }
+})
+print(output)
+```
+Example parameters depending on the decoding strategy:
+1. Beam search
+```
+        "parameters": {
+                   "num_beams":5,
+                   "max_length":20
+        }
+```
+2. Nucleus sampling
+```
+        "parameters": {
+                   "num_beams":1,
+                   "max_length":20,
+                   "do_sample": True,
+                   "top_k":50,
+                   "top_p":0.95
+        }
+```
+3. Contrastive search
+```
+        "parameters": {
+                   "penalty_alpha":0.6,
+                   "top_k":4,
+                   "max_length":512
+        }
+```
+See the [generate()](https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) documentation for additional details.
+Expected output:
+```python
+{'captions': ['a photography of a woman and her dog on the beach']}
+```

config.json ADDED Viewed

	@@ -0,0 +1,170 @@

+{
+  "_commit_hash": null,
+  "architectures": [
+    "BlipForConditionalGeneration"
+  ],
+  "image_text_hidden_size": 256,
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "blip",
+  "projection_dim": 512,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 30522,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_hidden_size": 1024,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": true,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-12,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 512,
+    "min_length": 0,
+    "model_type": "blip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 0,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": 102,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.26.0.dev0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 30524
+  },
+  "torch_dtype": "float32",
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 384,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "blip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 16,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.26.0.dev0",
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  }
+}

handler.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import requests
+from typing import Dict, Any
+from PIL import Image
+import torch
+import base64
+from io import BytesIO
+from transformers import BlipForConditionalGeneration, BlipProcessor
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+class EndpointHandler():
+    def __init__(self, path=""):
+        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+        self.model = BlipForConditionalGeneration.from_pretrained(
+            "Salesforce/blip-image-captioning-large"
+        ).to(device)
+        self.model.eval()
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        input_data = data.get("inputs", {})
+        encoded_images = input_data.get("images")
+        if not encoded_images:
+            return {"captions": [], "error": "No images provided"}
+        texts = input_data.get("texts", ["a photography of"] * len(encoded_images))
+        try:
+            raw_images = [Image.open(BytesIO(base64.b64decode(img))).convert("RGB") for img in encoded_images]
+            processed_inputs = [
+                self.processor(image, text, return_tensors="pt") for image, text in zip(raw_images, texts)
+            ]
+            processed_inputs = {
+                "pixel_values": torch.cat([inp["pixel_values"] for inp in processed_inputs], dim=0).to(device),
+                "input_ids": torch.cat([inp["input_ids"] for inp in processed_inputs], dim=0).to(device),
+                "attention_mask": torch.cat([inp["attention_mask"] for inp in processed_inputs], dim=0).to(device)
+            }
+            with torch.no_grad():
+                out = self.model.generate(**processed_inputs)
+            captions = self.processor.batch_decode(out, skip_special_tokens=True)
+            return {"captions": captions}
+        except Exception as e:
+            print(f"Error during processing: {str(e)}")
+            return {"captions": [], "error": str(e)}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "do_normalize": true,
+  "do_pad": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "BlipImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "processor_class": "BlipProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 384,
+    "width": 384
+  },
+  "size_divisor": 32
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "name_or_path": "Salesforce/blip-image-captioning-large",
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "processor_class": "BlipProcessor",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ]
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff