# Image to text for Airbnb images

In [1]:
import torch
import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
from transformers import AutoProcessor
import numpy as np
from torchvision import transforms
from transformers import BlipForConditionalGeneration


  from .autonotebook import tqdm as notebook_tqdm


### Create dataset with images and text and process them with BLIP's processor

In [2]:
class Airbnb(Dataset):
    def __init__(self, csv_file, data_augmentation):
        self.df = pd.read_csv(csv_file)
        self.processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        path_to_im = "/home/cassandra@myliser.lu/image_to_text/blip/living_room/" + str(self.df.listing_id_x[index])+ '_' + str(self.df.photo_number_x[index])
        image = Image.open(path_to_im).convert("RGB")
        label = str(self.df.answers[index])
        encoding = self.processor(images=image, text=label, padding="max_length", return_tensors="pt")
        encoding = {k:v.squeeze() for k,v in encoding.items()}
        return encoding

### Import CSV file

In [3]:
csv_file = "/home/cassandra@myliser.lu/image_to_text/blip/Picture_Descriptions_All-Copy.csv"

In [4]:
dataset = Airbnb(csv_file, data_augmentation = None)

### Split train/test dataset

In [5]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

### Create dataloader

In [6]:
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=1,
        shuffle=True
    )
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=True
    )

### Import model and create device

In [7]:
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Train loop

In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.to(device)
model.train()
for epoch in range(5):
    print("Epoch:", epoch)
    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    for idx, batch in enumerate(train_loader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        labels = input_ids

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct = (predictions == labels).sum().item()
        total_correct += correct
        total_examples += labels.numel()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    average_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_examples
    print(f"Average Loss for epoch {epoch}: {average_loss:.4f}")
    print(f"Accuracy for epoch {epoch}: {accuracy:.2f}")

Epoch: 0


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


KeyboardInterrupt: 

### Test loop

In [None]:
model.eval()
with torch.no_grad():
    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    for idx, batch in enumerate(test_loader):
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        labels = input_ids

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct = (predictions == labels).sum().item()
        total_correct += correct
        total_examples += labels.numel()

    average_loss = total_loss / len(test_loader)
    accuracy = total_correct / total_examples
    print(f"Test Average Loss: {average_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.2f}")

In [10]:
pip install huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


### Gradio webapp

In [None]:
import gradio as gr
from gradio.components import Label

ModuleNotFoundError: No module named 'gradio'

In [None]:
model.eval()  # Mettez votre modèle en mode évaluation

# Fonction d'inférence pour Gradio
def predict(image):
  processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  inputs = processor(images=image, return_tensors="pt").to(device)
  pixel_values = inputs.pixel_values

  generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
  generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Création de l'interface Gradio
iface = gr.Interface(fn=predict,
                     inputs=gr.components.Textbox(placeholder="Enter your text here..."),
                     outputs=gr.components.Label(num_top_classes=2))
iface.launch(share=True)