Spaces:

ankitkupadhyay
/

skin_cancer_detection

Sleeping

File size: 2,745 Bytes

5c9bc3a
a1ee699
5c9bc3a
a1ee699
5c9bc3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1ee699
5c9bc3a
a1ee699
5c9bc3a
 
 
a1ee699
5c9bc3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1ee699
5c9bc3a
 
 
a1ee699
 
5c9bc3a
 
 
 
 
 
 
a1ee699

import torch
import torch
import torch.nn as nn
from transformers import ViTImageProcessor, ViTModel, BertTokenizerFast, BertModel
from PIL import Image
import gradio as gr

class VisionLanguageModel(nn.Module):
    """Image + text classifier built from a ViT encoder and a BERT encoder.

    The pooled output of each encoder is concatenated and fed to a single
    linear layer that produces two logits (benign or malignant).
    """

    def __init__(self):
        """Load both pretrained encoders and create the two-way linear head."""
        super().__init__()
        self.vision_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.language_model = BertModel.from_pretrained('bert-base-uncased')

        # The head consumes both pooled embeddings side by side, so its input
        # width is the sum of the two encoders' hidden sizes.
        fused_width = (
            self.vision_model.config.hidden_size
            + self.language_model.config.hidden_size
        )
        # Number of classes: benign or malignant
        self.classifier = nn.Linear(fused_width, 2)

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return classification logits for a batch of image/text pairs.

        Args:
            input_ids: Token ids forwarded to the language model.
            attention_mask: Attention mask forwarded to the language model.
            pixel_values: Preprocessed image tensor forwarded to the vision
                model.

        Returns:
            The output of the linear classifier applied to the concatenated
            pooled embeddings (two values per example).
        """
        image_embedding = self.vision_model(pixel_values=pixel_values).pooler_output
        text_embedding = self.language_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output

        # Join along the feature axis: image features first, then text.
        fused = torch.cat([image_embedding, text_embedding], dim=1)
        return self.classifier(fused)

# Rebuild the architecture, then restore the trained weights onto the CPU.
# weights_only=True restricts torch.load's unpickler to tensors and other
# plain data instead of arbitrary pickled objects.
model = VisionLanguageModel()
checkpoint_weights = torch.load(
    'best_model.pth',
    map_location=torch.device('cpu'),
    weights_only=True,
)
model.load_state_dict(checkpoint_weights)
del checkpoint_weights  # values were copied into the model; drop the extra reference
model.eval()  # put the modules into evaluation mode for inference

# Text and image preprocessors, loaded from the same checkpoint names that
# VisionLanguageModel uses for its two encoders.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')

def predict(image, text_input) -> str:
    """Classify a skin lesion from an image plus free-text clinical notes.

    Args:
        image: Lesion image in a form the module-level ``feature_extractor``
            accepts (the Gradio interface in this file supplies a PIL image).
            May be ``None`` when the caller submitted without an image.
        text_input: Clinical description passed to the module-level
            ``tokenizer``. ``None`` is treated as an empty string.

    Returns:
        ``"Malignant"`` when the highest-scoring logit is class index 1,
        otherwise ``"Benign"``.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    # Fail early with an actionable message instead of letting the image
    # processor raise an opaque type error for a missing upload.
    if image is None:
        raise ValueError("No image provided: please upload a skin lesion image.")
    # A missing text field is handled the same way as an empty one.
    if text_input is None:
        text_input = ""

    # Preprocess the image
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    # Preprocess the text: pad/truncate to a fixed 256-token window.
    encoding = tokenizer(
        text_input,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Make a prediction; no gradients are needed for inference.
    with torch.no_grad():
        logits = model(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            pixel_values=pixel_values
        )

    # Single-example batch: take the index of the largest logit.
    predicted_class = logits.argmax(dim=1).item()
    return "Malignant" if predicted_class == 1 else "Benign"

# Build the demo UI: an image upload and a free-text box feed predict(),
# and the label string it returns is displayed as plain text.
lesion_image_input = gr.Image(type="pil", label="Upload Skin Lesion Image")
clinical_notes_input = gr.Textbox(label="Clinical Information (e.g., patient age, symptoms)")

iface = gr.Interface(
    fn=predict,
    inputs=[lesion_image_input, clinical_notes_input],
    outputs="text",
    title="Skin Lesion Classification Demo",
    description="This model classifies skin lesions as benign or malignant based on an image and clinical information.",
)

# Start serving the interface.
iface.launch()