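"""Construction Site Image-to-Text Generator.

A Gradio app that batch-captions construction site photos with BLIP
(Salesforce/blip-image-captioning-base) and keeps only the
construction-related terms from each caption.
"""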
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import gradio as gr
import torch

# Load BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
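
# On a GPU the model could instead be loaded in half precision
# (from_pretrained(..., torch_dtype=torch.float16)) to save memory;
# the inputs would then need to be cast to the matching dtype.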

# List of construction-related terms
construction_terms = [
    "concrete", "scaffolding", "steel rods", "piling", "excavation", 
    "mixer", "cement", "brickwork", "crane", "rebar", "construction", 
    "foundation", "building", "formwork", "drywall", "steel beams", 
    "hammer", "saw", "nails", "jackhammer"
]
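# Note: several terms are multi-word ("steel rods", "steel beams"), so the
# caption filter below matches each term as a substring of the full caption
# rather than checking caption words one by one.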

# Inference function: receives the uploaded file paths and returns one
# filtered caption per image
def generate_captions(files):
    captions = []

    # Cap the batch at 10 images to match the UI description
    for path in files[:10]:
        image = Image.open(path)
        if image.mode != "RGB":
            image = image.convert("RGB")
        
        # Preprocess the image and generate a caption; the model weights are
        # float32, so the inputs are not cast to float16 here
        inputs = processor(image, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=50)
        caption = processor.decode(output[0], skip_special_tokens=True)

        # Keep only the construction-related terms the caption mentions
        caption_lower = caption.lower()
        filtered_caption = ", ".join(term for term in construction_terms if term in caption_lower)
        
        # If no construction-related terms are found, fall back to a default message
        if not filtered_caption:
            filtered_caption = "No construction-related activities detected."
        
        captions.append(filtered_caption)
    
    # Join the per-image captions into one block for the single text output
    return "\n".join(captions)
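
# Quick local smoke test without the Gradio UI (hypothetical file names):
#     print(generate_captions(["site1.jpg", "site2.jpg"]))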

# Gradio interface
iface = gr.Interface(
    fn=generate_captions,
    inputs=gr.File(file_count="multiple", file_types=["image"], type="filepath", label="Upload Site Photos"),  # gr.Image has no batch upload option; gr.File accepts multiple files
    outputs="text",
    title="Construction Site Image-to-Text Generator",
    description="Upload up to 10 site photos. The model will detect and describe construction activities and materials (e.g., concrete pouring, scaffolding, steel rods).",
    allow_flagging="never"  # Disable flagging (optional)
)

iface.launch()