File size: 3,034 Bytes
ae0ae97 1b2c149 254e461 35edfe0 254e461 801db1d 254e461 801db1d 254e461 801db1d fd8d179 801db1d d688d11 801db1d a312e49 fd8d179 801db1d d688d11 fd8d179 d688d11 fd8d179 d688d11 b66257a d688d11 036f146 b66257a d688d11 b66257a d688d11 036f146 b5f0d2f 801db1d b66257a b5f0d2f 801db1d b5f0d2f ae0ae97 b5f0d2f 036f146 b5f0d2f 5a003d2 d688d11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import streamlit as st
from PIL import Image
import numpy as np
import torch
import open_clip
#from transformers import CLIPProcessor, CLIPModel
# Path of the .npz archive read by load_knn(). The file name suggests it was
# built with the same OpenCLIP architecture configured below (TODO confirm).
knnpath = '20241204-ams-no-env-open_clip_ViT-H-14-378-quickgelu.npz'

# Architecture name and pretrained-weights tag handed to open_clip.
clip_model_name = 'ViT-H-14-378-quickgelu'
pretrained_name = 'dfn5b'

# Page-level Streamlit settings: page title and wide layout.
st.set_page_config(page_title="Percept", layout="wide")
#model, preprocess = open_clip.create_model_from_pretrained('hf-hub:laion/CLIP-ViT-g-14-laion2B-s12B-b42K')
#tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-g-14-laion2B-s12B-b42K')
#model, preprocess = open_clip.create_model_from_pretrained(clip_model_name)
#tokenizer = open_clip.get_tokenizer(clip_model_name)
#st.write("Available models:", open_clip.list_models())
@st.cache_resource
def load_model():
    """Load the OpenCLIP model, its image transform and its tokenizer.

    The architecture and weights are selected by the module-level
    ``clip_model_name`` and ``pretrained_name`` settings. The result is
    cached through ``st.cache_resource``.

    Returns:
        A ``(model, preprocess, tokenizer)`` tuple. ``model`` is returned in
        evaluation mode, ``preprocess`` is the third value returned by
        ``open_clip.create_model_and_transforms`` and ``tokenizer`` is the
        tokenizer for ``clip_model_name``.
    """
    # The second returned value is discarded; only the third transform is
    # used by this app.
    model, _, preprocess = open_clip.create_model_and_transforms(
        clip_model_name, pretrained=pretrained_name
    )
    # This app only runs inference. Freshly created models are in training
    # mode, which can change outputs for layers such as dropout.
    model.eval()
    tokenizer = open_clip.get_tokenizer(clip_model_name)
    return model, preprocess, tokenizer
def process_image(image, preprocess):
    """Convert an image, or an image URL, into a batched model input.

    Args:
        image: Either an object with the PIL image interface (``mode`` and
            ``convert``) or a ``str`` holding an ``http``/``https`` URL that
            is downloaded and opened with ``Image.open``.
        preprocess: Callable applied to the RGB image. Its result must
            support ``unsqueeze(0)``.

    Returns:
        ``preprocess(image).unsqueeze(0)``, i.e. the preprocessed image with
        a leading batch dimension added.

    Raises:
        ValueError: If ``image`` is a string that is not an http(s) URL.
        urllib.error.URLError: If the download fails.
    """
    if isinstance(image, str):
        # Imported locally: only the URL path needs these, and the module
        # does not import them at the top.
        from io import BytesIO
        from urllib.parse import urlsplit
        from urllib.request import urlopen

        # urlopen also understands schemes such as file://, so restrict the
        # input to real web URLs before fetching anything.
        if urlsplit(image).scheme not in ("http", "https"):
            raise ValueError(f"Unsupported image URL: {image!r}")
        with urlopen(image, timeout=30) as response:
            payload = response.read()
        image = Image.open(BytesIO(payload))

    # Ensure the image is in RGB mode before preprocessing.
    if image.mode != 'RGB':
        image = image.convert('RGB')

    return preprocess(image).unsqueeze(0)
@st.cache_resource
def load_knn():
    """Open the archive at the module-level ``knnpath`` with ``np.load``.

    The loaded object is cached through ``st.cache_resource`` and returned
    unchanged; callers index it by array name.
    """
    knn_archive = np.load(knnpath)
    return knn_archive
def main():
    """Render the Percept page and embed an uploaded image with CLIP.

    Steps: load the CLIP model and move it to the GPU when one is available,
    load the KNN archive and show the shape of its ``walkability_vecs``
    array, then, once a file is uploaded, display it, encode it with the
    model, L2-normalize the embedding and show its shape.

    Every stage reports failures through ``st.error``. A failure while
    loading the model or the KNN archive ends the run, because the later
    stages cannot work without them.
    """
    st.title("Percept: Human Perception of Street View Image Analyzer")

    try:
        with st.spinner('Loading CLIP model... This may take a moment.'):
            # The tokenizer returned by load_model() is not needed here.
            model, preprocess, _ = load_model()
            device = "cuda" if torch.cuda.is_available() else "cpu"
            model = model.to(device)
    except Exception as e:
        st.error(f"Error loading model: {e}")
        st.info("Please make sure you have enough memory and the correct dependencies installed.")
        # model/preprocess/device are undefined past this point; stop instead
        # of failing later with a confusing NameError.
        return

    try:
        with st.spinner('Loading KNN model... This may take a moment.'):
            knn = load_knn()
        st.write(knn['walkability_vecs'].shape)
    except Exception as e:
        st.error(f"Error loading KNN model: {e}")
        return

    uploaded = st.file_uploader('Upload An Image')
    if not uploaded:
        # Nothing to process until the user provides a file.
        return

    try:
        image = Image.open(uploaded)
        # NOTE(review): newer Streamlit releases deprecate use_column_width in
        # favour of use_container_width; left as-is because the deployed
        # Streamlit version is not known here.
        st.image(image, caption="Uploaded Image", use_column_width=True)

        with st.spinner('Processing image...'):
            processed_image = process_image(image, preprocess).to(device)

            # Encode into a CLIP vector; gradients are not needed.
            with torch.no_grad():
                vec = model.encode_image(processed_image)
                # Scale the embedding to unit length along the last axis.
                vec /= vec.norm(dim=-1, keepdim=True)

        st.write(vec.shape)
    except Exception as e:
        st.error(f"Error processing image: {e}")
# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()
|