File size: 3,034 Bytes
ae0ae97
1b2c149
254e461
35edfe0
254e461
801db1d
254e461
801db1d
254e461
801db1d
 
fd8d179
801db1d
d688d11
 
 
 
 
 
 
801db1d
 
 
a312e49
 
 
fd8d179
801db1d
d688d11
 
 
 
fd8d179
d688d11
fd8d179
d688d11
 
 
 
 
 
 
 
 
 
 
 
 
 
b66257a
 
 
 
d688d11
036f146
b66257a
d688d11
b66257a
d688d11
036f146
 
b5f0d2f
 
 
801db1d
b66257a
 
b5f0d2f
801db1d
b5f0d2f
ae0ae97
b5f0d2f
 
036f146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5f0d2f
 
5a003d2
d688d11
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from io import BytesIO
from urllib.parse import urlparse
from urllib.request import urlopen

import numpy as np
import open_clip
import streamlit as st
import torch
from PIL import Image

#from transformers import CLIPProcessor, CLIPModel

# Architecture name and pretrained-weights tag handed to open_clip when the
# model and tokenizer are created.
clip_model_name = "ViT-H-14-378-quickgelu"
pretrained_name = "dfn5b"

# Path of the NumPy .npz archive opened by load_knn().
knnpath = "20241204-ams-no-env-open_clip_ViT-H-14-378-quickgelu.npz"


# Page-level Streamlit settings: page title "Percept" and the wide layout.
st.set_page_config(page_title="Percept", layout="wide")

#model, preprocess = open_clip.create_model_from_pretrained('hf-hub:laion/CLIP-ViT-g-14-laion2B-s12B-b42K')
#tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-g-14-laion2B-s12B-b42K')

#model, preprocess = open_clip.create_model_from_pretrained(clip_model_name)
#tokenizer = open_clip.get_tokenizer(clip_model_name)

#st.write("Available models:", open_clip.list_models())

@st.cache_resource
def load_model():
    """Load the OpenCLIP model, its image preprocessing, and its tokenizer.

    The function is wrapped in ``st.cache_resource`` so the returned objects
    are reused instead of being rebuilt on every call.

    Returns:
        tuple: ``(model, preprocess, tokenizer)`` where ``model`` and
        ``preprocess`` are the first and third values returned by
        ``open_clip.create_model_and_transforms`` for ``clip_model_name`` /
        ``pretrained_name``, and ``tokenizer`` comes from
        ``open_clip.get_tokenizer(clip_model_name)``.
    """
    model, _, preprocess = open_clip.create_model_and_transforms(
        clip_model_name, pretrained=pretrained_name
    )
    # open_clip hands the model back in training mode; this app only runs
    # inference, so switch to eval mode once here, before it is cached.
    model.eval()
    tokenizer = open_clip.get_tokenizer(clip_model_name)
    return model, preprocess, tokenizer

def process_image(image, preprocess):
    """Turn an image (or an image URL) into a batched tensor.

    Args:
        image: Either an image object exposing ``mode`` and ``convert`` (as
            PIL images do), or a string holding an ``http``/``https`` URL the
            image is downloaded from.
        preprocess: Callable applied to the RGB image; its result must offer
            ``unsqueeze``.

    Returns:
        ``preprocess(image)`` with a leading dimension added via
        ``unsqueeze(0)``.

    Raises:
        ValueError: If ``image`` is a string that is not an http(s) URL.
    """
    if isinstance(image, str):
        # Only fetch over http(s); urlopen would otherwise also accept
        # schemes such as file://.
        if urlparse(image).scheme not in ("http", "https"):
            raise ValueError(f"Unsupported image URL: {image!r}")
        # Bounded wait so a stalled server cannot hang the app indefinitely.
        with urlopen(image, timeout=30) as response:
            image = Image.open(BytesIO(response.read()))
    # Ensure image is in RGB mode
    if image.mode != 'RGB':
        image = image.convert('RGB')
    return preprocess(image).unsqueeze(0)

@st.cache_resource
def load_knn():
    """Open the archive at ``knnpath`` with ``np.load`` and return it."""
    knn_archive = np.load(knnpath)
    return knn_archive

def main():
    """Render the Streamlit page and embed an uploaded image with CLIP.

    Steps: load the CLIP model and move it to CUDA when available (else CPU),
    load the KNN archive and display the shape of its ``walkability_vecs``
    entry, then, once a file is uploaded, show the image, encode it with the
    model, L2-normalise the vector and display its shape.

    If the model cannot be loaded, the error is reported on the page and the
    function returns early, because every later step needs the model.
    """
    st.title("Percept: Human Perception of Street View Image Analyzer")

    try:
        with st.spinner('Loading CLIP model... This may take a moment.'):
            # The tokenizer is returned too but is not needed for image encoding.
            model, preprocess, _ = load_model()
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.info("Please make sure you have enough memory and the correct dependencies installed.")
        # Without a model, `preprocess` and `device` are undefined; carrying on
        # would only fail later with a NameError.
        return

    with st.spinner('Loading KNN model... This may take a moment.'):
        knn = load_knn()
    st.write(knn['walkability_vecs'].shape)

    file = st.file_uploader('Upload An Image')

    if file:
        try:
            image = Image.open(file)

            st.image(image, caption="Uploaded Image", use_column_width=True)

            # Process image
            with st.spinner('Processing image...'):
                processed_image = process_image(image, preprocess)
                processed_image = processed_image.to(device)

                # Encode into CLIP vector
                with torch.no_grad():
                    vec = model.encode_image(processed_image)

                    # Normalize vector to unit length along the last dimension
                    vec /= vec.norm(dim=-1, keepdim=True)
                st.write(vec.shape)
        except Exception as e:
            st.error(f"Error processing image: {str(e)}")

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()