vishalkatheriya committed on
Commit fad0c74
1 Parent(s): 81fe0f6

Update app.py

Files changed (1): app.py +46 -0
app.py CHANGED
@@ -1,6 +1,52 @@
  import streamlit as st
  from PIL import Image
  import inference
+ from transformers import AutoProcessor, AutoModelForCausalLM
+ from PIL import Image
+ import requests
+ import copy
+ import os
+ from unittest.mock import patch
+ from transformers.dynamic_module_utils import get_imports
+ import torch
+
+ # Remove flash_attn from the dynamic imports so the model can load on CPU
+ def fixed_get_imports(filename: str | os.PathLike) -> list[str]:
+     if not str(filename).endswith("modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
+
+ # Initialize session state for model loading and to block re-running
+ if 'model_loaded' not in st.session_state:
+     st.session_state.model_loaded = False
+
+ # Function to load the model (here, the Florence-2 model)
+ def load_model():
+     # Download the model and processor from the Hub
+     model_id = "microsoft/Florence-2-large"
+     # Load the processor
+     st.session_state.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+     # Load the model weights
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):  # workaround for the unnecessary flash_attn requirement
+         model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", trust_remote_code=True)
+
+     # Apply dynamic quantization to all Linear layers
+     Qmodel = torch.quantization.quantize_dynamic(
+         model, {torch.nn.Linear}, dtype=torch.qint8
+     )
+     del model
+     st.session_state.model = Qmodel
+     st.session_state.model_loaded = True
+     st.write("Model loading complete")
+ # Load the model only once
+ if not st.session_state.model_loaded:
+     with st.spinner('Loading model...'):
+         load_model()
+
+
  # Initialize session state to block re-running
  if 'has_run' not in st.session_state:
      st.session_state.has_run = False
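
The quantize_dynamic call in this commit swaps every torch.nn.Linear in the model for an int8-weight equivalent that dequantizes on the fly, which is what makes CPU inference practical for a model this size. A minimal standalone sketch of the same transformation, using a toy module that stands in for the real model (illustrative only, not part of the commit):

import torch
import torch.nn as nn

# Toy stand-in for a large model: a stack of Linear layers.
toy = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512))

# Same call the commit uses: weights of the listed module types are stored
# as int8 and dequantized during the forward pass; activations stay float.
quantized = torch.quantization.quantize_dynamic(toy, {nn.Linear}, dtype=torch.qint8)

x = torch.randn(1, 512)
print(quantized(x).shape)  # forward pass is unchanged: torch.Size([1, 512])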
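
The commit only caches the quantized model and processor in st.session_state; the actual calls presumably live in the imported inference module, which isn't shown here. A sketch of how downstream code might invoke them, assuming the task-prompt interface documented on the microsoft/Florence-2-large model card (run_caption is a hypothetical helper, not from the commit):

import streamlit as st
from PIL import Image

def run_caption(image: Image.Image):
    # Hypothetical helper: reads the processor/model cached by load_model().
    processor = st.session_state.processor
    model = st.session_state.model

    task = "<CAPTION>"  # Florence-2 selects its task via a prompt token
    inputs = processor(text=task, images=image, return_tensors="pt")

    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=256,
        num_beams=3,
    )
    raw = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    # post_process_generation parses the raw output into the task's result
    return processor.post_process_generation(
        raw, task=task, image_size=(image.width, image.height)
    )[task]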