bulentsoykan commited on
Commit
00821bd
·
verified ·
1 Parent(s): 6a1b293

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -9
app.py CHANGED
@@ -7,7 +7,7 @@ import os
7
 
8
  # Page configuration
9
  st.set_page_config(
10
- page_title="Gemma-3 OCR",
11
  page_icon="πŸ”Ž",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
@@ -20,7 +20,8 @@ if not HF_API_KEY:
20
 
21
  # Hugging Face API function
22
  def process_image_with_hf(image_bytes):
23
- API_URL = "https://api-inference.huggingface.co/models/google/gemma-3-vision"
 
24
  headers = {"Authorization": f"Bearer {HF_API_KEY}"}
25
 
26
  # Convert image to base64
@@ -43,17 +44,23 @@ def process_image_with_hf(image_bytes):
43
  if response.status_code != 200:
44
  raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
45
 
46
- return response.json()[0]["generated_text"]
 
 
 
 
 
 
47
 
48
  # Title and description in main area
49
  try:
50
  # Try to load the image from assets folder
51
  st.markdown("""
52
- # <img src="data:image/png;base64,{}" width="50" style="vertical-align: -12px;"> Gemma-3 OCR
53
  """.format(base64.b64encode(open("./assets/gemma3.png", "rb").read()).decode()), unsafe_allow_html=True)
54
  except FileNotFoundError:
55
  # Fallback if image doesn't exist
56
- st.title("Gemma-3 OCR")
57
 
58
  # Add clear button to top right
59
  col1, col2 = st.columns([6,1])
@@ -63,11 +70,27 @@ with col2:
63
  del st.session_state['ocr_result']
64
  st.rerun()
65
 
66
- st.markdown('<p style="margin-top: -20px;">Extract structured text from images using Gemma-3 Vision!</p>', unsafe_allow_html=True)
67
  st.markdown("---")
68
 
69
- # Move upload controls to sidebar
70
  with st.sidebar:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  st.header("Upload Image")
72
  uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])
73
 
@@ -81,8 +104,11 @@ with st.sidebar:
81
  st.error("Hugging Face API key is missing. Please set it as an environment variable or in Streamlit secrets.")
82
  else:
83
  if st.button("Extract Text πŸ”", type="primary"):
84
- with st.spinner("Processing image..."):
85
  try:
 
 
 
86
  # Get image bytes
87
  img_bytes = uploaded_file.getvalue()
88
 
@@ -91,6 +117,7 @@ with st.sidebar:
91
  st.session_state['ocr_result'] = result
92
  except Exception as e:
93
  st.error(f"Error processing image: {str(e)}")
 
94
 
95
  # Main content area for results
96
  if 'ocr_result' in st.session_state:
@@ -100,4 +127,4 @@ else:
100
 
101
  # Footer
102
  st.markdown("---")
103
- st.markdown("Made with using Gemma-3 Vision Model | [Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)")
 
7
 
8
  # Page configuration
9
  st.set_page_config(
10
+ page_title="Vision OCR",
11
  page_icon="πŸ”Ž",
12
  layout="wide",
13
  initial_sidebar_state="expanded"
 
20
 
21
  # Hugging Face API function
22
  def process_image_with_hf(image_bytes):
23
+ # Use an available multimodal model that can handle images and text
24
+ API_URL = "https://api-inference.huggingface.co/models/llava-hf/llava-1.5-7b-hf"
25
  headers = {"Authorization": f"Bearer {HF_API_KEY}"}
26
 
27
  # Convert image to base64
 
44
  if response.status_code != 200:
45
  raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
46
 
47
+ # Handle different response formats
48
+ if isinstance(response.json(), list):
49
+ return response.json()[0]["generated_text"]
50
+ elif isinstance(response.json(), dict) and "generated_text" in response.json():
51
+ return response.json()["generated_text"]
52
+ else:
53
+ return str(response.json())
54
 
55
  # Title and description in main area
56
  try:
57
  # Try to load the image from assets folder
58
  st.markdown("""
59
+ # <img src="data:image/png;base64,{}" width="50" style="vertical-align: -12px;"> Vision OCR
60
  """.format(base64.b64encode(open("./assets/gemma3.png", "rb").read()).decode()), unsafe_allow_html=True)
61
  except FileNotFoundError:
62
  # Fallback if image doesn't exist
63
+ st.title("Vision OCR")
64
 
65
  # Add clear button to top right
66
  col1, col2 = st.columns([6,1])
 
70
  del st.session_state['ocr_result']
71
  st.rerun()
72
 
73
+ st.markdown('<p style="margin-top: -20px;">Extract structured text from images using advanced vision models!</p>', unsafe_allow_html=True)
74
  st.markdown("---")
75
 
76
+ # Add model selection
77
  with st.sidebar:
78
+ st.header("Settings")
79
+ model_option = st.selectbox(
80
+ "Select Vision Model",
81
+ ["LLaVA 1.5 (7B)", "CLIP-ViT", "BLIP-2"],
82
+ index=0
83
+ )
84
+
85
+ # Map selection to model ID
86
+ model_mapping = {
87
+ "LLaVA 1.5 (7B)": "llava-hf/llava-1.5-7b-hf",
88
+ "CLIP-ViT": "openai/clip-vit-base-patch32",
89
+ "BLIP-2": "Salesforce/blip2-opt-2.7b"
90
+ }
91
+
92
+ selected_model = model_mapping[model_option]
93
+
94
  st.header("Upload Image")
95
  uploaded_file = st.file_uploader("Choose an image...", type=['png', 'jpg', 'jpeg'])
96
 
 
104
  st.error("Hugging Face API key is missing. Please set it as an environment variable or in Streamlit secrets.")
105
  else:
106
  if st.button("Extract Text πŸ”", type="primary"):
107
+ with st.spinner(f"Processing image with {model_option}..."):
108
  try:
109
+ # Update the model URL
110
+ API_URL = f"https://api-inference.huggingface.co/models/{selected_model}"
111
+
112
  # Get image bytes
113
  img_bytes = uploaded_file.getvalue()
114
 
 
117
  st.session_state['ocr_result'] = result
118
  except Exception as e:
119
  st.error(f"Error processing image: {str(e)}")
120
+ st.info("Try selecting a different model from the dropdown.")
121
 
122
  # Main content area for results
123
  if 'ocr_result' in st.session_state:
 
127
 
128
  # Footer
129
  st.markdown("---")
130
+ st.markdown("Made with ❀️ using Hugging Face Vision Models | [Report an Issue](https://github.com/bulentsoykan/streamlit-OCR-app/issues)")