Spaces:

NEXAS
/

ImageSearchClip

Sleeping

App Files Files Community

NEXAS committed on Jun 26

Commit

04792be

verified ·

1 Parent(s): 2fa941f

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +89 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,91 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import os
+import fitz  # PyMuPDF
+import chromadb
+import tempfile
 import streamlit as st
+from PIL import Image
+from chromadb.utils.data_loaders import ImageLoader
+from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
+# Paths
+DB_PATH = './data/image_vdb'
+IMAGES_DIR = './data/extracted_images'
+os.makedirs(IMAGES_DIR, exist_ok=True)
+# Init Chroma
+@st.cache_resource
+def _init_chroma():
+    """Build the Chroma client, image loader, embedder and collection once.
+
+    Streamlit re-executes this script on every widget interaction; caching
+    the resources avoids re-creating the client and the OpenCLIP embedding
+    function on each rerun.
+
+    Returns:
+        Tuple of (client, image loader, embedding function, collection).
+    """
+    client = chromadb.PersistentClient(path=DB_PATH)
+    loader = ImageLoader()
+    embedder = OpenCLIPEmbeddingFunction()
+    collection = client.get_or_create_collection(
+        name="image", embedding_function=embedder, data_loader=loader
+    )
+    return client, loader, embedder, collection
+# Keep the original module-level names so the rest of the file is unaffected.
+chroma_client, image_loader, embedding_fn, image_collection = _init_chroma()
+# Utilities
+def extract_images_from_pdf(pdf_bytes):
+    """Extract every embedded image from a PDF and save it under IMAGES_DIR.
+
+    Args:
+        pdf_bytes: Raw bytes of the PDF document.
+
+    Returns:
+        List of paths of the written image files, in page order.
+    """
+    import hashlib
+    # Prefix file names with a digest of the PDF so images from different
+    # documents never overwrite each other on disk; otherwise a path that is
+    # already stored in the index could end up showing another PDF's image.
+    doc_tag = hashlib.sha256(pdf_bytes).hexdigest()[:8]
+    saved_images = []
+    # The context manager closes the document even if extraction fails.
+    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
+        for page_num, page in enumerate(pdf, start=1):
+            for img_idx, img in enumerate(page.get_images(full=True), start=1):
+                base_image = pdf.extract_image(img[0])  # img[0] is the xref
+                filename = (
+                    f"{doc_tag}_page_{page_num}_img_{img_idx}.{base_image['ext']}"
+                )
+                path = os.path.join(IMAGES_DIR, filename)
+                with open(path, "wb") as f:
+                    f.write(base_image["image"])
+                saved_images.append(path)
+    return saved_images
+def index_images_in_chroma(image_paths):
+    """Add PNG/JPEG image files to the Chroma image collection by path.
+
+    Paths with any other extension are skipped. Each indexed image gets an
+    ID of the form ``img_<n>`` that is not yet present in the collection.
+
+    Args:
+        image_paths: Iterable of image file paths.
+    """
+    indexable = [
+        path for path in sorted(image_paths)
+        if path.endswith((".png", ".jpeg", ".jpg"))
+    ]
+    if not indexable:
+        return
+    # Read the existing IDs once (not per image) and skip any that are taken:
+    # numbering by position in the unfiltered list left gaps, so a later call
+    # could generate an ID that was already in use.
+    taken_ids = set(image_collection.get()["ids"])
+    next_index = len(taken_ids)
+    ids = []
+    for _ in indexable:
+        while f"img_{next_index}" in taken_ids:
+            next_index += 1
+        ids.append(f"img_{next_index}")
+        next_index += 1
+    image_collection.add(ids=ids, uris=indexable)
+def query_similar_images(image_file, top_k=5):
+    """Return the stored URIs of the images most similar to a query image.
+
+    Args:
+        image_file: Binary file-like object holding the query image.
+        top_k: Maximum number of matches to return.
+
+    Returns:
+        List of URIs (file paths) of the closest indexed images.
+    """
+    # The stream may already have been consumed (e.g. to display the image),
+    # so rewind it before copying the bytes out.
+    if hasattr(image_file, "seek"):
+        image_file.seek(0)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
+    try:
+        with tmp:
+            tmp.write(image_file.read())
+        # Request URIs explicitly; they are not part of Chroma's default
+        # `include` set for queries, so results['uris'] may otherwise be None.
+        results = image_collection.query(
+            query_uris=[tmp.name], n_results=top_k, include=["uris"]
+        )
+    finally:
+        # Always delete the temporary copy, even when the query fails.
+        os.remove(tmp.name)
+    return results["uris"][0]
+# Streamlit UI
+st.title("🔍 Image Search from PDF (HR Tool Demo)")
+with st.expander("📤 Step 1: Upload PDF to Extract Images"):
+    uploaded_pdf = st.file_uploader("Upload a PDF file", type=["pdf"])
+    if uploaded_pdf is not None:
+        # Streamlit re-runs this script on every interaction. Remember which
+        # uploads were already processed in this session so the same PDF is
+        # not re-extracted and re-indexed (duplicating entries) on each rerun.
+        if "processed_pdfs" not in st.session_state:
+            st.session_state["processed_pdfs"] = {}
+        pdf_key = (uploaded_pdf.name, uploaded_pdf.size)
+        if pdf_key not in st.session_state["processed_pdfs"]:
+            with st.spinner("Extracting images..."):
+                # getvalue() returns the full content regardless of the
+                # current read position of the uploaded file.
+                extracted = extract_images_from_pdf(uploaded_pdf.getvalue())
+                index_images_in_chroma(extracted)
+            st.session_state["processed_pdfs"][pdf_key] = extracted
+        saved_images = st.session_state["processed_pdfs"][pdf_key]
+        st.success(f"Extracted and indexed {len(saved_images)} images.")
+        if saved_images:
+            # With a list of images, st.image expects one caption per image.
+            st.image(
+                saved_images,
+                caption=[os.path.basename(path) for path in saved_images],
+                width=150,
+            )
+st.divider()
+with st.expander("🖼️ Step 2: Search by Uploading a Query Image"):
+    query_img = st.file_uploader("Upload a query image", type=["jpg", "jpeg", "png"])
+    if query_img is not None:
+        st.image(query_img, caption="Query Image", width=200)
+        with st.spinner("Searching similar images..."):
+            results = query_similar_images(query_img, top_k=5)
+        st.subheader("🔎 Top Matches:")
+        for res_path in results:
+            st.image(res_path, width=200, caption=os.path.basename(res_path))