NEXAS committed on
Commit
04792be
·
verified ·
1 Parent(s): 2fa941f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +89 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,91 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
 
4
  import streamlit as st
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF
3
+ import chromadb
4
+ import tempfile
5
  import streamlit as st
6
+ from PIL import Image
7
+ from chromadb.utils.data_loaders import ImageLoader
8
+ from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
9
 
10
# Paths
DB_PATH = './data/image_vdb'
IMAGES_DIR = './data/extracted_images'
os.makedirs(IMAGES_DIR, exist_ok=True)


# Init Chroma
@st.cache_resource
def _init_chroma():
    """Build the Chroma client, image loader, embedder and collection once.

    Streamlit re-executes this script on every widget interaction. Without
    caching, each rerun would reopen the persistent store and reload the
    OpenCLIP model. ``st.cache_resource`` keeps a single shared instance of
    these objects for the lifetime of the server process.

    Returns:
        A ``(client, loader, embedder, collection)`` tuple.
    """
    client = chromadb.PersistentClient(path=DB_PATH)
    loader = ImageLoader()
    embedder = OpenCLIPEmbeddingFunction()
    collection = client.get_or_create_collection(
        name="image", embedding_function=embedder, data_loader=loader
    )
    return client, loader, embedder, collection


# Same four module-level names as before, so the rest of the file is unchanged.
chroma_client, image_loader, embedding_fn, image_collection = _init_chroma()
22
+
23
+ # Utilities
24
def extract_images_from_pdf(pdf_bytes):
    """Extract every embedded image from a PDF and write it to IMAGES_DIR.

    Args:
        pdf_bytes: Raw content of the PDF file (bytes-like).

    Returns:
        List of file paths of the saved images, in page order and then in
        per-page image order.
    """
    import hashlib  # local import: only this function needs it

    # Prefix filenames with a short digest of the PDF content. Names built
    # only from page/image numbers collide between PDFs, so a second upload
    # would overwrite files that are already indexed by their path.
    pdf_tag = hashlib.sha256(pdf_bytes).hexdigest()[:8]

    os.makedirs(IMAGES_DIR, exist_ok=True)
    saved_images = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)

            for img_idx, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first item of each image entry is its xref
                base_image = pdf.extract_image(xref)
                if not base_image:
                    # Nothing extractable for this xref; skip instead of
                    # failing on the missing "image"/"ext" keys.
                    continue

                filename = (
                    f"{pdf_tag}_page_{page_num + 1}_img_{img_idx}"
                    f".{base_image['ext']}"
                )
                path = os.path.join(IMAGES_DIR, filename)

                with open(path, "wb") as f:
                    f.write(base_image["image"])

                saved_images.append(path)

    return saved_images
46
+
47
def index_images_in_chroma(image_paths):
    """Add image files to the Chroma image collection by URI.

    Only paths ending in .png, .jpeg or .jpg (case-insensitive) are indexed;
    other paths are skipped. IDs have the form ``img_<n>``, where ``n`` is
    the collection size before this call plus the path's position in the
    sorted input.

    Args:
        image_paths: Iterable of image file paths.
    """
    # Read the collection size once. The previous code fetched every stored
    # record on each loop iteration just to take its length.
    base_index = image_collection.count()

    ids = []
    uris = []
    for offset, path in enumerate(sorted(image_paths)):
        if path.lower().endswith((".png", ".jpeg", ".jpg")):
            ids.append(f"img_{base_index + offset}")
            uris.append(path)

    # Skip the call entirely when no path had a supported extension.
    if ids:
        image_collection.add(ids=ids, uris=uris)
58
+
59
def query_similar_images(image_file, top_k=5):
    """Return the URIs of the indexed images most similar to an uploaded image.

    The upload is written to a temporary file because the collection is
    queried by URI. The temporary file is always removed afterwards.

    Args:
        image_file: File-like object holding the query image (for example a
            Streamlit uploaded file).
        top_k: Maximum number of matches to request.

    Returns:
        List of URIs (file paths) of the best matches; empty if the query
        returned none.
    """
    # Keep the upload's real extension when it is known, so a PNG is not
    # written to a file named *.jpg.
    upload_name = getattr(image_file, "name", "") or ""
    suffix = os.path.splitext(upload_name)[1].lower() or ".jpg"

    # getvalue() returns the whole buffer regardless of the current stream
    # position; read() would return nothing if the stream was already consumed.
    if hasattr(image_file, "getvalue"):
        payload = image_file.getvalue()
    else:
        payload = image_file.read()

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name

    try:
        # URIs are only returned when explicitly requested via ``include``.
        results = image_collection.query(
            query_uris=[tmp_path], n_results=top_k, include=["uris"]
        )
    finally:
        # Clean up even when the query raises.
        os.remove(tmp_path)

    uris = results.get("uris")
    return uris[0] if uris else []
67
+
68
# Streamlit UI
st.title("🔍 Image Search from PDF (HR Tool Demo)")

with st.expander("📤 Step 1: Upload PDF to Extract Images"):
    uploaded_pdf = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_pdf is not None:
        # Streamlit reruns the whole script on every interaction (including
        # the Step 2 upload). Remember which PDF was already processed so it
        # is not re-extracted and re-indexed on each rerun.
        pdf_key = (uploaded_pdf.name, uploaded_pdf.size)
        if st.session_state.get("indexed_pdf_key") != pdf_key:
            with st.spinner("Extracting images..."):
                extracted = extract_images_from_pdf(uploaded_pdf.getvalue())
                index_images_in_chroma(extracted)
            st.session_state["indexed_pdf_key"] = pdf_key
            st.session_state["indexed_pdf_images"] = extracted

        saved_images = st.session_state["indexed_pdf_images"]
        st.success(f"Extracted and indexed {len(saved_images)} images.")
        if saved_images:
            # A list of images needs a caption list of the same length; a
            # single string caption is rejected for any other image count.
            st.image(
                saved_images,
                caption=[os.path.basename(p) for p in saved_images],
                width=150,
            )

st.divider()

with st.expander("🖼️ Step 2: Search by Uploading a Query Image"):
    query_img = st.file_uploader("Upload a query image", type=["jpg", "jpeg", "png"])
    if query_img is not None:
        st.image(query_img, caption="Query Image", width=200)
        # Rewind in case displaying the image advanced the stream position.
        query_img.seek(0)
        with st.spinner("Searching similar images..."):
            results = query_similar_images(query_img, top_k=5)

        st.subheader("🔎 Top Matches:")
        if not results:
            st.info("No similar images found.")
        for res_path in results:
            # The index is persistent but the image files may have been
            # removed; skip missing files instead of failing the page.
            if os.path.exists(res_path):
                st.image(res_path, width=200, caption=os.path.basename(res_path))
            else:
                st.warning(f"Indexed image is missing on disk: {res_path}")