# Hugging Face Space (status: Running)
import gradio as gr
from sentence_transformers import SentenceTransformer
import duckdb
from huggingface_hub import get_token

# Embedding model, loaded once at import time so every search reuses it.
model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m-v1.5")
def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint-project/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    """Return the ``k`` dataset rows closest to ``query`` by cosine distance.

    The query text is embedded with the module-level ``model`` and compared
    against ``embedding_column`` in the Parquet files of the Hugging Face
    dataset ``dataset_name``, which DuckDB reads through an ``hf://`` path.

    Args:
        query: Free-text search query.
        k: Maximum number of rows to return. Non-integer values are
            truncated and negative values are treated as 0.
        dataset_name: Hugging Face dataset repository id (``namespace/name``).
        embedding_column: Name of the column holding the stored embeddings.

    Returns:
        A DataFrame with the columns ``title``, ``author``, ``date``,
        ``local``, ``tags``, ``URL``, ``chunk`` and ``distance``, ordered by
        ascending distance.
    """
    # UI sliders may hand over floats (e.g. 5.0 or 5.37); LIMIT needs a
    # non-negative integer, so coerce before it reaches the SQL text.
    limit = max(int(k), 0)

    # Use same model as used for indexing
    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    # Quote the pieces that are spliced into the statement so a stray quote
    # character cannot change the query: doubled quotes are the SQL escape
    # for both identifiers ("...") and string literals ('...').
    column_sql = '"' + embedding_column.replace('"', '""') + '"'
    source_sql = "'hf://datasets/" + dataset_name.replace("'", "''") + "/**/*.parquet'"

    # The vector is rendered as a list literal of floats produced by the
    # model itself, so it carries no user-controlled text.
    vector_sql = f"{query_vector.tolist()}::float[{embedding_dim}]"

    sql = f"""
        SELECT
            title,
            author,
            date,
            local,
            tags,
            URL,
            chunk,
            array_cosine_distance(
                {column_sql}::float[{embedding_dim}],
                {vector_sql}
            ) as distance
        FROM {source_sql}
        ORDER BY distance
        LIMIT {limit}
    """
    return duckdb.sql(sql).to_df()
# Minimal search UI: a query box, a result-count slider and a results table.
with gr.Blocks() as demo:
    query = gr.Textbox(label="Query")
    # step=1 keeps the slider on whole numbers; a result count must be an
    # integer because it ends up in the SQL LIMIT clause.
    k = gr.Slider(1, 10, value=5, step=1, label="Number of results")
    btn = gr.Button("Search")
    # Headers mirror the columns selected by similarity_search's query.
    results = gr.Dataframe(
        headers=[
            "title",
            "author",
            "date",
            "local",
            "tags",
            "URL",
            "chunk",
            "distance",
        ]
    )
    btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])

demo.launch()