Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import get_token
|
|
7 |
|
8 |
static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
|
9 |
model = SentenceTransformer(modules=[static_embedding])
|
|
|
10 |
dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
|
11 |
embedding_column = "embeddings"
|
12 |
table_name = "fineweb"
|
@@ -15,7 +16,7 @@ duckdb.sql(query=f"""
|
|
15 |
INSTALL vss;
|
16 |
LOAD vss;
|
17 |
CREATE TABLE {table_name} AS
|
18 |
-
SELECT *, {embedding_column}::float[{
|
19 |
FROM 'hf://datasets/{dataset_name}/**/*.parquet';
|
20 |
CREATE INDEX my_hnsw_index ON {table_name} USING HNSW (embedding_float) WITH (metric = 'cosine');
|
21 |
""")
|
@@ -24,7 +25,7 @@ def similarity_search(query: str, k: int = 5):
|
|
24 |
embedding = model.encode(query).tolist()
|
25 |
return duckdb.sql(
|
26 |
query=f"""
|
27 |
-
SELECT chunk, url, array_cosine_distance({embedding_column}_float, {embedding}::FLOAT[{
|
28 |
FROM {table_name}
|
29 |
ORDER BY distance
|
30 |
LIMIT {k};
|
@@ -34,7 +35,7 @@ def similarity_search(query: str, k: int = 5):
|
|
34 |
with gr.Blocks() as demo:
|
35 |
gr.Markdown("""# RAG - retrieve
|
36 |
|
37 |
-
Part of [AI blueprint](https://github.com/
|
38 |
query = gr.Textbox(label="Query")
|
39 |
k = gr.Slider(1, 50, value=5, label="Number of results")
|
40 |
btn = gr.Button("Search")
|
|
|
7 |
|
8 |
static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
|
9 |
model = SentenceTransformer(modules=[static_embedding])
|
10 |
+
embedding_dimensions = model.get_sentence_embedding_dimension()
|
11 |
dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
|
12 |
embedding_column = "embeddings"
|
13 |
table_name = "fineweb"
|
|
|
16 |
INSTALL vss;
|
17 |
LOAD vss;
|
18 |
CREATE TABLE {table_name} AS
|
19 |
+
SELECT *, {embedding_column}::float[{embedding_dimensions}] as {embedding_column}_float
|
20 |
FROM 'hf://datasets/{dataset_name}/**/*.parquet';
|
21 |
CREATE INDEX my_hnsw_index ON {table_name} USING HNSW (embedding_float) WITH (metric = 'cosine');
|
22 |
""")
|
|
|
25 |
embedding = model.encode(query).tolist()
|
26 |
return duckdb.sql(
|
27 |
query=f"""
|
28 |
+
SELECT chunk, url, array_cosine_distance({embedding_column}_float, {embedding}::FLOAT[{embedding_dimensions}]) as distance
|
29 |
FROM {table_name}
|
30 |
ORDER BY distance
|
31 |
LIMIT {k};
|
|
|
35 |
with gr.Blocks() as demo:
|
36 |
gr.Markdown("""# RAG - retrieve
|
37 |
|
38 |
+
Part of [AI blueprint](https://github.com/huggingface/ai-blueprint) - a blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
|
39 |
query = gr.Textbox(label="Query")
|
40 |
k = gr.Slider(1, 50, value=5, label="Number of results")
|
41 |
btn = gr.Button("Search")
|