davidberenstein1957 committed on
Commit
90b7917
·
verified ·
1 Parent(s): b82c93a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import get_token
7
 
8
  static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
9
  model = SentenceTransformer(modules=[static_embedding])
 
10
  dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
11
  embedding_column = "embeddings"
12
  table_name = "fineweb"
@@ -15,7 +16,7 @@ duckdb.sql(query=f"""
15
  INSTALL vss;
16
  LOAD vss;
17
  CREATE TABLE {table_name} AS
18
- SELECT *, {embedding_column}::float[{model.get_sentence_embedding_dimension()}] as embedding_float
19
  FROM 'hf://datasets/{dataset_name}/**/*.parquet';
20
  CREATE INDEX my_hnsw_index ON {table_name} USING HNSW (embedding_float) WITH (metric = 'cosine');
21
  """)
@@ -24,7 +25,7 @@ def similarity_search(query: str, k: int = 5):
24
  embedding = model.encode(query).tolist()
25
  return duckdb.sql(
26
  query=f"""
27
- SELECT chunk, url, array_cosine_distance({embedding_column}_float, {embedding}::FLOAT[{model.get_sentence_embedding_dimension()}]) as distance
28
  FROM {table_name}
29
  ORDER BY distance
30
  LIMIT {k};
@@ -34,7 +35,7 @@ def similarity_search(query: str, k: int = 5):
34
  with gr.Blocks() as demo:
35
  gr.Markdown("""# RAG - retrieve
36
 
37
- Part of [AI blueprint](https://github.com/davidberenstein1957/ai-blueprint) - a blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
38
  query = gr.Textbox(label="Query")
39
  k = gr.Slider(1, 50, value=5, label="Number of results")
40
  btn = gr.Button("Search")
 
7
 
8
  static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
9
  model = SentenceTransformer(modules=[static_embedding])
10
+ embedding_dimensions = model.get_sentence_embedding_dimension()
11
  dataset_name = "ai-blueprint/fineweb-bbc-news-embeddings"
12
  embedding_column = "embeddings"
13
  table_name = "fineweb"
 
16
  INSTALL vss;
17
  LOAD vss;
18
  CREATE TABLE {table_name} AS
19
+ SELECT *, {embedding_column}::float[{embedding_dimensions}] as {embedding_column}_float
20
  FROM 'hf://datasets/{dataset_name}/**/*.parquet';
21
  CREATE INDEX my_hnsw_index ON {table_name} USING HNSW (embedding_float) WITH (metric = 'cosine');
22
  """)
 
25
  embedding = model.encode(query).tolist()
26
  return duckdb.sql(
27
  query=f"""
28
+ SELECT chunk, url, array_cosine_distance({embedding_column}_float, {embedding}::FLOAT[{embedding_dimensions}]) as distance
29
  FROM {table_name}
30
  ORDER BY distance
31
  LIMIT {k};
 
35
  with gr.Blocks() as demo:
36
  gr.Markdown("""# RAG - retrieve
37
 
38
+ Part of [AI blueprint](https://github.com/huggingface/ai-blueprint) - a blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
39
  query = gr.Textbox(label="Query")
40
  k = gr.Slider(1, 50, value=5, label="Number of results")
41
  btn = gr.Button("Search")