Spaces:

not-lain
/

RAG-Chatbot

Running on Zero

App Files Files Community

not-lain committed on Apr 5, 2024

Commit

1b7e4b0

1 Parent(s): 3ed215d

🌘w🌒

Browse files

Files changed (2) hide show

app.py +90 -103
requirements.txt +4 -4

app.py CHANGED Viewed

@@ -1,106 +1,90 @@
 import gradio as gr
-from datasets import load_dataset
-from sentence_transformers import SentenceTransformer
-from sentence_transformers.quantization import quantize_embeddings
-import faiss
-from usearch.index import Index
 import os
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
-from huggingface_hub import hf_hub_download
 token = os.environ["HF_TOKEN"]
-model = AutoModelForCausalLM.from_pretrained("google/gemma-7b-it",
-                                             # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                                             torch_dtype=torch.float16,
-                                             token=token)
-tok = AutoTokenizer.from_pretrained("google/gemma-7b-it",token=token)
-device = torch.device('cuda')
 model = model.to(device)
-# Load titles and texts
-title_text_dataset = load_dataset(
-    "mixedbread-ai/wikipedia-data-en-2023-11", split="train", num_proc=4
-).select_columns(["title", "text"])
-# Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.
-path_int8_view = hf_hub_download(repo_id="sentence-transformers/quantized-retrieval",repo_type="space", filename="wikipedia_ubinary_faiss_1m.index")
-int8_view = Index.restore(path_int8_view, view=True)
-path_binary_index = hf_hub_download(repo_id="sentence-transformers/quantized-retrieval",repo_type="space", filename="wikipedia_ubinary_faiss_1m.index")
-binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary(
-    path_binary_index
-)
-# Load the SentenceTransformer model for embedding the queries
-model = SentenceTransformer(
-    "mixedbread-ai/mxbai-embed-large-v1",
-    prompts={
-        "retrieval": "Represent this sentence for searching relevant passages: ",
-    },
-    default_prompt_name="retrieval",
 )
-def search(
-    query, top_k: int = 10, rescore_multiplier: int = 1, use_approx: bool = False
-):
-    # 1. Embed the query as float32
-    query_embedding = model.encode(query)
-    # 2. Quantize the query to ubinary
-    query_embedding_ubinary = quantize_embeddings(
-        query_embedding.reshape(1, -1), "ubinary"
     )
-    # 3. Search the binary index (either exact or approximate)
-    index = binary_index
-    _scores, binary_ids = index.search(
-        query_embedding_ubinary, top_k * rescore_multiplier
-    )
-    binary_ids = binary_ids[0]
-    # 4. Load the corresponding int8 embeddings
-    int8_embeddings = int8_view[binary_ids].astype(int)
-    # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings
-    scores = query_embedding @ int8_embeddings.T
-    # 6. Sort the scores and return the top_k
-    indices = scores.argsort()[::-1][:top_k]
-    top_k_indices = binary_ids[indices]
-    top_k_scores = scores[indices]
-    top_k_titles, top_k_texts = zip(
-        *[
-            (title_text_dataset[idx]["title"], title_text_dataset[idx]["text"])
-            for idx in top_k_indices.tolist()
-        ]
-    )
-    df = {
-            "Score": [round(value, 2) for value in top_k_scores],
-            "Title": top_k_titles,
-            "Text": top_k_texts,
-        }
-    return df
-def prepare_prompt(query, df):
-    prompt = f"Query: {query}\nContinue to answer the query by using the Search Results:\n"
-    for data in df :
-        title = data["Title"]
-        text = data["Text"]
-        prompt+=f"Title: {title}, Text: {text}\n"
-    return prompt
 @spaces.GPU
 def talk(message, history):
-    df = search(message)
-    message = prepare_prompt(message,df)
     resources = "\nRESOURCES:\n"
-    for title in df["Title"][:3] :
-        resources+=f"[{title}](https://huggingface.co/spaces/not-lain/RAG), "
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
@@ -112,7 +96,8 @@ def talk(message, history):
     # Tokenize the messages string
     model_inputs = tok([messages], return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(
-        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
@@ -131,33 +116,35 @@ def talk(message, history):
     for new_text in streamer:
         partial_text += new_text
         yield partial_text
-    partial_text+= resources
     yield partial_text
 TITLE = "RAG"
 DESCRIPTION = """
 ## Resources used to build this project
-* https://huggingface.co/learn/cookbook/rag_with_hugging_face_gemma_mongodb
-* https://huggingface.co/spaces/sentence-transformers/quantized-retrieval
-## Retrival paramaters
-```python
-top_k: int = 10, rescore_multiplier: int = 1, use_approx: bool = False
-```
 ## Models
 the models used in this space are :
 * google/gemma-7b-it
-* mixedbread-ai/wikipedia-data-en-2023-11
 """
-demo = gr.ChatInterface(fn=talk,
-                        chatbot=gr.Chatbot(show_label=True, show_share_button=True, show_copy_button=True, likeable=True, layout="bubble", bubble_full_width=False),
-                        theme="Soft",
-                        examples=[["what is machine learning"]],
-                        title=TITLE,
-                        description=DESCRIPTION)
 demo.launch()

 import gradio as gr
+from datasets import load_dataset, Dataset
+# import faiss
 import os
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
+from ragatouille import RAGPretrainedModel
+from datasets import load_dataset
 token = os.environ["HF_TOKEN"]
+# HF_TOKEN is read from the environment (a missing variable raises KeyError)
+# and is passed to both from_pretrained calls below.
+# Load the chat model with float16 weights.
+model = AutoModelForCausalLM.from_pretrained(
+    "google/gemma-7b-it",
+    # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    torch_dtype=torch.float16,
+    token=token,
+)
+# Tokenizer loaded from the same checkpoint id as the model.
+tok = AutoTokenizer.from_pretrained("google/gemma-7b-it", token=token)
+# The device is fixed to CUDA; there is no CPU fallback here.
+device = torch.device("cuda")
 model = model.to(device)
+RAG = RAGPretrainedModel.from_pretrained("mixedbread-ai/mxbai-colbert-v1")
+# NOTE(review): DESCRIPTION links to "mxbai-colbert-large-v1" while this loads
+# "mxbai-colbert-v1" - confirm the checkpoint id is the intended one.
+# prepare data
+# the full dump is too big, so only the first NUM_DOCUMENTS rows are kept
+NUM_DOCUMENTS = 3000
+dataset = load_dataset(
+    "wikimedia/wikipedia", "20231101.en", split="train", streaming=True
 )
+# Dataset.add_item returns a new Dataset instead of mutating in place, so the
+# streamed rows are gathered in a plain list and converted once at the end.
+rows = []
+for i, entry in enumerate(dataset):
+    # each entry has the following columns
+    # ['id', 'url', 'title', 'text']
+    rows.append(entry)
+    if len(rows) == NUM_DOCUMENTS:
+        break
+data = Dataset.from_list(rows)
+# free memory
+del dataset, rows  # we keep data
+# index data
+documents = data["text"]
+# The indexer may split long articles into several passages, so passage ids
+# do not line up with rows of `data`. Each row number is attached as the
+# document id so that search results can be mapped back to their source row.
+RAG.index(
+    documents,
+    document_ids=[str(row) for row in range(len(documents))],
+    index_name="wikipedia",
+    use_faiss=True,
+)
+# free memory
+del documents
+def search(query, k: int = 5):
+    """Return the row numbers in `data` of the best matches for `query`.
+
+    Args:
+        query: Text to search the "wikipedia" index for.
+        k: Number of passages to request from the index.
+
+    Returns:
+        Row numbers in the order the index returned them (results come back
+        sorted by score), with repeats removed, so there may be fewer than
+        `k` entries when several passages belong to the same article.
+    """
+    results = RAG.search(query, k=k)
+    # Each result is a dict with 'content', 'score', 'rank', 'document_id'
+    # and 'passage_id'. 'passage_id' numbers passages, not rows, so the row
+    # number is recovered from the document id assigned at index time.
+    row_ids = [int(result["document_id"]) for result in results]
+    # dict.fromkeys keeps first-seen order while dropping duplicates.
+    return list(dict.fromkeys(row_ids))
+def prepare_prompt(query, indexes, data=data):
+    """Build the LLM prompt from the rows selected by the retriever.
+
+    Args:
+        query: The user's question.
+        indexes: Row numbers into `data` to include as search results.
+        data: Dataset whose rows provide 'title', 'text' and 'url'; defaults
+            to the module-level dataset bound when this function is defined.
+
+    Returns:
+        A tuple `(prompt, (titles, urls))` where `titles` and `urls` are
+        parallel lists, one entry per row in `indexes`.
+    """
+    prompt = (
+        f"Query: {query}\nContinue to answer the query by using the Search Results:\n"
     )
+    titles = []
+    urls = []
+    for i in indexes:
+        # Read the row from `data` so the `data` argument is honoured and each
+        # index selects a whole record rather than a position inside a string.
+        row = data[i]
+        title = row["title"]
+        text = row["text"]
+        url = row["url"]
+        titles.append(title)
+        urls.append(url)
+        prompt += f"Title: {title}, Text: {text}\n"
+    return prompt, (titles, urls)
 @spaces.GPU
 def talk(message, history):
+    """Answer `message` using retrieved context and stream the reply.
+
+    Args:
+        message: The user's latest message; it is replaced by the prompt
+            built from the search results before generation.
+        history: Earlier turns; `item[0]` of each entry is the user's text.
+
+    Yields:
+        The reply text accumulated so far, and finally the full reply with
+        the RESOURCES link list appended.
+    """
+    indexes = search(message)
+    message,metadata = prepare_prompt(message, indexes)
     resources = "\nRESOURCES:\n"
+    # metadata is (titles, urls): two parallel lists. zip(*metadata) pairs each
+    # title with its url; iterating metadata directly would yield the lists.
+    for title, url in zip(*metadata):
+        resources += f"[{title}]({url}),  "
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
     # Tokenize the messages string
     model_inputs = tok([messages], return_tensors="pt").to(device)
     streamer = TextIteratorStreamer(
+        tok, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+    )
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
     for new_text in streamer:
         partial_text += new_text
         yield partial_text
+    partial_text += resources
     yield partial_text
 TITLE = "RAG"
+# Markdown text passed to the chat interface as its description.
 DESCRIPTION = """
 ## Resources used to build this project
+* https://huggingface.co/mixedbread-ai/mxbai-colbert-large-v1
+* me 😎
 ## Models
 the models used in this space are :
 * google/gemma-7b-it
+* mixedbread-ai/mxbai-colbert-v1
 """
+# Chat UI: `talk` handles every message, and one example prompt is offered.
+demo = gr.ChatInterface(
+    fn=talk,
+    chatbot=gr.Chatbot(
+        show_label=True,
+        show_share_button=True,
+        show_copy_button=True,
+        likeable=True,
+        layout="bubble",
+        bubble_full_width=False,
+    ),
+    theme="Soft",
+    examples=[["what is machine learning"]],
+    title=TITLE,
+    description=DESCRIPTION,
+)
+# Start the app; this runs at import time because the script has no main guard.
 demo.launch()

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 spaces
 torch==2.2.0
-git+https://github.com/huggingface/transformers/
-git+https://github.com/tomaarsen/sentence-transformers@feat/quantization
-usearch
-faiss-cpu

 spaces
 torch==2.2.0
+transformers
+faiss-gpu
+ragatouille
+datasets