# -*- coding: utf-8 -*-
"""AmiteshKumarDwivedi_IR_Project.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Y99JVHjS_jukrw_27BTOe1TAYUquFRSO

# **What is RAG?**

RAG stands for Retrieval Augmented Generation. It can be broken down into three steps:

* **Retrieval** - Seeking relevant information from a source given a query, e.g. "*What is a term incidence matrix?*" retrieves passages related to the term-document incidence matrix.
* **Augmented** - Using the retrieved information to modify the input prompt to the LLM with the relevant knowledge.
* **Generation** - Using the first two steps to generate an output for a given input.

The goal is to retrieve information and pass it to a large language model so it generates output grounded in the knowledge provided. A toy sketch of this flow follows the terms table below.

**Why use RAG?**

The main goal is to improve the quality of the generated output.

1. **Reduce hallucination** - LLMs are prone to hallucination (generating something that looks correct but isn't). RAG pipelines help LLMs produce more fact/knowledge-based output by providing fact-based input (like a textbook). Furthermore, even if we doubt an answer from a RAG pipeline, we always know which source to refer back to.
2. **Work with custom data** - LLMs are trained on text data: they model language well but lack specific knowledge. RAG can supply domain-specific knowledge to suit a user's information need and use case, e.g. looking up niche information from niche textbooks.
"""

import os

# if "COLAB_GPU" in os.environ:
#     print("Installing requirements.")
#     !pip install -U torch                         # requires torch 2.1.1+ (for efficient sdpa implementation)
#     !pip install PyMuPDF                          # for reading PDFs with Python
#     !pip install tqdm                             # for progress bars
#     !pip install sentence-transformers            # for embedding models
#     !pip install accelerate                       # for quantization model loading
#     !pip install bitsandbytes                     # for quantizing models (less storage space)
#     !pip install flash-attn --no-build-isolation  # for faster attention mechanism = faster LLM inference
#     !pip install nltk
#     !pip install spacy

"""# Some common terms to know before we proceed

Term | Description
-----|-------------
Token | The smallest meaningful unit of text that a computer can understand. For example, the sentence "hello, world!" could be broken down into the tokens "hello", ",", "world", and "!". A token can be a whole word, part of a word, or a group of punctuation marks. On average, 1 token is roughly equal to 4 characters in English, and 100 tokens is about 75 words. Before an LLM can process text, it needs to be broken down into tokens.
Embedding | A way of representing a piece of data (like a sentence or paragraph of text) as a list of numbers. Similar pieces of data (like sentences with similar meanings) will have similar numerical representations or "embeddings". An embedding for a sentence might be a list of 768 numbers, for example.
Embedding model | A type of computer program that takes in data (like text) and outputs a numerical representation or "embedding" of that data. For example, an embedding model might take in 384 tokens of text and convert it into a list of 768 numbers.
Similarity search/Vector search | A technique for finding data points (like text embeddings) that are "close" or similar to each other in a high-dimensional space. Text about similar topics should have embeddings with high similarity scores, while text on different topics should have lower scores. Common ways to measure similarity include dot product and cosine similarity.
Large Language Model (LLM) | A very large computer program that has learned patterns from a vast amount of text data. When given a piece of text, a generative LLM can continue the text in a way that seems natural and coherent based on the patterns it has learned. For example, if given "hello, world!", it might generate "we're going to build a program today!".
LLM context window | The amount of input data (measured in tokens) that an LLM can process at once. Larger LLMs can handle longer context windows. For example, as of March 2024, GPT-4 could process up to 128,000 tokens (about 384 pages) at once.
Prompt | The input data that is provided to an LLM to generate an output. The way the prompt is structured and framed can greatly influence the LLM's generated text. The technique of carefully designing prompts is called "prompt engineering".
"""
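"""To make the three steps concrete before building the real pipeline, here is a minimal toy sketch of the retrieve -> augment -> generate flow. Everything below (the `toy_passages` list, the keyword-overlap "retriever", the function names) is an illustrative stand-in, not the actual pipeline built in this notebook."""

# A tiny in-memory "knowledge base"
toy_passages = [
    "A term-document incidence matrix records which terms appear in which documents.",
    "An inverted index maps each term to the list of documents containing it.",
]

def toy_retrieve(query: str, passages: list[str], k: int = 1) -> list[str]:
    # Rank passages by naive keyword overlap with the query (a stand-in for embedding-based search)
    scored = sorted(passages,
                    key=lambda p: -len(set(query.lower().split()) & set(p.lower().split())))
    return scored[:k]

def toy_augment(query: str, context: list[str]) -> str:
    # Insert the retrieved context into the prompt that would be sent to an LLM
    joined_context = "\n".join(context)
    return f"Answer using this context:\n{joined_context}\n\nQuery: {query}"

# Generation would normally call an LLM with this augmented prompt; here we just print it
toy_query = "What is a term incidence matrix?"
print(toy_augment(toy_query, toy_retrieve(toy_query, toy_passages)))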
"""# Document Processing and Creation of Embeddings

## What we need:

* Information Retrieval Textbook
* Embedding model of choice

## Steps:

1. Import the Information Retrieval textbook - online or offline
2. Process the textbook for embedding - splitting into chunks of sentences
3. Embed the text chunks with an embedding model
4. Save the embeddings to a file for later

Importing and opening PDFs
"""

import requests

pdf_path = "ir_book.pdf"

if not os.path.exists(pdf_path):
    print("File not available, let me download it from the internet")

    # URL to download
    url = "https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf"

    # local filename to save the downloaded file
    filename = pdf_path

    # Send a GET request to the URL
    response = requests.get(url)

    # Check whether the request succeeded
    if response.status_code == 200:
        # open the file and save the content
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {filename}")
    else:
        print(f"Failed to download the file. Status Code: {response.status_code}")
else:
    print("File exists")

import fitz  # PyMuPDF
from tqdm import tqdm
import re
import spacy
import random

nlp = spacy.load("en_core_web_sm")

def textFormat(text: str) -> str:
    # (Optional) running header/footer patterns such as "Online edition (c) 2009 Cambridge UP",
    # "An Introduction to Information Retrieval" and "Draft of April 1, 2009" could be stripped here first.
    clean_text = text.replace("\n", " ")                     # remove newlines
    clean_text = re.sub(r"[^a-zA-Z0-9\s]", "", clean_text)   # remove non-alphanumeric characters
    clean_text = re.sub(r"\s+", " ", clean_text)             # collapse multiple spaces into one
    return clean_text.strip()

def open_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    page_n_text = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = textFormat(text=text)
        page_n_text.append({
            "page_number": page_number - 25,  # offset so numbers line up with the printed textbook (the PDF has ~25 pages of front matter)
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 7,  # rough estimate: 1 token ≈ 4 characters in English (100 tokens ≈ 75 words); divisor adjusted after EDA for the choice of embedding model
            "text": text
        })
    return page_n_text

page_n_text = open_read_pdf(pdf_path=pdf_path)
random.sample(page_n_text, k=3)

# EDA
import pandas as pd

df = pd.DataFrame(page_n_text)
df.head()

df.describe().round(3)

"""Token count is important since embedding models and LLMs don't deal with an unlimited number of tokens, so chunks need to stay within the embedding model's input limit."""
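"""The 1 token ≈ 4 characters rule used above is only a heuristic. If an exact count is needed (e.g. to check that chunks fit the embedding model's input limit), the model's own tokenizer can be used. A minimal sketch, assuming the `transformers` library is installed and that `sentence-transformers/all-mpnet-base-v2` (the embedding model used later) is the model we care about:"""

from transformers import AutoTokenizer

# Tokenizer matching the embedding model used later in this notebook (assumption: model ID is accessible)
hf_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

# Compare the exact token count against the chars/4 heuristic for one page
sample_text = page_n_text[30]["text"]
exact_tokens = len(hf_tokenizer.encode(sample_text))
approx_tokens = len(sample_text) / 4
print(f"Exact token count: {exact_tokens} | chars/4 estimate: {approx_tokens:.0f}")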
"""# Further Text Processing

I will be breaking the text into chunks of sentences.

Workflow:

```
Ingest Text -> split it into groups -> make embeddings -> use embeddings
```

Two ways of doing this:

1. Split on "."
2. Use spaCy or NLTK (already installed above)
"""

import spacy
from spacy.lang.en import English

nlp = English()

# add a sentencizer pipeline
# sentencizer - turns text into sentences
nlp.add_pipe("sentencizer")

for item in tqdm(page_n_text):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]  # make sure all sentences are strings
    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

random.sample(page_n_text, k=2)

import pandas as pd

df = pd.DataFrame(page_n_text)
df.describe().round(2)

"""### Chunking our sentences together

The concept of splitting large text into smaller pieces is referred to as chunking. I will be splitting into groups of 10 sentences.
"""

# define the split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10

# split a list into chunk-sized slices, e.g. a list of 20 sentences becomes 2 lists of 10
def split_list(input_list: list, slice_size: int = num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

# Loop through the pages and split their sentences into chunks
for item in tqdm(page_n_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

random.sample(page_n_text, k=2)

df = pd.DataFrame(page_n_text)
df.describe().round(2)

"""Splitting each chunk into its own item (a dictionary) so we can attach metadata and run other operations on it."""

page_n_chunk = []
for item in tqdm(page_n_text):
    for sentence_chunks in item["sentence_chunks"]:
        chunk_dic = {}
        chunk_dic["page_number"] = item["page_number"]

        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunks).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)  # ".A" -> ". A" for any full-stop/capital letter combo
        chunk_dic["sentence_chunk"] = joined_sentence_chunk

        # Get stats on the chunk
        chunk_dic["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dic["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dic["chunk_token_count"] = len(joined_sentence_chunk) / 4  # 1 token ≈ 4 characters, as per the heuristic used in pre-processing

        page_n_chunk.append(chunk_dic)

# How many chunks do we have?
len(page_n_chunk)

random.sample(page_n_chunk, k=3)

df = pd.DataFrame(page_n_chunk)
df.describe().round(2)

# Show random chunks under the minimum token length, as they are unlikely to be very useful
min_token_length = 20
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

# Filter out chunks below the minimum token length
pages_n_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_n_chunks_over_min_token_len[:10]

random.sample(pages_n_chunks_over_min_token_len, k=2)

"""Now we create embeddings for our text chunks. Embeddings are a powerful concept because machines work with numbers rather than free-form language. Embeddings are useful numerical representations of text data - they are a learned representation. Vicki Boykis has a great blog post on this which I referred to."""
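"""Note: the next cell loads the embedding model with `device="cuda"`, which assumes a GPU is available. A minimal fallback sketch (the same `torch.cuda.is_available()` check is used later in this notebook):"""

import torch

# Pick the GPU if one is available, otherwise fall back to the CPU
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Embedding model would run on: {embedding_device}")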
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")

sentences = [
    "RAG combines the power of retrieval and generation to enhance the quality of AI-generated text.",
    "It retrieves relevant documents to provide context, which is then used by a generator to produce coherent and informed responses.",
    "This method leverages both neural retrieval and transformer-based generative models, merging the best of both worlds.",
    "Using RAG can significantly improve the informativeness and accuracy of responses in natural language processing tasks."
]

# Sentences are encoded/embedded by calling model.encode()
embeddings = embedding_model.encode(sentences)
embeddings_dict = dict(zip(sentences, embeddings))

# See the embeddings
for sentence, embedding in embeddings_dict.items():
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

embeddings[0].shape

# Commented out IPython magic to ensure Python compatibility.
# %%time
# # ~2.5 min on CPU vs ~20.8 s on CUDA/GPU
# # Switch the device below to "cpu" to see how long it takes to create embeddings on the CPU
# embedding_model.to("cuda")
#
# # Embed each chunk one by one
# for item in tqdm(pages_n_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

# Saving the embeddings to a file
# Note: the "embedding" values come from the %%time cell above (commented out by the Colab export)
text_chunks_and_embeddings_df = pd.DataFrame(pages_n_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

# Importing the saved file to view it
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

"""Now I will use these embeddings to retrieve relevant passages based on a query, and use those passages to augment the input to the LLM so it generates an output grounded in them."""

import random
import torch
import numpy as np
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

# importing the texts and embeddings df
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Convert the embedding column back to np.array (it got converted to a string when it was saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert the texts-and-embeddings df to a list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to a torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

"""Now I will create a small semantic search pipeline - I want to search for a query (e.g. "vector databases") and get relevant passages from the textbook.

Steps:

1. Define a query string.
2. Turn the query string into an embedding.
3. Compute dot product / cosine similarity between the query embedding and the chunk embeddings.
4. Rank the results in decreasing order of score.
"""
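"""The next cell scores with `util.dot_score`. sentence-transformers also provides `util.cos_sim` for cosine similarity - a minimal toy sketch (the vectors `a` and `b` below are made up for illustration; the real pipeline scores the query embedding against all chunk embeddings):"""

import torch
from sentence_transformers import util

a = torch.tensor([[1.0, 2.0, 3.0]])                       # "query" vector
b = torch.tensor([[2.0, 4.0, 6.0], [-1.0, -2.0, -3.0]])   # two "document" vectors
print(util.dot_score(a, b))  # magnitudes matter: tensor([[ 28., -14.]])
print(util.cos_sim(a, b))    # direction only:   tensor([[ 1., -1.]])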
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)  # choose the device to load the model to

query = "vector databases"
print(f"Query: {query}")

# Embed the query with the same embedding model
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# Score with the dot product (cosine similarity is explored further below)
dot_score = util.dot_score(a=query_embedding, b=embeddings)[0]
top_res_dot_product = torch.topk(dot_score, k=5)
top_res_dot_product

# defining a helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

print(f"Query: '{query}'\n")
print("Results:")
# Loop through the zipped scores and indices from torch.topk
for score, idx in zip(top_res_dot_product[0], top_res_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print the relevant sentence chunk (since the scores are in descending order, the most relevant chunk is first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

import fitz

# Open the PDF and load a target page
pdf_path = "ir_book.pdf"  # requires the PDF to be downloaded
doc = fitz.open(pdf_path)
page = doc.load_page(5 + 25)  # textbook page number + 25, since the PDF has ~25 pages of front matter

# Get the image of the page
img = page.get_pixmap(dpi=300)

# Optional: save the image
# img.save("output_filename.png")
doc.close()

# Convert the Pixmap to a numpy array
img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))

# Display the image using Matplotlib
import matplotlib.pyplot as plt

plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis('off')  # turn off the axis
plt.show()

"""Now I'll explore both the dot-product and cosine similarity metrics."""

import torch

def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)

def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)

    # Get the Euclidean/L2 norm of each vector (removes the magnitude, keeps the direction)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))

    return dot_product / (norm_vector1 * norm_vector2)

# Example tensors
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)

# Calculate the dot product
print("Dot product between vector1 and vector2:", dot_product(vector1, vector2))
print("Dot product between vector1 and vector3:", dot_product(vector1, vector3))
print("Dot product between vector1 and vector4:", dot_product(vector1, vector4))

# Calculate cosine similarity
print("Cosine similarity between vector1 and vector2:", cosine_similarity(vector1, vector2))
print("Cosine similarity between vector1 and vector3:", cosine_similarity(vector1, vector3))
print("Cosine similarity between vector1 and vector4:", cosine_similarity(vector1, vector4))
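"""Working through vector1 = [1, 2, 3] and vector3 = [4, 5, 6] by hand:

dot(v1, v3) = 1*4 + 2*5 + 3*6 = 32, while
cos(v1, v3) = 32 / (sqrt(14) * sqrt(77)) ≈ 0.9746.

The dot product grows with vector magnitude, whereas cosine similarity only reflects direction - which is why v1 and v2 (identical) give cosine 1.0 and v1 and v4 (opposite direction) give -1.0.
"""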
"""Now I'll functionize my semantic search pipeline."""

from timeit import default_timer as timer

def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer = embedding_model,
                                n_resources_to_return: int = 5,
                                print_time: bool = True):
    """
    Embeds a query with the model and returns the top k scores and indices from embeddings.
    """
    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Get dot product scores on the embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict] = pages_and_chunks,
                                 n_resources_to_return: int = 5):
    """
    Takes a query, retrieves the most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through the zipped scores and indices
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print the relevant sentence chunk (since the scores are in descending order, the most relevant chunk is first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

# testing
query = "what is a vector database"

# Get just the scores and indices of the top related results
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)
scores, indices

print_top_results_and_scores(query=query, embeddings=embeddings)

import torch

gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    use_quantization_config = True  # fall back to the smallest option so the variables below are always defined
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

# !pip install huggingface_hub
# !pip install --upgrade transformers

"""Loading the LLM locally"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available
from huggingface_hub import notebook_login

notebook_login()  # use your Hugging Face access token
# 1. Create a quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Set up Flash Attention 2 for faster inference, defaulting to "sdpa" (scaled dot product attention) if it's not available
# Flash Attention 2 requires an NVIDIA GPU with compute capability 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick the model we'd like to use (this will depend on how much GPU memory you have available)
model_id = "google/gemma-7b-it"  # overrides the auto-selected ID above; ensure this model ID is accessible or replace it with an accessible one
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate the tokenizer (the tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
use_quantization_config = True  # overrides the auto-selected value above; set to False to load in float16 instead
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16,  # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False,  # use full memory
                                                 attn_implementation=attn_implementation)  # which attention version to use

if not use_quantization_config:  # quantization takes care of device placement automatically, so if it's not used, send the model to the GPU
    llm_model.to("cuda")

"""Generating text. The tokenized input is produced by passing a string of text to the tokenizer."""

input_text = "What is a Vector Database?"
print(f"Input text:\n{input_text}")

# Create a prompt template for the instruction-tuned model
dialogue_template = [
    {"role": "user", "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,  # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")
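"""For reference, Gemma's chat template wraps each turn in special markers, so the formatted prompt printed above should look roughly like this (illustrative only - the exact special tokens come from the tokenizer's chat template):

```
<bos><start_of_turn>user
What is a Vector Database?<end_of_turn>
<start_of_turn>model
```
"""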
# Commented out IPython magic to ensure Python compatibility.
# %%time
#
# # Tokenize the input text (turn it into numbers) and send it to the GPU
# input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
# print(f"Model input (tokenized):\n{input_ids}\n")
#
# # Generate outputs based on the tokenized input
# # See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig
# outputs = llm_model.generate(**input_ids,
#                              max_new_tokens=256)  # the maximum number of new tokens to create
# print(f"Model output (tokens):\n{outputs[0]}\n")

"""Decoding the output now."""

# Decode the output tokens to text
# Note: `outputs` comes from the %%time cell above (commented out by the Colab export)
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

"""Formatting the output text to remove the prompt and special tokens."""

print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

"""Now I will try out some questions from ChatGPT 3.5, along with some of my own questions."""

chatgpt_questions = [
    "How does the textbook describe the process of constructing an inverted index?",
    "What are the key differences between Boolean retrieval and vector space models?",
    "What does the textbook say about the evaluation of information retrieval systems?"
]

manual_questions = [
    "What is Boolean IR?",
    "What are the types of term weighting schemes?",
]

query_list = chatgpt_questions + manual_questions

# Checking that retrieve_relevant_resources works with my queries
import random

query = random.choice(query_list)
print(f"Query: {query}")

# Get just the scores and indices of the top related results
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)
scores, indices

"""Now I will focus on augmentation. In augmentation, we take the results from our search for relevant resources and insert them into the prompt we give to the LLM. We start with a base prompt and update it with the retrieved text as context."""
def prompt_formatter(query: str, context_items: list[dict]) -> str:
    """
    Augments the query with text-based context from context_items.
    """
    # Join the context items into one bulleted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
Extract relevant passages from the context before answering the query, but do not return the extraction process in your response.
Ensure that the answers are explanatory, leveraging the technical details and examples provided in the textbook as needed.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: How does an inverted index support efficient query processing?
Answer: An inverted index enhances query processing efficiency by mapping each term found in documents to its corresponding list of documents, thus avoiding a linear scan of all documents. This is achieved by maintaining a dictionary where each term is linked to a postings list that records all documents containing the term. To answer a query, the system retrieves the postings lists for the query terms and intersects them, which is efficient due to the sorted nature of these lists. This method significantly reduces the time required to find documents that meet the query criteria compared to scanning each document sequentially.

Example 2:
Query: What role does tokenization play in text preprocessing for information retrieval?
Answer: Tokenization is a crucial step in text preprocessing for information retrieval, where it involves breaking down text into smaller pieces or tokens. This process is fundamental because it determines the granularity at which information is indexed and retrieved. Proper tokenization helps in identifying meaningful elements in the text, such as words or phrases, that are used to build the index. Effective tokenization directly impacts the retrieval effectiveness, as it influences both the construction of the inverted index and the accuracy of the response to user queries.

Example 3:
Query: What are the advantages of using vector space models for information retrieval?
Answer: Vector space models offer significant advantages for information retrieval by allowing the ranking of documents based on their relevance to a query, unlike Boolean models that provide binary results. This model represents both documents and queries as vectors in a multi-dimensional space where each dimension corresponds to a separate term. Relevance is calculated based on the cosine similarity between these vectors, enabling a more nuanced identification of documents that are most likely to satisfy the user's information need. This method facilitates effective retrieval by accommodating partial matching and ranking documents based on their query relevance score.

Now use the following context items from your textbook to answer the user query:
{context}

Relevant passages:
User query: {query}
Answer:"""

    # Update the base prompt with the context items and query
    base_prompt = base_prompt.format(context=context, query=query)

    # Create a prompt template for the instruction-tuned model
    dialogue_template = [
        {"role": "user", "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                           tokenize=False,
                                           add_generation_prompt=True)
    return prompt

# Trying out the above function
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format the prompt with the context items
prompt = prompt_formatter(query=query, context_items=context_items)
print(prompt)

"""Tokenizing the above and passing it to our LLM now."""

# Commented out IPython magic to ensure Python compatibility.
# %%time
#
# input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
#
# # Generate an output of tokens
# outputs = llm_model.generate(**input_ids,
#                              temperature=0.7,  # lower temperature = more deterministic outputs, higher temperature = more creative outputs
#                              do_sample=True,  # whether or not to use sampling, referenced from https://huyenchip.com/2024/01/16/sampling.html
#                              max_new_tokens=256)  # how many new tokens to generate from the prompt
#
# # Turn the output tokens into text
# output_text = tokenizer.decode(outputs[0])
#
# print(f"Query: {query}")
# print(f"RAG answer:\n{output_text.replace(prompt, '')}")
"""Functionizing the generation step to make it easier. Also formatting the output text to make it easier to read, and adding an option to return the context items."""

def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """
    # Get just the scores and indices of the top related results
    scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings)

    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add the score to each context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu()  # return the score back to the CPU

    # Format the prompt with the context items
    prompt = prompt_formatter(query=query, context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Remove the prompt, special tokens and unnecessary helper text
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

"""Trying out the above"""

query = random.choice(query_list)
print(f"Query: {query}")

# Answer the query with context and return the context
answer, context_items = ask(query=query,
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
context_items

# !pip install gradio

import gradio as gr

# Function to integrate with Gradio
def rag_chatbot(query):
    answer_text = ask(query, return_answer_only=True)  # this ensures we only get the answer
    return answer_text

# Gradio interface setup
interface = gr.Interface(
    fn=rag_chatbot,
    inputs="text",
    outputs="text",
    title="RAG Chatbot",
    description="This is a Retrieval-Augmented Generation (RAG) Chatbot for Information Retrieval Textbook queries."
)

# Launch the Gradio app
# interface.launch(share=True)

# !git clone git@hf.co:spaces/AmiDwivedi/IR_Project
# !gradio deploy
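"""As a quick local sanity check of the full pipeline, a minimal sketch that pushes one example question through the Gradio handler defined above (assumes the LLM, tokenizer and embeddings are already loaded):"""

# Illustrative example question about the IR textbook
test_query = "How does an inverted index support efficient query processing?"
print_wrapped(rag_chatbot(test_query))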