# -*- coding: utf-8 -*-
"""AdvancedRAG_CrossEncoder_Reranker_Zephyr7bAlpha_.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cpw-00tHts6d-z-yRAwu8SDPD6calQvB
"""
!pip install -q pypdf
!pip install -q python-dotenv
!pip install -q llama-index
!pip install -q gradio
!pip install einops
!pip install accelerate
!pip install sentence-transformers
!pip install cohere
!pip install --upgrade huggingface_hub

# Build llama-cpp-python with cuBLAS so GGUF models can be offloaded to the GPU
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
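# Optional sanity check (not in the original notebook): confirm a GPU is visible
# before loading the model, since llama-cpp-python was compiled with cuBLAS above.
import torch
print("CUDA available:", torch.cuda.is_available())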
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
import torch

# Load the source documents placed under /content/data
documents = SimpleDirectoryReader("/content/data").load_data()
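# Quick check (optional; assumes /content/data already contains your files):
# verify how many documents were loaded before indexing.
print(f"Loaded {len(documents)} document(s)")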
# Initialize our custom LLM (Zephyr-7B-alpha served via llama.cpp)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url='https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf',
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # keep the context window a bit below the model's limit to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); n_gpu_layers=-1 offloads all layers to the GPU
    # (set to at least 1 to use the GPU at all)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Llama-2 prompt format (helpers from llama_utils)
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
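# Optional smoke test (not in the original notebook): confirm the GGUF model
# loads and generates text before wiring it into the index.
print(llm.complete("What is retrieval-augmented generation?"))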
from llama_index.embeddings import HuggingFaceEmbedding

# Initialize our custom embeddings
# the default (no model_name) loads BAAI/bge-small-en:
# embed_model = HuggingFaceEmbedding()
# here we load BAAI/bge-small-en-v1.5 instead
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
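# Optional check (bge-small-en-v1.5 is assumed to produce 384-dim vectors):
# embed a sample string and inspect the embedding size.
sample_embedding = embed_model.get_text_embedding("hello world")
print("Embedding dimension:", len(sample_embedding))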
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
    embed_model=embed_model,
)
"""Advanced RAG with Cross Encoder Reranker . Referred from: https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy""" | |
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Initialize the reranker; it keeps only the top 3 chunks after rescoring
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3)  # Retrieves top 3 chunks
# Create the baseline query engine (no reranker)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()  # Without reranker

def predict(input, history):
    response = query_engine.query(input)
    return str(response)
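# Example call (hypothetical question; the right prompt depends on the documents you loaded):
# print(predict("Summarize the main topic of the document.", []))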
# Create the query engine with the cross-encoder reranker:
# retrieve the top 10 chunks first, then let the reranker keep the best 3
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])

def predict(input, history):
    response = query_engine.query(input)
    return str(response)
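# Optional inspection (a sketch, assuming the standard LlamaIndex response API):
# look at which chunks survived reranking and their cross-encoder scores.
response = query_engine.query("Summarize the main topic of the document.")  # hypothetical question
for node in response.source_nodes:
    print(round(node.score, 3), node.node.get_text()[:100])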
import time
import gradio as gr

def predict(input, history):
    start_time = time.time()  # Start the timer
    response = query_engine.query(input)  # Process the query
    end_time = time.time()  # Stop the timer
    response_time = end_time - start_time  # Calculate the time taken
    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)

# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)