# -*- coding: utf-8 -*-
"""AdvancedRAG_CrossEncoder_Reranker_Zephyr7bAlpha_.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1cpw-00tHts6d-z-yRAwu8SDPD6calQvB
"""
!pip install -q pypdf
!pip install -q python-dotenv
!pip install -q llama-index
!pip install -q gradio
!pip install einops
!pip install accelerate
!pip install sentence-transformers
!pip install cohere
!pip install --upgrade huggingface_hub
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
import torch
# Load the source documents from /content/data
documents = SimpleDirectoryReader("/content/data").load_data()
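# Quick sanity check (not in the original notebook): confirm that documents were
# actually picked up from /content/data before building the index.
print(f"Loaded {len(documents)} document(s) from /content/data")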
# Initialize our custom LLM (Zephyr-7B-alpha in GGUF format, served via llama.cpp)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url='https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf',
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # set the context window a little below the model's maximum to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); n_gpu_layers=-1 offloads all layers to the GPU
    # (set it to at least 1 to use the GPU at all)
    model_kwargs={"n_gpu_layers": -1},
    # format chat messages and completions into the Llama-2-style prompt
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
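# Optional sanity check (a minimal sketch, not part of the original notebook):
# call the LLM once directly to confirm the GGUF model downloaded and loads on the GPU.
# `complete()` is the standard llama_index LLM interface; the prompt below is only illustrative.
sanity_response = llm.complete("Briefly explain retrieval-augmented generation.")
print(sanity_response.text)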
from llama_index.embeddings import HuggingFaceEmbedding

# Initialize our custom embeddings.
# HuggingFaceEmbedding() with no arguments loads BAAI/bge-small-en by default;
# here we explicitly load BAAI/bge-small-en-v1.5.
# embed_model = HuggingFaceEmbedding()
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
    embed_model=embed_model,
)
"""Advanced RAG with Cross Encoder Reranker . Referred from: https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy"""
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Initialize the reranker: a cross-encoder that re-scores the retrieved chunks
# against the query and keeps only the top 3.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3
)
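# Illustration only (an assumed side example, not part of the original pipeline):
# the same model can be called directly via sentence-transformers to see how a
# cross-encoder scores (query, passage) pairs jointly; a higher score means more relevant.
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
pair_scores = cross_encoder.predict([
    ("what does a cross-encoder do?", "A cross-encoder scores a query and a passage together in a single forward pass."),
    ("what does a cross-encoder do?", "The weather today is sunny with light winds."),
])
print(pair_scores)  # the on-topic passage should receive the higher score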
# Create a baseline query engine (without the reranker)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()  # Without reranker

def predict(input, history):
    response = query_engine.query(input)
    return str(response)
# Create a query engine with the cross-encoder reranker:
# retrieve the top 10 chunks first, then rerank them down to the top 3.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])

def predict(input, history):
    response = query_engine.query(input)
    return str(response)
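# Optional inspection (a sketch that assumes your indexed documents can answer
# the illustrative question below): print the reranked source chunks and their
# cross-encoder scores to verify that only the top 3 survive reranking.
example_response = query_engine.query("What is this document about?")
for node_with_score in example_response.source_nodes:
    print(f"score={node_with_score.score:.4f}  text={node_with_score.node.get_content()[:80]!r}")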
import time
import gradio as gr
def predict(input, history):
    start_time = time.time()  # Start the timer
    response = query_engine.query(input)  # Process the query
    end_time = time.time()  # Stop the timer
    response_time = end_time - start_time  # Calculate the time taken
    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)

# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)