# -*- coding: utf-8 -*-
"""AdvancedRAG_CrossEncoder_Reranker_Zephyr7bAlpha_.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cpw-00tHts6d-z-yRAwu8SDPD6calQvB
"""

# Install dependencies (shell commands; run them in a Colab/notebook cell)
!pip install -q pypdf
!pip install -q python-dotenv
!pip install -q llama-index
!pip install -q gradio
!pip install einops
!pip install accelerate
!pip install sentence-transformers
!pip install cohere
!pip install --upgrade huggingface_hub

# Build llama-cpp-python with cuBLAS so the GGUF model can run on the GPU
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
import torch

# Load documents from the data directory
documents = SimpleDirectoryReader("/content/data").load_data()

# Initialize our custom LLM

from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url='https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf',
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # set the context window a bit below the model's maximum to leave some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers=-1 offloads all layers to the GPU (set to at least 1 to use the GPU)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Llama-2 chat prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
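
# Optional sanity check (a sketch; the prompt below is just an illustrative
# example): confirm the GGUF model loaded and can generate before building the index.
print(llm.complete("What is retrieval-augmented generation?").text)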

from llama_index.embeddings import HuggingFaceEmbedding

# loads BAAI/bge-small-en
# embed_model = HuggingFaceEmbedding()

# loads BAAI/bge-small-en-v1.5
# Initialize our custom embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
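
# Optional check (a sketch): BAAI/bge-small-en-v1.5 produces 384-dimensional
# vectors, so printing the embedding length is a quick way to confirm the model
# name resolved correctly.
sample_embedding = embed_model.get_text_embedding("hello world")
print(len(sample_embedding))  # expected: 384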

service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
    embed_model=embed_model
)

"""Advanced RAG with Cross Encoder Reranker . Referred from: https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy"""

from llama_index.indices.postprocessor import SentenceTransformerRerank
# Initialize the reranker
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3) # Retrives top 3 chunks

# Create the baseline query engine
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

query_engine = index.as_query_engine() # Without reranker

def predict(input, history):
  response = query_engine.query(input)
  return str(response)

# Create the query engine with the cross-encoder reranker
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])  # Retrieve the top 10 chunks first; the reranker then keeps the top 3

def predict(input, history):
  response = query_engine.query(input)
  return str(response)
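
# Optional comparison sketch (the query string is only an illustrative example):
# response.source_nodes on the reranked engine contains just the top_n=3 chunks
# the cross-encoder kept out of the 10 initially retrieved.
sample_response = query_engine.query("What is this document about?")
for source_node in sample_response.source_nodes:
    print(source_node.score, source_node.node.get_text()[:80])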

import time
import gradio as gr

def predict(input, history):
    start_time = time.time()  # Start the timer

    response = query_engine.query(input)  # Process the query

    end_time = time.time()  # Stop the timer
    response_time = end_time - start_time  # Calculate the time taken

    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)

# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)