# -*- coding: utf-8 -*-
import pandas as pd
import chromadb
intro_message = 'Hello! 😊 Ask me questions about the Insurance Policy. \n\nSample questions you can ask are \'What is the name of the policy?\', \'What are the different types of insurance offered?\' etc. \n\nYou can ask these questions by tapping and holding the mic.'
def get_system_msg():
    """
    Build the system message that instructs the LLM how to answer user queries
    about the insurance policy documents.
    """
    system_msg = [
f"""
You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
The document name is 'Group Life Insurance Policy' and it contains information about 3 different insurance policies: 'Member Life Insurance', 'Member Accidental Death and Dismemberment Insurance' and 'Dependent Life Insurance'.
Your task is to extract and present relevant information from the policy documents to answer the user’s query. The document excerpts are provided in the dataframe, with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
<EXAMPLE>
INPUT: "What are the premium rates for different types of insurance under this policy?"
OUTPUT:
The premium rate(s) for each Member insured for Life Insurance will be:
Premium Rates:
1. Member Life Insurance: $0.210 for each $1,000 of insurance in force.
2. Member Accidental Death and Dismemberment Insurance: $0.025 for each $1,000 of Member Life Insurance in force.
3. Dependent Life Insurance: $1.46 for each Member insured for Dependent Life Insurance.
Multiple Policy Discount: The Policyholder may be eligible for a multiple policy discount if they have at least two other eligible group insurance policies underwritten by The Principal.
Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
</EXAMPLE>
<EXAMPLE>
INPUT: "What are the Contributions from Members?"
OUTPUT:
Members are not required to contribute a part of the premium for their Member insurance under this Group Policy.
Members are required to contribute a part of the premium for their Dependent's insurance under this Group Policy.
Citations: Policy Name: Group Life Insurance Policy, Page Number: 20.
</EXAMPLE>
Guidelines:
1. Extract information that directly answers the user's query from the document excerpts.
2. Provide the final response as well-formatted, easily readable text along with the citation.
3. Compose your complete response using the relevant parts of the documents.
4. The response should answer the query directly, addressing the user and avoiding commentary about how you work.
5. If the provided excerpts do not fully answer the query, provide partial information and suggest which sections of the policy document the user should review for further details.
6. If no relevant information is found in the provided excerpts, respond with 'No relevant information found in the provided excerpts.'
"""
]
return system_msg
# def get_welcome_msg():
# """
# Generate a welcome msg.
# """
# messages = f"""
# Start the session with a short welcome message which also has policy name and a smiley.
# """
# introduction = [{"role": "user", "parts": messages}]
# return introduction
# Import the SentenceTransformer embedding function helper from chromadb
from chromadb.utils import embedding_functions
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="paraphrase-mpnet-base-v2")
# embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1")
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
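# all-MiniLM-L6-v2 is a compact sentence-embedding model producing
# 384-dimensional vectors; the commented alternatives above are other
# sentence-transformers checkpoints that trade speed for retrieval quality.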
# Use PersistentClient() so that collections are stored on disk rather than in memory
client = chromadb.PersistentClient()
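# By default, PersistentClient() stores its data under ./chroma in the
# working directory; a different location can be set via the path argument.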
"""
We will also implement a data/collection cache to improve the performance of the overall search system."""
# Set up the collection with the embedding function
def generate_embeddings(embedding_function):
    # Get the existing collection from chroma and pass the embedding_function
    # to it so that the same embedding model is used to embed incoming queries
    insurance_collection = client.get_collection(name='RAG_on_Insurance', embedding_function=embedding_function)
    return insurance_collection
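# Note: get_collection() assumes the 'RAG_on_Insurance' collection was already
# created and populated in an earlier ingestion step; it raises an error if the
# collection does not exist. client.get_or_create_collection() could be used
# instead when the ingestion step may not have run yet.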
insurance_collection = generate_embeddings(embedding_function)
"""##<font color = yellow> Search Layer
### Semantic Search with Cache
We will perform a semantic search of a query in the collections embeddings to get several top semantically similar results based on the *distance* parameter.
"""
# test query
# query = "What are the premium rates for different types of insurance under this policy?"
# query = "what are the benefits payable for different types of insurance under this policy?"
# query = "What are the Contributions from Members??"
"""#### Document retreival"""
# Retrieve semantically similar documents for a query
def retrieve_results(query):
    # Query the collection against the user query and return the top 10 results
    results = insurance_collection.query(
        query_texts=query,
        n_results=10
    )

    # ChromaDB returns one inner list per query text, so index [0] pulls the
    # results for our single query; collect them into a dataframe for re-ranking
    result_dict = {
        'Metadatas': results['metadatas'][0],
        'Documents': results['documents'][0],
        'Distances': results['distances'][0],
        'IDs': results['ids'][0]
    }
    results_df = pd.DataFrame.from_dict(result_dict)

    return results_df
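# Note: ChromaDB distances use the collection's configured metric
# (squared L2 by default), so smaller values mean closer matches.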
# results_df = retrieve_results(query)
# results_df.head(5)
"""#### Re-Ranking with a Cross Encoder
We will perform Re-ranking of the search results using cross-encoder to move more relevant chunks at the top.
"""
# Import the CrossEncoder class from sentence_transformers
from sentence_transformers import CrossEncoder
# Initialise the cross encoder model
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
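# A cross-encoder scores each (query, passage) pair jointly in a single
# forward pass, which is slower than bi-encoder retrieval but considerably
# more accurate for ordering a small candidate set such as our top 10.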
# Function to re-rank results using the cross-encoder
def rerank_with_cross_encoder(query, results_df, top_k=3):
    # Pair the query with each of the top 10 responses from the semantic search
    # and generate cross-encoder relevance scores for these pairs
    cross_inputs = [[query, response] for response in results_df['Documents']]
    cross_rerank_scores = cross_encoder.predict(cross_inputs)
    # print(cross_rerank_scores)

    # Store the re-rank scores in results_df
    results_df['Reranked_scores'] = cross_rerank_scores
    # print(results_df)

    # For comparison: the ordering from semantic search alone
    top_semantic = results_df.sort_values(by='Distances')
    # print(top_semantic[:top_k])

    # The top_k results after re-ranking
    top_ranks_df = results_df.sort_values(by='Reranked_scores', ascending=False)
    # print(top_ranks_df[:top_k])

    top_docs = top_ranks_df[["Documents", "Metadatas"]][:top_k]
    # print(top_docs)
    return top_docs  # , top_ranks_df
# top_docs = rerank_with_cross_encoder(query, results_df)
# top_docs
def generate_response(query, top_docs):
    """
    Build the conversation payload that asks the LLM to answer the user query
    from the retrieved document excerpts.
    """
    messages = f"""
    Remember your system message and that you are a helpful assistant that extracts relevant information from insurance policy documents to answer user queries accurately and concisely.
    Your task is to extract and present relevant information from the policy documents to answer the user's query.
    The document excerpts are provided in the dataframe '{top_docs}', with the actual policy text in the 'documents' column and metadata (page numbers) in the 'metadata' column.
    The user input is: '{query}'
    """

    # Earlier OpenAI-based version, kept for reference:
    # response = openai.chat.completions.create(
    #     model="gpt-3.5-turbo",
    #     messages=messages
    # )

    # Gemini-style message format ('parts' key)
    conversation = [{"role": "user", "parts": messages}]
    return conversation  # response.choices[0].message.content.split('\n')
# conversation = generate_response(query, top_docs)
# print(query + '\n')
# print(conversation[0]["parts"])