karthikvarunn committed on
Commit
20b492a
·
verified ·
1 Parent(s): f854559

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -0
app.py CHANGED
@@ -16,6 +16,8 @@ from langchain_core.runnables import chain
16
  import gradio as gr
17
  from pinecone import Pinecone, ServerlessSpec
18
  import openai
 
 
19
 
20
  load_dotenv()
21
 
@@ -48,6 +50,79 @@ embeddings = VoyageAIEmbeddings(
48
  voyage_api_key=voyage_api_key, model="voyage-law-2"
49
  )
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def search_documents(query):
52
  try:
53
  vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
@@ -127,6 +202,53 @@ def complete_workflow(query):
127
  return {"results": []}, f"Error in workflow: {str(e)}"
128
 
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def gradio_app():
131
  with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
132
  gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")
 
16
  import gradio as gr
17
  from pinecone import Pinecone, ServerlessSpec
18
  import openai
19
+ from langchain.retrievers import BM25Retriever
20
+ import numpy as np
21
 
22
  load_dotenv()
23
 
 
50
  voyage_api_key=voyage_api_key, model="voyage-law-2"
51
  )
52
 
53
def _order_by_llm_ranking(results, response_text):
    """Sort *results* by where each chunk ID first appears in the LLM reply.

    The model is asked for a ranked list of IDs but is free to decorate each
    line (numbering, bullets, an "ID:" prefix), so comparing whole lines to
    IDs rarely matches. Searching for each ID as a delimited token tolerates
    that decoration. Results whose ID is ``None`` or is not mentioned in the
    reply keep their relative order and are placed last.
    """
    import re  # local import so this block does not depend on the file header

    not_found = len(response_text)

    def first_mention(doc):
        chunk_id = doc["chunk_id"]
        if chunk_id is None:
            return not_found
        # Boundary guards stop ID "1" from matching inside "10" or "a-1".
        pattern = r"(?<![\w-])" + re.escape(str(chunk_id)) + r"(?![\w-])"
        found = re.search(pattern, response_text)
        return found.start() if found else not_found

    # sorted() is stable, so ties (unmentioned IDs) keep retrieval order.
    return sorted(results, key=first_mention)


def hybrid_search_documents(query):
    """Retrieve chunks for *query* via vector + BM25 search, then LLM re-rank.

    Steps:
      1. Dense search: top 15 hits from the Pinecone vector store.
      2. Keyword search: up to 10 BM25 hits over ``uploaded_docs``.
      3. Merge both lists, dropping duplicates.
      4. Ask GPT-4 to rank the merged chunks and reorder them accordingly.
      5. Min-max normalize the scores.

    Args:
        query: The user's search string.

    Returns:
        A ``(results, combined_context)`` tuple. ``results`` is a list of
        dicts with keys ``doc_id``, ``chunk_id``, ``title``,
        ``relevant_text``, ``page_number``, ``score`` and ``method``
        (``"vector"`` or ``"bm25"``). ``combined_context`` is the chunk texts
        joined by blank lines. When nothing is retrieved the result is
        ``([], "")``. On any exception the result is
        ``([], "Error in hybrid search: <message>")``.
    """
    try:
        vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
        vector_results = vector_store.similarity_search_with_score(query, k=15)

        # NOTE(review): `uploaded_docs` is not defined in the visible part of
        # the file; it is assumed to be a module-level list of Documents.
        keyword_results = []
        if uploaded_docs:
            bm25_retriever = BM25Retriever.from_documents(uploaded_docs)
            # The retriever returns only `k` documents (library default is
            # smaller than 10), so raise it or the slice below never sees 10.
            bm25_retriever.k = 10
            keyword_results = bm25_retriever.invoke(query)[:10]

        # Combine results while avoiding duplicates.
        seen_keys = set()
        hybrid_results = []

        def process_result(result, score, method):
            unique_id = result.metadata.get("id")
            if unique_id is None:
                # Some stores expose the ID on the Document rather than in
                # its metadata; use it when present.
                unique_id = getattr(result, "id", None)
            # Without any ID, de-duplicate on the text itself so that ID-less
            # documents are not all collapsed into a single `None` entry.
            dedup_key = ("id", unique_id) if unique_id is not None else ("text", result.page_content)
            if dedup_key in seen_keys:
                return
            seen_keys.add(dedup_key)
            hybrid_results.append({
                "doc_id": result.metadata.get("doc_id", "N/A"),
                "chunk_id": unique_id,
                "title": result.metadata.get("source", "N/A"),
                "relevant_text": result.page_content,
                "page_number": result.metadata.get("page", "N/A"),
                "score": score,
                "method": method,  # "vector" or "bm25"
            })

        # Dense results first so they win when both methods find a chunk.
        for res, score in vector_results:
            process_result(res, score, "vector")

        # The retriever does not return BM25 scores, so a fixed placeholder
        # score is assigned to keyword hits.
        for res in keyword_results:
            process_result(res, score=0.85, method="bm25")

        if not hybrid_results:
            # Nothing to rank or normalize; skip the LLM call entirely.
            return [], ""

        # Step 3: re-rank with the LLM (GPT-4).
        llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.3)

        ranking_prompt = """
        You are a document retrieval assistant. Given the following query and retrieved documents,
        rank them based on their relevance to the query.

        Query: {query}

        Documents:
        {documents}

        Return a ranked list of document IDs in order of relevance.
        """

        doc_texts = "\n".join(
            f"ID: {doc['chunk_id']}, Text: {doc['relevant_text']}" for doc in hybrid_results
        )
        prompt = ranking_prompt.format(query=query, documents=doc_texts)
        response = llm.invoke([HumanMessage(content=prompt)]).content.strip()

        hybrid_results = _order_by_llm_ranking(hybrid_results, response)

        # Min-max normalize scores into [0, 1); the epsilon avoids division
        # by zero when every score is identical.
        scores = [doc["score"] for doc in hybrid_results]
        min_score, max_score = min(scores), max(scores)
        for doc in hybrid_results:
            doc["score"] = (doc["score"] - min_score) / (max_score - min_score + 1e-6)

        # Combine context for answer generation.
        combined_context = "\n\n".join(res["relevant_text"] for res in hybrid_results)

        return hybrid_results, combined_context

    except Exception as e:
        return [], f"Error in hybrid search: {str(e)}"
125
+
126
  def search_documents(query):
127
  try:
128
  vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
 
202
  return {"results": []}, f"Error in workflow: {str(e)}"
203
 
204
 
205
+ import os
206
+ from langchain_openai import ChatOpenAI
207
+ from langchain.schema import HumanMessage
208
+
209
def complete_workflow(query):
    """Run hybrid retrieval for *query* and generate a GPT-4 answer from it.

    Args:
        query: The user's question.

    Returns:
        A ``(results, message)`` tuple. ``results`` is a dict with a
        ``"results"`` list (one entry per retrieved chunk, each carrying the
        same ``natural_language_output`` answer plus the chunk's fields) and
        ``"total_results"``. On success ``message`` is the unique source file
        names, one per line, in ranking order. If retrieval yields nothing,
        ``results`` is empty and ``message`` is the retrieval error text (or
        a "no documents" notice). On any exception ``message`` is
        ``"Error in workflow: <message>"``.
    """
    try:
        # Step 1: hybrid search (vector + BM25).
        context_data, combined_context = hybrid_search_documents(query)

        if not context_data:
            # On failure the retrieval step returns its error text in place
            # of the context. Surface it rather than sending it to the LLM
            # as if it were document content.
            message = combined_context or "No relevant documents found."
            return {"results": [], "total_results": 0}, message

        # Step 2: generate the natural-language answer.
        llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.7)
        prompt_template = """
        Use the following context to answer the question as accurately as possible:

        Context: {context}
        Question: {question}

        Answer:
        """
        prompt = prompt_template.format(context=combined_context, question=query)
        response = llm.invoke([HumanMessage(content=prompt)])
        answer = response.content

        # Step 3: format results.
        def display_name(title):
            # "N/A" is the placeholder for a missing source; basename() would
            # mangle it into "A", so pass it through untouched.
            if title == "N/A":
                return title
            return os.path.basename(str(title))

        # dict.fromkeys de-duplicates while keeping ranking order (a set
        # would make the order of titles nondeterministic).
        document_titles = list(dict.fromkeys(display_name(doc["title"]) for doc in context_data))
        formatted_titles = "\n".join(document_titles)

        results = {
            "results": [
                {
                    "natural_language_output": answer,
                    "chunk_id": doc["chunk_id"],
                    "document_id": doc["doc_id"],
                    "title": doc["title"],
                    "relevant_text": doc["relevant_text"],
                    "page_number": doc["page_number"],
                    "score": doc["score"],
                    "method": doc["method"],  # "vector" or "bm25"
                }
                for doc in context_data
            ],
            "total_results": len(context_data),
        }

        return results, formatted_titles
    except Exception as e:
        return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
251
+
252
  def gradio_app():
253
  with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
254
  gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")