karthikvarunn committed on
Commit
ced5c1e
·
verified ·
1 Parent(s): 28ca8af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -8
app.py CHANGED
@@ -13,11 +13,11 @@ from typing import List, Tuple
13
  from langchain.schema import BaseRetriever
14
  from langchain_core.documents import Document
15
  from langchain_core.runnables import chain
16
- import gradio as gr
17
  from pinecone import Pinecone, ServerlessSpec
18
  import openai
19
- from langchain.retrievers import BM25Retriever
20
  import numpy as np
 
 
21
 
22
  load_dotenv()
23
 
@@ -26,7 +26,7 @@ openai.api_key = os.environ.get("OPENAI_API_KEY")
26
  pinecone_api_key = os.environ.get("PINECONE_API_KEY")
27
  pinecone_environment = os.environ.get("PINECONE_ENV")
28
  voyage_api_key = os.environ.get("VOYAGE_API_KEY")
29
- pinecone_index_name = "rag-proto012"
30
 
31
  # Initialize Pinecone
32
  pc = Pinecone(api_key=pinecone_api_key)
@@ -54,7 +54,7 @@ def search_documents(query):
54
  try:
55
  vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
56
 
57
- results = vector_store.max_marginal_relevance_search(query, k=7, fetch_k=20) # Adjust fetch_k for more diverse results
58
 
59
  # Filter results to ensure uniqueness based on metadata.id
60
  seen_ids = set()
@@ -72,17 +72,30 @@ def search_documents(query):
72
  "doc_id": result.metadata.get("doc_id", "N/A"),
73
  "chunk_id": result.metadata.get("id", "N/A"),
74
  "title": result.metadata.get("source", "N/A"),
75
- "relevant_text": result.page_content,
76
  "page_number": result.metadata.get("page", "N/A"),
77
  "score": result.metadata.get("score", 0.0), # Score might not be available in all libraries
78
  })
79
 
80
  # Combine the relevant text for additional processing
81
- combined_context = "\n\n".join([res["relevant_text"] for res in context])
82
  return context, combined_context
83
  except Exception as e:
84
  return [], f"Error searching documents: {str(e)}"
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  def generate_output(context, query):
88
  try:
@@ -104,6 +117,31 @@ def generate_output(context, query):
104
  def complete_workflow(query):
105
  try:
106
  context_data, combined_context = search_documents(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  document_titles = list({os.path.basename(doc["title"]) for doc in context_data}) # Get only file names
109
  formatted_titles = " " + "\n".join(document_titles)
@@ -113,11 +151,11 @@ def complete_workflow(query):
113
  results = {
114
  "results": [
115
  {
116
- "natural_language_output": generate_output(doc["relevant_text"], query),
117
  "chunk_id": doc["chunk_id"],
118
  "document_id": doc["doc_id"], # Assuming doc_id is the UUID
119
  "title": doc["title"],
120
- "relevant_text": doc["relevant_text"],
121
  "page_number": doc["page_number"],
122
  "score": doc["score"],
123
  }
@@ -130,6 +168,7 @@ def complete_workflow(query):
130
  except Exception as e:
131
  return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
132
 
 
133
  def gradio_app():
134
  with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
135
  gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")
 
13
  from langchain.schema import BaseRetriever
14
  from langchain_core.documents import Document
15
  from langchain_core.runnables import chain
 
16
  from pinecone import Pinecone, ServerlessSpec
17
  import openai
 
18
  import numpy as np
19
+ from pinecone.grpc import PineconeGRPC as Pinecone
20
+
21
 
22
  load_dotenv()
23
 
 
26
  pinecone_api_key = os.environ.get("PINECONE_API_KEY")
27
  pinecone_environment = os.environ.get("PINECONE_ENV")
28
  voyage_api_key = os.environ.get("VOYAGE_API_KEY")
29
+ pinecone_index_name = "rag-proto011"
30
 
31
  # Initialize Pinecone
32
  pc = Pinecone(api_key=pinecone_api_key)
 
54
  try:
55
  vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
56
 
57
+ results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=30) # Adjust fetch_k for more diverse results
58
 
59
  # Filter results to ensure uniqueness based on metadata.id
60
  seen_ids = set()
 
72
  "doc_id": result.metadata.get("doc_id", "N/A"),
73
  "chunk_id": result.metadata.get("id", "N/A"),
74
  "title": result.metadata.get("source", "N/A"),
75
+ "text": result.page_content,
76
  "page_number": result.metadata.get("page", "N/A"),
77
  "score": result.metadata.get("score", 0.0), # Score might not be available in all libraries
78
  })
79
 
80
  # Combine the relevant text for additional processing
81
+ combined_context = "\n\n".join([res["text"] for res in context])
82
  return context, combined_context
83
  except Exception as e:
84
  return [], f"Error searching documents: {str(e)}"
85
 
86
# Reranker: re-order retrieved chunks by relevance to the query.
def rerank(query, context):
    """Rerank *context* documents against *query* using Pinecone's hosted
    bge-reranker-v2-m3 model and return the top 5 matches.

    Args:
        query: The user's search query string.
        context: Candidate documents (dicts with a "text" field) to rescore.

    Returns:
        The raw Pinecone rerank response; its `.data` holds the ranked
        entries with scores and the original documents attached.
    """
    response = pc.inference.rerank(
        model="bge-reranker-v2-m3",
        query=query,
        documents=context,
        top_n=5,
        return_documents=True,
        # Truncate over-long documents at the end rather than erroring out.
        parameters={"truncate": "END"},
    )
    return response
99
 
100
  def generate_output(context, query):
101
  try:
 
117
  def complete_workflow(query):
118
  try:
119
  context_data, combined_context = search_documents(query)
120
+
121
+ # print("Context Data")
122
+
123
+ # [print(doc) for doc in context_data]
124
+
125
+ reranked = rerank(query, context_data)
126
+
127
+ context_data= []
128
+
129
+ # print("\n\n reranked data")
130
+ # print(reranked.data)
131
+
132
+ for i, entry in enumerate(reranked.data): # Access the 'data' attribute
133
+ context_data.append({
134
+ 'chunk_id': entry['document']['chunk_id'],
135
+ 'doc_id': entry['document']['doc_id'],
136
+ 'title': entry['document']['title'],
137
+ 'text': entry['document']['text'],
138
+ 'page_number': entry['document']['page_number'],
139
+ 'score': entry['score']
140
+ })
141
+
142
+ # print("\n\n New Context Data")
143
+ # [print(doc) for doc in context_data]
144
+
145
 
146
  document_titles = list({os.path.basename(doc["title"]) for doc in context_data}) # Get only file names
147
  formatted_titles = " " + "\n".join(document_titles)
 
151
  results = {
152
  "results": [
153
  {
154
+ "natural_language_output": generate_output(doc["text"], query),
155
  "chunk_id": doc["chunk_id"],
156
  "document_id": doc["doc_id"], # Assuming doc_id is the UUID
157
  "title": doc["title"],
158
+ "text": doc["text"],
159
  "page_number": doc["page_number"],
160
  "score": doc["score"],
161
  }
 
168
  except Exception as e:
169
  return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
170
 
171
+
172
  def gradio_app():
173
  with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
174
  gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")