Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -13,11 +13,11 @@ from typing import List, Tuple
|
|
13 |
from langchain.schema import BaseRetriever
|
14 |
from langchain_core.documents import Document
|
15 |
from langchain_core.runnables import chain
|
16 |
-
import gradio as gr
|
17 |
from pinecone import Pinecone, ServerlessSpec
|
18 |
import openai
|
19 |
-
from langchain.retrievers import BM25Retriever
|
20 |
import numpy as np
|
|
|
|
|
21 |
|
22 |
load_dotenv()
|
23 |
|
@@ -26,7 +26,7 @@ openai.api_key = os.environ.get("OPENAI_API_KEY")
|
|
26 |
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
|
27 |
pinecone_environment = os.environ.get("PINECONE_ENV")
|
28 |
voyage_api_key = os.environ.get("VOYAGE_API_KEY")
|
29 |
-
pinecone_index_name = "rag-
|
30 |
|
31 |
# Initialize Pinecone
|
32 |
pc = Pinecone(api_key=pinecone_api_key)
|
@@ -54,7 +54,7 @@ def search_documents(query):
|
|
54 |
try:
|
55 |
vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
|
56 |
|
57 |
-
results = vector_store.max_marginal_relevance_search(query, k=
|
58 |
|
59 |
# Filter results to ensure uniqueness based on metadata.id
|
60 |
seen_ids = set()
|
@@ -72,17 +72,30 @@ def search_documents(query):
|
|
72 |
"doc_id": result.metadata.get("doc_id", "N/A"),
|
73 |
"chunk_id": result.metadata.get("id", "N/A"),
|
74 |
"title": result.metadata.get("source", "N/A"),
|
75 |
-
"
|
76 |
"page_number": result.metadata.get("page", "N/A"),
|
77 |
"score": result.metadata.get("score", 0.0), # Score might not be available in all libraries
|
78 |
})
|
79 |
|
80 |
# Combine the relevant text for additional processing
|
81 |
-
combined_context = "\n\n".join([res["
|
82 |
return context, combined_context
|
83 |
except Exception as e:
|
84 |
return [], f"Error searching documents: {str(e)}"
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
def generate_output(context, query):
|
88 |
try:
|
@@ -104,6 +117,31 @@ def generate_output(context, query):
|
|
104 |
def complete_workflow(query):
|
105 |
try:
|
106 |
context_data, combined_context = search_documents(query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
document_titles = list({os.path.basename(doc["title"]) for doc in context_data}) # Get only file names
|
109 |
formatted_titles = " " + "\n".join(document_titles)
|
@@ -113,11 +151,11 @@ def complete_workflow(query):
|
|
113 |
results = {
|
114 |
"results": [
|
115 |
{
|
116 |
-
"natural_language_output": generate_output(doc["
|
117 |
"chunk_id": doc["chunk_id"],
|
118 |
"document_id": doc["doc_id"], # Assuming doc_id is the UUID
|
119 |
"title": doc["title"],
|
120 |
-
"
|
121 |
"page_number": doc["page_number"],
|
122 |
"score": doc["score"],
|
123 |
}
|
@@ -130,6 +168,7 @@ def complete_workflow(query):
|
|
130 |
except Exception as e:
|
131 |
return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
|
132 |
|
|
|
133 |
def gradio_app():
|
134 |
with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
|
135 |
gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")
|
|
|
13 |
from langchain.schema import BaseRetriever
|
14 |
from langchain_core.documents import Document
|
15 |
from langchain_core.runnables import chain
|
|
|
16 |
from pinecone import Pinecone, ServerlessSpec
|
17 |
import openai
|
|
|
18 |
import numpy as np
|
19 |
+
from pinecone.grpc import PineconeGRPC as Pinecone
|
20 |
+
|
21 |
|
22 |
load_dotenv()
|
23 |
|
|
|
26 |
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
|
27 |
pinecone_environment = os.environ.get("PINECONE_ENV")
|
28 |
voyage_api_key = os.environ.get("VOYAGE_API_KEY")
|
29 |
+
pinecone_index_name = "rag-proto011"
|
30 |
|
31 |
# Initialize Pinecone
|
32 |
pc = Pinecone(api_key=pinecone_api_key)
|
|
|
54 |
try:
|
55 |
vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
|
56 |
|
57 |
+
results = vector_store.max_marginal_relevance_search(query, k=10, fetch_k=30) # Adjust fetch_k for more diverse results
|
58 |
|
59 |
# Filter results to ensure uniqueness based on metadata.id
|
60 |
seen_ids = set()
|
|
|
72 |
"doc_id": result.metadata.get("doc_id", "N/A"),
|
73 |
"chunk_id": result.metadata.get("id", "N/A"),
|
74 |
"title": result.metadata.get("source", "N/A"),
|
75 |
+
"text": result.page_content,
|
76 |
"page_number": result.metadata.get("page", "N/A"),
|
77 |
"score": result.metadata.get("score", 0.0), # Score might not be available in all libraries
|
78 |
})
|
79 |
|
80 |
# Combine the relevant text for additional processing
|
81 |
+
combined_context = "\n\n".join([res["text"] for res in context])
|
82 |
return context, combined_context
|
83 |
except Exception as e:
|
84 |
return [], f"Error searching documents: {str(e)}"
|
85 |
|
86 |
+
# Reranker
|
87 |
+
def rerank(query, context, top_n=5, model="bge-reranker-v2-m3", truncate="END"):
    """Rerank candidate documents against ``query`` using Pinecone inference.

    Args:
        query: The search string the documents are scored against.
        context: Candidate documents, forwarded unchanged as ``documents``.
            The caller in this file passes the list of chunk dicts produced
            by ``search_documents`` (each carrying a ``"text"`` key).
        top_n: How many top-ranked documents to request. Defaults to 5,
            the value that was previously hard-coded.
        model: Name of the reranking model. Defaults to the previously
            hard-coded ``"bge-reranker-v2-m3"``.
        truncate: Value sent as the ``truncate`` request parameter.
            Defaults to the previously hard-coded ``"END"``.

    Returns:
        Whatever ``pc.inference.rerank`` returns, unmodified. The caller in
        this file iterates its ``.data`` attribute and reads ``document``
        and ``score`` from each entry.

    NOTE(review): no exception handling happens here; any error raised by
    the Pinecone client propagates to the caller. Behaviour for an empty
    ``context`` is decided by the Pinecone API -- confirm before relying
    on it.
    """
    return pc.inference.rerank(
        model=model,
        query=query,
        documents=context,
        top_n=top_n,
        # Ask the service to echo each document back so the caller can
        # rebuild its result records from the reranked entries.
        return_documents=True,
        parameters={"truncate": truncate},
    )
|
99 |
|
100 |
def generate_output(context, query):
|
101 |
try:
|
|
|
117 |
def complete_workflow(query):
|
118 |
try:
|
119 |
context_data, combined_context = search_documents(query)
|
120 |
+
|
121 |
+
# print("Context Data")
|
122 |
+
|
123 |
+
# [print(doc) for doc in context_data]
|
124 |
+
|
125 |
+
reranked = rerank(query, context_data)
|
126 |
+
|
127 |
+
context_data= []
|
128 |
+
|
129 |
+
# print("\n\n reranked data")
|
130 |
+
# print(reranked.data)
|
131 |
+
|
132 |
+
for i, entry in enumerate(reranked.data): # Access the 'data' attribute
|
133 |
+
context_data.append({
|
134 |
+
'chunk_id': entry['document']['chunk_id'],
|
135 |
+
'doc_id': entry['document']['doc_id'],
|
136 |
+
'title': entry['document']['title'],
|
137 |
+
'text': entry['document']['text'],
|
138 |
+
'page_number': entry['document']['page_number'],
|
139 |
+
'score': entry['score']
|
140 |
+
})
|
141 |
+
|
142 |
+
# print("\n\n New Context Data")
|
143 |
+
# [print(doc) for doc in context_data]
|
144 |
+
|
145 |
|
146 |
document_titles = list({os.path.basename(doc["title"]) for doc in context_data}) # Get only file names
|
147 |
formatted_titles = " " + "\n".join(document_titles)
|
|
|
151 |
results = {
|
152 |
"results": [
|
153 |
{
|
154 |
+
"natural_language_output": generate_output(doc["text"], query),
|
155 |
"chunk_id": doc["chunk_id"],
|
156 |
"document_id": doc["doc_id"], # Assuming doc_id is the UUID
|
157 |
"title": doc["title"],
|
158 |
+
"text": doc["text"],
|
159 |
"page_number": doc["page_number"],
|
160 |
"score": doc["score"],
|
161 |
}
|
|
|
168 |
except Exception as e:
|
169 |
return {"results": [], "total_results": 0}, f"Error in workflow: {str(e)}"
|
170 |
|
171 |
+
|
172 |
def gradio_app():
|
173 |
with gr.Blocks(css=".result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
|
174 |
gr.Markdown("### Intelligent Document Search Prototype-v0.1.2 ")
|