karthikvarunn committed (verified)
Commit 2247b0c · 1 Parent(s): 35db16f

Create app.py

Files changed (1): app.py (+168, -0)
app.py ADDED
@@ -0,0 +1,168 @@
+ import os
+ from dotenv import load_dotenv
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import HumanMessage
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_voyageai import VoyageAIEmbeddings
+ from langchain_pinecone import PineconeVectorStore
+ from langchain_openai import ChatOpenAI
+ from langchain.prompts import PromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from typing import List, Tuple
+ from langchain.schema import BaseRetriever
+ from langchain_core.documents import Document
+ from langchain_core.runnables import chain
+ import gradio as gr
+ from pinecone import Pinecone, ServerlessSpec
+ import openai
+
+ # Load environment variables
+ load_dotenv()
+
+ # Initialize OpenAI, Pinecone, and Voyage credentials
+ openai.api_key = os.environ.get("OPENAI_API_KEY")
+ pinecone_api_key = os.environ.get("PINECONE_API_KEY")
+ pinecone_environment = os.environ.get("PINECONE_ENV")
+ voyage_api_key = os.environ.get("VOYAGE_API_KEY")
+ pinecone_index_name = "rag-proto011"
+
+ # Initialize Pinecone and provision the index if it does not exist yet
+ pc = Pinecone(api_key=pinecone_api_key)
+ if pinecone_index_name not in pc.list_indexes().names():
+     pc.create_index(
+         name=pinecone_index_name,
+         dimension=1024,  # 1024 for voyage-law-2 (1536 would be needed for OpenAI ada-002 embeddings)
+         metric='cosine',
+         spec=ServerlessSpec(
+             cloud='aws',
+             region=pinecone_environment
+         )
+     )
+     print("Pinecone Index provisioned")
+ else:
+     print("Pinecone Index already provisioned")
+
+ # Initialize embeddings
+ # embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
+ embeddings = VoyageAIEmbeddings(
+     voyage_api_key=voyage_api_key, model="voyage-law-2"
+ )
+
+ def search_documents(query):
+     try:
+         # Initialize the vector store backed by the Pinecone index
+         vector_store = PineconeVectorStore(index_name=pinecone_index_name, embedding=embeddings)
+
+         # Use max-marginal-relevance search to improve diversity in results
+         results = vector_store.max_marginal_relevance_search(query, k=7, fetch_k=20)  # Raise fetch_k for more diverse candidates
+
+         # Filter results to ensure uniqueness based on the metadata "id" field
+         seen_ids = set()
+         unique_results = []
+         for result in results:
+             unique_id = result.metadata.get("id")
+             if unique_id and unique_id not in seen_ids:
+                 seen_ids.add(unique_id)
+                 unique_results.append(result)
+
+         # Collect relevant context from unique results
+         context = []
+         for result in unique_results:
+             context.append({
+                 "doc_id": result.metadata.get("doc_id", "N/A"),
+                 "chunk_id": result.metadata.get("id", "N/A"),
+                 "title": result.metadata.get("source", "N/A"),
+                 "relevant_text": result.page_content,
+                 "page_number": result.metadata.get("page", "N/A"),
+                 "score": result.metadata.get("score", 0.0),  # Score may not be populated by MMR search
+             })
+
+         # Combine the relevant text for additional processing, if needed
+         combined_context = "\n\n".join([res["relevant_text"] for res in context])
+         return context, combined_context
+     except Exception as e:
+         return [], f"Error searching documents: {str(e)}"
+
+
+ def generate_output(context, query):
+     try:
+         llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key, temperature=0.7)
+         prompt_template = PromptTemplate(
+             template="""
+             Use the following context to answer the question as accurately as possible:
+             Context: {context}
+             Question: {question}
+             Answer:""",
+             input_variables=["context", "question"]
+         )
+         prompt = prompt_template.format(context=context, question=query)
+         response = llm.invoke([HumanMessage(content=prompt)])
+         return response.content
+     except Exception as e:
+         return f"Error generating output: {str(e)}"
+
+ def complete_workflow(query):
+     try:
+         context_data, combined_context = search_documents(query)
+         # natural_language_output = generate_output(combined_context, query)
+
+         document_titles = list({os.path.basename(doc["title"]) for doc in context_data})  # Keep only the file names
+
+         formatted_titles = " " + "\n".join(document_titles)
+
+         results = {
+             "results": [
+                 {
+                     "natural_language_output": generate_output(doc["relevant_text"], query),
+                     "doc_id": doc["doc_id"],
+                     "chunk_id": doc["chunk_id"],
+                     "title": doc["title"],
+                     "relevant_text": doc["relevant_text"],
+                     "page_number": doc["page_number"],
+                     "score": doc["score"],
+                 }
+                 for doc in context_data
+             ]
+         }
+
+         return results, formatted_titles  # Return results and formatted document titles
+     except Exception as e:
+         return {"results": []}, f"Error in workflow: {str(e)}"
+
+
+ def delete_index():
+     try:
+         if pinecone_index_name in pc.list_indexes().names():
+             pc.delete_index(name=pinecone_index_name)
+             return "Pinecone Index Deleted"
+         else:
+             return "Pinecone Index Had Already Been Deleted"
+     except Exception as e:
+         return f"Error deleting Pinecone index: {str(e)}"
+
+
+ def gradio_app():
+     # The CSS selector targets the id set via elem_id below, hence "#result-output"
+     with gr.Blocks(css="#result-output {width: 150%; font-size: 16px; padding: 10px;}") as app:
+         gr.Markdown("### Intelligent Document Search Prototype-v0.1.2")
+
+         with gr.Row():
+             user_query = gr.Textbox(label="Enter Your Search Query")
+             search_btn = gr.Button("Search")
+
+         with gr.Row():
+             result_output = gr.JSON(label="Search Results", elem_id="result-output")
+         with gr.Row():
+             titles_output = gr.Textbox(label="Document Titles", interactive=False)  # Textbox for document titles
+
+         search_btn.click(
+             complete_workflow,
+             inputs=user_query,
+             outputs=[result_output, titles_output],
+         )
+
+     return app
+
+ # Launch the app
+ gradio_app().launch()
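
A note on configuration: load_dotenv() at the top of the file means the script expects its credentials as environment variables, typically from a local .env file. A minimal sketch of such a file, with placeholder values (the variable names are the ones read in the code above; PINECONE_ENV doubles as the AWS region for the serverless index spec, so it should be a region Pinecone serverless supports, e.g. us-east-1):

OPENAI_API_KEY=sk-...
PINECONE_API_KEY=...
PINECONE_ENV=us-east-1
VOYAGE_API_KEY=...

On a Hugging Face Space these would normally be set as Space secrets rather than committed in a .env file. With the variables in place, gradio_app().launch() starts the Gradio server (port 7860 by default when run locally).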