Divyansh12 committed
Commit 4625115 · verified · 1 Parent(s): 5cd27c3

Upload folder using huggingface_hub

Files changed (5):
  1. .env +3 -0
  2. Data/SDG.pdf +0 -0
  3. README.md +2 -8
  4. app.py +165 -0
  5. requirements.txt +10 -0
.env ADDED
@@ -0,0 +1,3 @@
+ GROQ_API_KEY="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"
+ HUGGINGFACE_API_KEY=""
+ PINECONE_API_KEY="pcsk_4x6rrL_JWddywVmVcd16ijWofHRBRkV3epTLGyVcqQHZBzo5263AxXP7d46ruR1TYPwc5x"
Data/SDG.pdf ADDED
Binary file (272 kB).
 
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
- title: PDF Insights QA
- emoji: 👁
- colorFrom: blue
- colorTo: gray
+ title: PDF_Insights_QA
+ app_file: app.py
  sdk: gradio
  sdk_version: 5.10.0
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import asyncio
+ import nest_asyncio
+ import pinecone
+ import time
+ from dotenv import find_dotenv, load_dotenv
+ from langchain_groq import ChatGroq
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.prompts import PromptTemplate
+ from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
+ from langchain_pinecone import PineconeVectorStore
+ from pinecone import Pinecone, ServerlessSpec
+ import gradio as gr
+ from langchain import hub
+
+ # Allow nested async calls
+ nest_asyncio.apply()
+
+ # Load environment variables
+ _ = load_dotenv(find_dotenv())
+ os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
+ os.environ["HUGGINGFACE_API_KEY"] = os.getenv("HUGGINGFACE_API_KEY")
+ os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")
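+ # Note: os.getenv returns None for a missing key, and assigning None to
+ # os.environ raises a TypeError, so all three keys must be present in .env.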
+
+ # Initialize Pinecone and create the index if it does not exist yet
+ pc = Pinecone()
+ index_name = "intern"
+
+ existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
+
+ if index_name not in existing_indexes:
+     print(f"Creating new index: {index_name}")
+     pc.create_index(
+         name=index_name,
+         dimension=384,  # must match the embedding model's output size
+         metric="cosine",
+         spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+     )
+     # Wait until the serverless index reports ready before using it
+     while not pc.describe_index(index_name).status["ready"]:
+         time.sleep(1)
+
+ index = pc.Index(index_name)
+
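+ # all-MiniLM-L6-v2 produces 384-dimensional vectors, which is why the index
+ # above is created with dimension=384; the index and embedder must agree.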
+ # Embedding model
+ print("Initializing embedding model...")
+ embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+ # Load every PDF in the Data directory and split it into chunks
+ print("Loading documents...")
+ loader = PyPDFDirectoryLoader("Data")
+ documents = loader.load()
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+ docs = text_splitter.split_documents(documents)
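+ # 500-character chunks with a 50-character overlap keep each retrieved
+ # passage small; the retriever below returns the top 5 of these chunks.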
+
+ def are_documents_indexed(index):
+     try:
+         # Create a simple test embedding
+         test_embedding = embedding_model.embed_query("test")
+         # Query the index; any match means it already holds vectors
+         results = index.query(vector=test_embedding, top_k=1)
+         return len(results.matches) > 0
+     except Exception as e:
+         print(f"Error checking indexed documents: {e}")
+         return False
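+ # Caveat: this only checks that the index is non-empty, not that the
+ # current PDFs in Data/ are the documents that were indexed.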
+
+ # Initialize vector store
+ print("Initializing vector store...")
+ vector_store = PineconeVectorStore(index=index, embedding=embedding_model)
+
+ # Add documents only if not already indexed
+ print("Checking if documents are already indexed...")
+ if not are_documents_indexed(index):
+     print("Adding documents to index...")
+     vector_store.add_documents(docs)
+     print("Documents added successfully!")
+ else:
+     print("Documents are already indexed.")
+
+ print("Setting up retriever and LLM...")
+ retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
+ llm = ChatGroq(model="llama3-8b-8192", temperature=0.7, max_retries=4)
+ str_output_parser = StrOutputParser()
86
+
87
+ prompt = hub.pull("jclemens24/rag-prompt")
88
+
89
+ relevance_prompt_template = PromptTemplate.from_template(
90
+ """
91
+ Given the following question and retrieved context, determine if the context is relevant to the question.
92
+ Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
93
+ Return ONLY the numeric score, without any additional text or explanation.
94
+
95
+ Question: {question}
96
+ Retrieved Context: {retrieved_context}
97
+
98
+ Relevance Score:"""
99
+ )
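+ # The chain below grades its own retrieval: the same LLM scores how relevant
+ # the retrieved context is, and low-scoring answers are replaced with
+ # "I don't know." downstream in conditional_answer.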
+
+ def format_docs(docs):
+     return "\n\n".join(doc.page_content for doc in docs)
+
+ def extract_score(llm_output):
+     try:
+         return float(llm_output.strip())
+     except ValueError:
+         return 0
+
+ def conditional_answer(x):
+     relevance_score = extract_score(x["relevance_score"])
+     return "I don't know." if relevance_score < 4 else x["answer"]
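+ # If the grader returns anything non-numeric, extract_score falls back to 0,
+ # so a malformed score is treated the same as an irrelevant context.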
+
+ # RAG pipeline: score relevance and draft an answer in parallel, then gate
+ # the answer on the relevance score
+ rag_chain_from_docs = (
+     RunnablePassthrough.assign(context=lambda x: format_docs(x["context"]))
+     | RunnableParallel(
+         {
+             "relevance_score": (
+                 RunnablePassthrough()
+                 | (lambda x: relevance_prompt_template.format(question=x["question"], retrieved_context=x["context"]))
+                 | llm
+                 | str_output_parser
+             ),
+             "answer": (
+                 RunnablePassthrough()
+                 | prompt
+                 | llm
+                 | str_output_parser
+             ),
+         }
+     )
+     | RunnablePassthrough().assign(final_answer=conditional_answer)
+ )
+
+ # Run retrieval and question passthrough in parallel, then attach the answer
+ rag_chain_with_source = RunnableParallel(
+     {"context": retriever, "question": RunnablePassthrough()}
+ ).assign(answer=rag_chain_from_docs)
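+ # Quick sanity check outside Gradio (a hypothetical example question):
+ #   result = rag_chain_with_source.invoke("What are the Sustainable Development Goals?")
+ #   print(result["answer"]["final_answer"])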
+
+ async def process_question(question):
+     try:
+         result = await rag_chain_with_source.ainvoke(question)
+         final_answer = result["answer"]["final_answer"]
+         sources = [doc.metadata.get("source") for doc in result["context"]]
+         source_list = ", ".join(sources)
+         return final_answer, source_list
+     except Exception as e:
+         return f"Error: {str(e)}", "Error retrieving sources"
149
+
150
+ # Gradio
151
+ print("Gradio interface...")
152
+ demo = gr.Interface(
153
+ fn=process_question,
154
+ inputs=gr.Textbox(label="Enter your question", value=""),
155
+ outputs=[
156
+ gr.Textbox(label="Answer"),
157
+ gr.Textbox(label="Sources"),
158
+ ],
159
+ title="RAG Question Answering",
160
+ description="Enter a question and get an answer from the PDFs.",
161
+ )
162
+
163
+ if __name__ == "__main__":
164
+ print("Launching the application...")
165
+ demo.queue().launch(share=True,debug=True)
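+ # Note: share=True asks Gradio for a temporary public gradio.live link; on a
+ # Hugging Face Space the app is already served publicly, so it is optional.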
requirements.txt ADDED
@@ -0,0 +1,11 @@
+
+ langchain
+ langchain-text-splitters
+ langchain-huggingface
+ langchain-groq
+ python-dotenv
+ langchain-community
+ pypdf
+ gradio
+ langchain-pinecone
+ nest_asyncio