# Spaces: Runtime error
import gradio as gr | |
import pdfplumber | |
import os | |
from langchain.schema import Document | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Pinecone | |
import pinecone | |
import pandas as pd | |
import time | |
from pinecone.grpc import PineconeGRPC as Pinecone | |
from pinecone import ServerlessSpec | |
from langchain_pinecone import PineconeVectorStore | |
from datetime import datetime | |
import os | |
from langchain.document_loaders import PyPDFLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.vectorstores import Pinecone | |
from typing import TypedDict,List | |
from langgraph.graph import StateGraph | |
from langgraph.prebuilt import ToolNode | |
from langchain.schema import Document | |
from langchain.prompts import PromptTemplate | |
from langchain.tools import Tool | |
from langchain.llms import OpenAI | |
# OpenAI API key, read from the environment (None when the variable is unset).
openai_api_key = os.environ.get("OPENAI_API_KEY")

# OpenAI embedding model; handed to the Pinecone vector store below.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)

# Rebind `Pinecone` to the Pinecone client class; earlier imports bound the
# same name to other objects.
from pinecone import Pinecone
# pc = Pinecone(api_key=os.environ['PINECONE_API_KEY']) | |
# # Define index name and parameters | |
# index_name = "italy-kg" | |
# vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings) | |
#Dynamic Pinecone Creation | |
# Function to initialize Pinecone dynamically and create index if it doesn't exist | |
def init_pinecone(api_key, index_name):
    """Connect to Pinecone and return a vector store for *index_name*.

    A serverless index (AWS us-east-1, 1536 dimensions, cosine metric) is
    created first when no index with that name exists.

    Args:
        api_key: Pinecone API key used for both the client and the store.
        index_name: Name of the Pinecone index to use or create.

    Returns:
        A ``PineconeVectorStore`` bound to the index, embedding with the
        module-level ``embeddings``.

    Raises:
        ValueError: If *api_key* or *index_name* is empty or blank.
        TimeoutError: If a newly created index is not ready within 60 seconds.
    """
    api_key = (api_key or "").strip()
    index_name = (index_name or "").strip()
    if not api_key or not index_name:
        raise ValueError("Both a Pinecone API key and an index name are required.")

    # Imported locally under an unambiguous alias: the module-level name
    # `Pinecone` is rebound by several import lines in this file.
    from pinecone import Pinecone as PineconeClient, ServerlessSpec

    # NOTE: the legacy `pinecone.init(...)` call is intentionally gone; the
    # client object below is the supported entry point.
    pc = PineconeClient(api_key=api_key)

    # list_indexes() yields index descriptions, so compare against the names.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            deletion_protection="disabled",
        )
        # Wait until the new index can accept writes before returning.
        deadline = time.monotonic() + 60
        while not pc.describe_index(index_name).status["ready"]:
            if time.monotonic() > deadline:
                raise TimeoutError(
                    f"Pinecone index '{index_name}' was not ready after 60 seconds."
                )
            time.sleep(1)

    # Pass the key explicitly so the store does not depend on the
    # PINECONE_API_KEY environment variable being set.
    return PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings,
        pinecone_api_key=api_key,
    )
# OpenAI completion model (temperature 0) used by the summarize and generate tools.
llm = OpenAI(openai_api_key=openai_api_key, temperature=0)
# Tool functions | |
def search_vector_db(query: str, k: int = 3) -> "List[Document]":
    """Return the *k* documents most similar to *query*.

    The vector store is looked up at call time from the module-level name
    ``vectorstore``, because it only exists once Pinecone has been
    initialized.

    Args:
        query: Text to search for.
        k: Maximum number of documents to return.

    Returns:
        The documents returned by the store's ``similarity_search``.

    Raises:
        RuntimeError: If no module-level ``vectorstore`` has been set yet.
    """
    store = globals().get("vectorstore")
    if store is None:
        raise RuntimeError(
            "Vector store is not initialized. Provide a Pinecone index name "
            "and API key and upload a PDF first."
        )
    return store.similarity_search(query, k=k)
def expand_query(query: str) -> str:
    """Return the query to search with; currently the input, unchanged."""
    expanded = query
    return expanded
def summarize_context(context: str) -> str:
    """Summarize *context* with the module-level LLM.

    Args:
        context: Text to condense.

    Returns:
        The stripped summary, or an empty string when *context* is empty or
        whitespace-only (no LLM call is made in that case).
    """
    # Nothing to summarize: skip the LLM call instead of prompting on empty text.
    if not context or not context.strip():
        return ""

    # from_template derives the input variables from the template itself.
    prompt = PromptTemplate.from_template(
        """Summarize the following Context to provide a concise overview: {context}"""
    )
    # invoke() replaces the deprecated direct call `llm(...)`.
    summary = llm.invoke(prompt.format(context=context))
    return summary.strip()
def generate_response(context: str, query: str) -> str:
    """Answer *query* with the module-level LLM, given *context*.

    Args:
        context: Supporting text placed in the prompt.
        query: The user's question.

    Returns:
        The LLM's answer with surrounding whitespace removed.
    """
    # from_template derives the input variables from the template itself.
    prompt = PromptTemplate.from_template(
        """Question: {question}\nContext: {context}\nAnswer:"""
    )
    formatted_prompt = prompt.format(context=context, question=query)
    # invoke() replaces the deprecated direct call `llm(...)`.
    response = llm.invoke(formatted_prompt)
    return response.strip()
# Tool objects | |
# Each Tool wraps one of the plain functions above; the workflow nodes call
# the wrapped function through the tool's `.func` attribute.

# Query expansion.
expand_tool = Tool(
    name="Expand Query",
    description="Enhance the query with additional terms or context",
    func=expand_query,
)

# Context summarization.
summarize_tool = Tool(
    name="Summarize Context",
    description="Summarize the context to provide a concise overview",
    func=summarize_context,
)

# Similarity search against the vector database.
search_tool = Tool(
    name="Search Vector Database",
    description="Search the vector database for relevant information",
    func=search_vector_db,
)

# Final answer generation.
generate_tool = Tool(
    name="Generate Response",
    description="Generate a response based on the context and query",
    func=generate_response,
)
# State for the graph | |
class State(TypedDict):
    """Data passed between the nodes of the workflow graph."""

    # The user's original question.
    question: str
    # Documents returned by the search node.
    context: List[Document]
    # Final answer written by the generate node.
    response: str
    # Query produced by the expand node and used for the search.
    expanded_query: str
    # Summary of the retrieved documents written by the summarize node.
    summarized_context: str
# Workflow node definitions | |
def expand(state: State) -> State:
    """Store the expanded form of the question under ``expanded_query``."""
    question = state["question"]
    state["expanded_query"] = expand_tool.func(question)
    return state
def search(state: State) -> State:
    """Search with the expanded query and store the hits under ``context``."""
    retrieved = search_tool.func(state["expanded_query"])
    state["context"] = retrieved
    # Log the first 100 characters of each hit.
    previews = [doc.page_content[:100] for doc in retrieved]
    print(f"Retrieved Documents: {previews}")
    return state
def summarize(state: State) -> State:
    """Summarize the retrieved documents into ``summarized_context``."""
    documents = state["context"]
    if documents:
        joined_text = " ".join(doc.page_content for doc in documents)
    else:
        joined_text = ""
    overview = summarize_tool.func(joined_text)
    state["summarized_context"] = overview
    print(f"Summarized Context: {overview}")
    return state
def generate(state: State) -> State:
    """Answer the question from the summarized context into ``response``."""
    answer = generate_tool.func(state["summarized_context"], state["question"])
    state["response"] = answer
    print(f"Generated Response: {answer}")
    return state
# Workflow graph | |
workflow = StateGraph(State)

# Nodes, listed in the order they run.
_pipeline = (
    ("expand", expand),
    ("search", search),
    ("summarize", summarize),
    ("generate", generate),
)
for _node_name, _node_fn in _pipeline:
    workflow.add_node(_node_name, _node_fn)

workflow.set_entry_point("expand")

# Linear chain: expand -> search -> summarize -> generate.
for (_source, _), (_target, _) in zip(_pipeline, _pipeline[1:]):
    workflow.add_edge(_source, _target)

workflow.set_finish_point("generate")
graph = workflow.compile()
# Function to run the graph | |
def run_graph(question: str):
    """Run the compiled workflow graph for *question* and return its answer.

    Args:
        question: The user's query text.

    Returns:
        The ``response`` value of the final graph state, or a short prompt
        asking for input when *question* is empty or whitespace-only.
    """
    # Avoid a retrieval round-trip and LLM calls for an empty query.
    if not question or not question.strip():
        return "Please enter a query."
    result = graph.invoke({"question": question})
    return result["response"]
# Function to clear the input and response | |
def clear_inputs():
    """Return two empty strings: one for the query box, one for the response box."""
    blank = ""
    return blank, blank
# Create a global list to store uploaded document records | |
# Module-level list for upload records. Note that process_pdf takes its own
# list parameter with the same name.
uploaded_documents = list()
# Function to process PDF, extract text, split it into chunks, and upload to the vector DB | |
def _extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, newline-joined."""
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page with no text (depends on
        # the pdfplumber release), so substitute an empty string.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def _upload_history_rows(records):
    """Convert upload-record dicts into rows matching the history table columns."""
    return [
        [rec["Document Name"], rec["Upload Time"], rec["Chunks"], rec["Pinecone Index"]]
        for rec in records
    ]


def process_pdf(pdf_file, uploaded_documents, vectorstore=None, index_name=None):
    """Extract a PDF's text, split it into chunks and upload them to the vector DB.

    Args:
        pdf_file: The uploaded file, either a path string or an object whose
            ``name`` attribute is the path; ``None`` when nothing was uploaded.
        uploaded_documents: List of upload-record dicts; a record for this
            upload is appended in place on success.
        vectorstore: Store to upload into. Falls back to a module-level
            ``vectorstore`` when omitted.
        index_name: Index name recorded in the history. Falls back to a
            module-level ``index_name`` when omitted.

    Returns:
        A ``(rows, message)`` tuple: the upload history as a list of rows for
        the history table, and a status message.
    """
    if pdf_file is None:
        return _upload_history_rows(uploaded_documents), "No PDF file uploaded."

    # Resolve the store and index name, falling back to module-level values.
    if vectorstore is None:
        vectorstore = globals().get("vectorstore")
    if index_name is None:
        index_name = globals().get("index_name") or ""
    if vectorstore is None:
        return (
            _upload_history_rows(uploaded_documents),
            "Pinecone is not initialized. Provide an index name and API key first.",
        )

    # Accept both a plain path and a file-like object exposing `.name`.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    all_text = _extract_pdf_text(pdf_path)

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = text_splitter.split_text(all_text)
    if not chunks:
        return (
            _upload_history_rows(uploaded_documents),
            "No extractable text found in the PDF.",
        )

    # Embed and upload all chunks in one call rather than one call per chunk.
    vectorstore.add_documents([Document(page_content=chunk) for chunk in chunks])

    # Update the upload history (file name only, not the full path).
    uploaded_documents.append(
        {
            "Document Name": os.path.basename(pdf_path),
            "Upload Time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Chunks": len(chunks),
            "Pinecone Index": index_name,
        }
    )
    return (
        _upload_history_rows(uploaded_documents),
        f"Uploaded {len(chunks)} chunks to the vector database.",
    )
# Gradio Interface | |
with gr.Blocks() as demo:
    # Upload history passed to, and mutated by, the PDF handler.
    upload_history_state = gr.State([])

    with gr.Row():
        with gr.Column():
            # Add Pinecone Index and API Key fields side by side
            pinecone_index_input = gr.Textbox(
                label="Pinecone Index Name",
                placeholder="Enter Pinecone Index Name",
            )
        with gr.Column():
            # Mask the key so it is not shown in clear text on screen.
            pinecone_api_key_input = gr.Textbox(
                label="Pinecone API Key",
                placeholder="Enter Pinecone API Key",
                type="password",
            )

    with gr.Row():
        with gr.Column():
            response_output = gr.Textbox(label="Response:", lines=10, max_lines=10)
            query_input = gr.Textbox(label="Enter your query:")
            with gr.Row():
                query_button = gr.Button("Get Response")
                clear_button = gr.Button("Clear")
            query_button.click(fn=run_graph, inputs=query_input, outputs=response_output)
            # Clear both the query input and the response output.
            clear_button.click(fn=clear_inputs, inputs=[], outputs=[query_input, response_output])

        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            document_table = gr.Dataframe(
                headers=["Document Name", "Upload Time", "Chunks", "Pinecone Index"],
                interactive=False,
            )
            output_textbox = gr.Textbox(label="Result")
            process_button = gr.Button("Process PDF and Upload")

    def process_with_dynamic_pinecone(pdf_file, uploaded_documents, pinecone_index_name, pinecone_api_key):
        """Initialize Pinecone from the form fields, then process the uploaded PDF.

        Publishes the vector store and index name as module-level
        ``vectorstore`` / ``index_name`` so that both the upload step and
        later queries use the index the user selected.

        Returns:
            The ``(table rows, status message)`` pair for the history table
            and the result box.
        """
        global vectorstore, index_name

        if pdf_file is None:
            # Nothing to upload: skip connecting to Pinecone.
            return process_pdf(pdf_file, uploaded_documents)

        if not (pinecone_index_name or "").strip() or not (pinecone_api_key or "").strip():
            rows = [
                [doc["Document Name"], doc["Upload Time"], doc["Chunks"], doc["Pinecone Index"]]
                for doc in uploaded_documents
            ]
            return rows, "Please enter both the Pinecone index name and API key."

        vectorstore = init_pinecone(pinecone_api_key, pinecone_index_name)
        index_name = pinecone_index_name
        return process_pdf(pdf_file, uploaded_documents)

    # Single click handler: a second registration that called process_pdf
    # directly would run before Pinecone is initialized.
    process_button.click(
        fn=process_with_dynamic_pinecone,
        inputs=[file_input, upload_history_state, pinecone_index_input, pinecone_api_key_input],
        outputs=[document_table, output_textbox],
    )

demo.launch(show_error=True)