Pijush2023 committed on
Commit
3ce55e9
·
verified ·
1 Parent(s): d0253c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -4
app.py CHANGED
@@ -33,6 +33,8 @@ import torchaudio
33
  import numpy as np
34
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
35
  from langchain_huggingface import HuggingFaceEmbeddings
 
 
36
 
37
 
38
  # Neo4j imports
@@ -126,12 +128,12 @@ gpt4o_mini_model = initialize_gpt4o_mini_model()
126
 
127
  # Existing embeddings and vector store for GPT-4o
128
  gpt_embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
129
- gpt_vectorstore = PineconeVectorStore(index_name="italyopenai", embedding=gpt_embeddings)
130
  gpt_retriever = gpt_vectorstore.as_retriever(search_kwargs={'k': 5})
131
 
132
  # New vector store setup for Phi-3.5
133
  phi_embeddings = embeddings
134
- phi_vectorstore = PineconeVectorStore(index_name="italyopenai", embedding=embeddings)
135
  phi_retriever = phi_vectorstore.as_retriever(search_kwargs={'k': 5})
136
 
137
 
@@ -142,7 +144,8 @@ phi_retriever = phi_vectorstore.as_retriever(search_kwargs={'k': 5})
142
  from pinecone import Pinecone
143
  pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
144
 
145
- index_name = "italyopenai"
 
146
  vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
147
  retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
148
 
@@ -1562,6 +1565,24 @@ def fetch_google_flights(departure_id="JFK", arrival_id="BHM", outbound_date=cur
1562
  # def insert_prompt(current_text, prompt):
1563
  # return prompt[0] if prompt else current_text
1564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1565
 
1566
 
1567
 
@@ -1669,7 +1690,16 @@ with gr.Blocks(theme='gradio/soft') as demo:
1669
  # refresh_button = gr.Button("Refresh Images")
1670
  # refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
1671
 
1672
-
 
 
 
 
 
 
 
 
 
1673
 
1674
 
1675
 
 
33
  import numpy as np
34
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
35
  from langchain_huggingface import HuggingFaceEmbeddings
36
+ from langchain_community.document_loaders import PDFPlumberLoader
37
+ import pdfplumber
38
 
39
 
40
  # Neo4j imports
 
128
 
129
  # Existing embeddings and vector store for GPT-4o
130
  gpt_embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
131
+ gpt_vectorstore = PineconeVectorStore(index_name="italy-pdf", embedding=gpt_embeddings)
132
  gpt_retriever = gpt_vectorstore.as_retriever(search_kwargs={'k': 5})
133
 
134
  # New vector store setup for Phi-3.5
135
  phi_embeddings = embeddings
136
+ phi_vectorstore = PineconeVectorStore(index_name="italy-pdf", embedding=embeddings)
137
  phi_retriever = phi_vectorstore.as_retriever(search_kwargs={'k': 5})
138
 
139
 
 
144
  from pinecone import Pinecone
145
  pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
146
 
147
+ # index_name = "italyopenai"
148
+ index_name = "italy-pdf"
149
  vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)
150
  retriever = vectorstore.as_retriever(search_kwargs={'k': 5})
151
 
 
1565
  # def insert_prompt(current_text, prompt):
1566
  # return prompt[0] if prompt else current_text
1567
 
1568
# Process an uploaded PDF: extract its text, split it into chunks, and upload them to the vector DB
def process_pdf(pdf_file):
    """Extract text from a PDF, chunk it, and index the chunks in the vector store.

    Args:
        pdf_file: The PDF to read, passed straight to ``pdfplumber.open``.
            ``None`` (no file uploaded) is reported instead of raising.

    Returns:
        A human-readable status message for the result textbox.
    """
    if pdf_file is None:
        return "No PDF file provided."

    with pdfplumber.open(pdf_file) as pdf:
        # Guard against a None/empty result for pages that have no text layer.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join pages with a blank line so the splitter's default "\n\n" separator
    # has boundaries to split on.
    all_text = "\n\n".join(text for text in page_texts if text)

    # Split the text into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(all_text)

    # Embed and upload all chunks in one call. add_texts() takes plain strings;
    # add_documents() expects Document objects.
    # NOTE(review): the original referenced `vector_store`; the store defined at
    # module level in this file is `vectorstore` — confirm this is the intended target.
    if chunks:
        vectorstore.add_texts(chunks)

    return f"Uploaded {len(chunks)} chunks to the vector database."
1586
 
1587
 
1588
 
 
1690
  # refresh_button = gr.Button("Refresh Images")
1691
  # refresh_button.click(fn=update_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])
1692
 
1693
+ # File upload component
1694
+ with gr.Column():
1695
+ file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
1696
+ # Button to trigger processing
1697
+ process_button = gr.Button("Process PDF and Upload")
1698
+ # Output textbox for results
1699
+ output_textbox = gr.Textbox(label="Result")
1700
+
1701
+ # Define button click action
1702
+ process_button.click(fn=process_pdf, inputs=file_input, outputs=output_textbox)
1703
 
1704
 
1705