# Spaces: Emerging-Tech / docreader
# Status: Runtime error
# File size: 6,807 Bytes

import os

import streamlit as st
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain import OpenAI, VectorDBQA
from langchain.chains import RetrievalQAWithSourcesChain
import PyPDF2

# Read the OpenAI API key from the environment. When it is missing, show a
# readable message in the app and halt the script instead of crashing with a
# raw KeyError traceback.
try:
    api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
    st.error("The OPENAI_API_KEY environment variable is not set.")
    st.stop()

# Extract the text of every uploaded document and label each chunk with its source.
def read_and_textify(files):
    """Return the text chunks of the given files together with source labels.

    Args:
        files: Iterable of binary file-like objects that expose a ``name``
            attribute. Files whose name ends in ``.txt`` are decoded as
            UTF-8 text and treated as a single page; every other file is
            parsed as a PDF, one chunk per page.

    Returns:
        A two-item list ``[text_list, sources_list]``. ``text_list`` holds the
        extracted text chunks and ``sources_list`` the matching labels in the
        form ``"<file name>_page_<zero-based page index>"``. Both lists always
        have the same length. Chunks without any text are left out.
    """
    text_list = []
    sources_list = []
    for file in files:
        if file.name.lower().endswith(".txt"):
            # The uploader in this script also accepts plain-text files,
            # which PdfReader cannot parse.
            file.seek(0)  # rewind in case the stream was read before
            raw = file.read()
            if isinstance(raw, bytes):
                raw = raw.decode("utf-8", errors="replace")
            page_texts = [raw]
        else:
            pdf_reader = PyPDF2.PdfReader(file)
            page_texts = []
            for page in pdf_reader.pages:
                page_texts.append(page.extract_text())
                # Kept from the original: empties the page object once its
                # text has been read. NOTE(review): presumably meant to
                # release memory -- confirm it is still needed.
                page.clear()

        for index, text in enumerate(page_texts):
            # Skip pages with no extractable text (e.g. image-only pages);
            # the index still reflects the page's position in the file.
            if not text or not text.strip():
                continue
            text_list.append(text)
            sources_list.append(file.name + "_page_" + str(index))
    return [text_list, sources_list]
  
# Page setup. set_page_config is the first Streamlit call of this section.
st.set_page_config(layout="centered", page_title="Multidoc_QnA")
st.header("Multidoc_QnA")
st.write("---")

# File uploader
uploaded_files = st.file_uploader(
    "Upload documents", accept_multiple_files=True, type=["txt", "pdf"]
)
st.write("---")

# With accept_multiple_files=True the uploader yields an empty list when
# nothing has been uploaded, so test for emptiness rather than for None.
if not uploaded_files:
    st.info("Upload files to analyse")
else:
    st.write(str(len(uploaded_files)) + " document(s) loaded..")

    documents, sources = read_and_textify(uploaded_files)

    # Building a vector store from zero texts cannot succeed; tell the user
    # and halt this run instead of surfacing a library traceback.
    if not documents:
        st.warning("No text could be extracted from the uploaded document(s).")
        st.stop()

    # Extract embeddings.
    # NOTE(review): this section runs again on every Streamlit rerun, so the
    # documents are re-embedded on each interaction -- consider caching.
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    # Vector store with metadata; the metadata records the source page labels.
    vStore = Chroma.from_texts(
        documents, embeddings, metadatas=[{"source": s} for s in sources]
    )

    # Deciding model
    model_name = "gpt-3.5-turbo"
    # model_name = "gpt-4"

    # Retrieve the two closest chunks for each question.
    retriever = vStore.as_retriever(search_kwargs={"k": 2})

    # Initiate model
    # NOTE(review): "gpt-3.5-turbo" is a chat model; confirm that the OpenAI
    # completion wrapper is the intended class for it.
    llm = OpenAI(model_name=model_name, openai_api_key=api_key, streaming=True)
    model = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm, chain_type="stuff", retriever=retriever
    )

    st.header("Ask your data")
    user_q = st.text_area("Enter your questions here")

    if st.button("Get Response"):
        if not user_q.strip():
            # Avoid a paid model call for an empty question.
            st.warning("Please enter a question first.")
        else:
            try:
                with st.spinner("Model is working on it..."):
                    result = model({"question": user_q}, return_only_outputs=True)
                    st.subheader('Your response:')
                    st.write(result['answer'])
                    st.subheader('Source pages:')
                    st.write(result['sources'])
            except Exception as e:
                # Top-level UI boundary: report any failure to the user.
                st.error(f"An error occurred: {e}")
                st.error('Oops, the GPT response resulted in an error :( Please try again with a different question.')
      
        
    
  
  
  
  
  
  
  
  
  
























# import gradio as gr
# import streamlit as st
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.vectorstores import Chroma
# from langchain.chains import ConversationalRetrievalChain
# from langchain.chat_models import ChatOpenAI
# from langchain.document_loaders import PyPDFLoader
# import os
# import fitz
# from PIL import Image


# # Global variables
# COUNT, N = 0, 0
# chat_history = []
# chain = None  # Initialize chain as None

# # Function to set the OpenAI API key

# api_key = os.environ['OPENAI_API_KEY']

# st.write(api_key)

    
# # Function to enable the API key input box
# def enable_api_box():
#     return enable_box

# # Function to add text to the chat history
# def add_text(history, text):
#     if not text:
#         raise gr.Error('Enter text')
#     history = history + [(text, '')]
#     return history

# # Function to process the PDF file and create a conversation chain
# def process_file(file):
#     global chain
#     if 'OPENAI_API_KEY' not in os.environ:
#         raise gr.Error('Upload your OpenAI API key')

#     # Replace with your actual PDF processing logic
#     loader = PyPDFLoader(file.name)
#     documents = loader.load()
#     embeddings = OpenAIEmbeddings()
#     pdfsearch = Chroma.from_documents(documents, embeddings)

#     chain = ConversationalRetrievalChain.from_llm(ChatOpenAI(temperature=0.3),
#                                     retriever=pdfsearch.as_retriever(search_kwargs={"k": 1}),
#                                     return_source_documents=True)
#     return chain

# # Function to generate a response based on the chat history and query
# def generate_response(history, query, pdf_upload):
#     global COUNT, N, chat_history, chain
#     if not pdf_upload:
#         raise gr.Error(message='Upload a PDF')

#     if COUNT == 0:
#         chain = process_file(pdf_upload)
#         COUNT += 1

#     # Replace with your LangChain logic to generate a response 
#     result = chain({"question": query, 'chat_history': chat_history}, return_only_outputs=True)  
#     chat_history += [(query, result["answer"])]
#     N = list(result['source_documents'][0])[1][1]['page']  # Adjust as needed

#     for char in result['answer']:
#         history[-1][-1] += char  
#     return history, ''  

# # Function to render a specific page of a PDF file as an image
# def render_file(file):
#     global N
#     doc = fitz.open(file.name)
#     page = doc[N]
#     pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72)) 
#     image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
#     return image

# # Function to render initial content from the PDF
# def render_first(pdf_file): 
#     # Replace with logic to process the PDF and generate an initial image
#     image = Image.new('RGB', (600, 400), color = 'white') # Placeholder
#     return image

# # Streamlit & Gradio Interface

# st.title("PDF-Powered Chatbot") 

# with st.container():      
#   gr.Markdown("""     
#   <style>       
#   .image-container { height: 680px; }     
#   </style>     
#   """)    

# with gr.Blocks() as demo:
#     pdf_upload1 = gr.UploadButton("📁 Upload PDF 1", file_types=[".pdf"])  # Define pdf_upload1

#     # ... (rest of your interface creation)

#     txt = gr.Textbox(label="Enter your query", placeholder="Ask a question...")     
#     submit_btn = gr.Button('Submit')

#     @submit_btn.click()
#     def on_submit():
#       add_text(chatbot, txt)
#       generate_response(chatbot, txt, pdf_upload1)  # Use pdf_upload1 here
#       render_file(pdf_upload1)  # Use pdf_upload1 here

# if __name__ == "__main__":
#     gr.Interface(         
#         fn=generate_response,
#         inputs=[
#             "file",  # Define pdf_upload1
#             "text",  # Define chatbot output
#             "text"   # Define txt
#         ],
#         outputs=[
#             "image",  # Define show_img
#             "text",   # Define chatbot output
#             "text"    # Define txt
#         ],   
#         title="PDF-Powered Chatbot"     
#     ).launch()