"""Streamlit RAG application: answer questions about an uploaded PDF.

Pipeline: extract text from the PDF -> split into overlapping chunks ->
embed with a HuggingFace sentence-transformer -> index in FAISS ->
retrieve the top-k chunks for a user query -> generate an answer with a
Groq-hosted LLM.
"""

import os

import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq

# SECURITY: never hardcode API keys in source control (the original file
# embedded a literal `gsk_...` key — it should be considered compromised
# and rotated). Read the key from the environment instead.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))

st.title("RAG-based PDF QA Application")

# Step 1: Upload PDF document
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

if uploaded_file:
    # Step 2: Extract text from PDF. Call extract_text() exactly once per
    # page (it re-parses the page content stream, so calling it twice —
    # once for the value and once for the truthiness filter — doubles the
    # extraction cost). Pages with no extractable text yield None/"" and
    # are skipped.
    try:
        pdf_reader = PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        text = "\n".join(t for t in page_texts if t)
    except Exception as e:
        st.error(f"Failed to read PDF: {e}")
        text = ""

    if text:
        # Step 3: Split text into overlapping chunks sized for retrieval.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )
        chunks = text_splitter.split_text(text)

        # Step 4: Generate embeddings and build the FAISS index.
        st.text("Generating embeddings...")
        vector_db = None  # sentinel: stays None if embedding fails
        try:
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            vector_db = FAISS.from_texts(chunks, embeddings)
            st.success("Embeddings generated and stored in vector database.")
        except Exception as e:
            st.error(f"Error generating embeddings: {e}")

        # Step 5: User interaction. Guard on vector_db so a failed
        # embedding step cannot produce a NameError in the query branch
        # (the original referenced vector_db unconditionally).
        query = st.text_input("Ask a question based on the uploaded document:")
        if query and vector_db is not None:
            try:
                # Retrieve the 3 most similar chunks as grounding context.
                docs = vector_db.similarity_search(query, k=3)
                context = "\n".join(doc.page_content for doc in docs)

                # RAG prompt fix: the retrieved context must reach the
                # model as grounding material in the system/user messages.
                # The original sent it as an *assistant* message, which the
                # model interprets as its own prior reply — the question
                # then arrives with no usable context.
                chat_completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a helpful assistant. Answer the "
                                "user's question using only the provided "
                                "context."
                            ),
                        },
                        {
                            "role": "user",
                            "content": f"Context:\n{context}\n\nQuestion: {query}",
                        },
                    ],
                    model="llama3-8b-8192",
                    stream=False,
                )
                answer = chat_completion.choices[0].message.content
                st.text_area("Answer:", value=answer, height=200)
            except Exception as e:
                st.error(f"Error processing query: {e}")

# Footer
st.caption("Powered by Open Source Models and Groq API.")