import os

import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from groq import Groq

# Read the API key from the environment instead of hardcoding a secret in source
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)

# Initialize the embedding model (cached so Streamlit reruns don't reload it)
@st.cache_resource
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")  # Open-source lightweight model

embedding_model = load_embedding_model()

# Initialize FAISS index
dimension = 384  # Embedding size of all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)


# Helper Functions
def extract_text_from_pdf(file):
    """Extract text from a PDF file."""
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text


def create_chunks(text, chunk_size=500):
    """Split text into chunks of a specified number of words."""
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def generate_embeddings(chunks):
    """Generate embeddings for the given chunks."""
    return embedding_model.encode(chunks, convert_to_numpy=True)


def query_groq(prompt):
    """Query the Groq model for a response."""
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content


# Streamlit App
st.title("RAG-Based Document Query Application")
st.write("Upload a PDF, ask questions, and get AI-powered answers!")

# Step 1: PDF Upload
uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])

if uploaded_file:
    with st.spinner("Processing the PDF..."):
        # Extract text
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.success("PDF processed successfully!")

        # Step 2: Chunking
        chunks = create_chunks(pdf_text)
        st.info(f"Document split into {len(chunks)} chunks.")

        # Step 3: Embedding Creation
        embeddings = generate_embeddings(chunks)
        index.add(np.array(embeddings))
        st.success("Embeddings stored in FAISS index.")

    # Step 4: Query
    user_query = st.text_input("Ask a question:")
    if user_query:
        with st.spinner("Searching and generating a response..."):
            # Embed the user query
            query_embedding = embedding_model.encode([user_query], convert_to_numpy=True)

            # Search FAISS; never request more neighbors than stored chunks,
            # and drop the -1 placeholders FAISS returns for missing results
            k = min(5, len(chunks))
            distances, indices = index.search(np.array(query_embedding), k)
            relevant_chunks = [chunks[i] for i in indices[0] if i != -1]

            # Combine retrieved chunks as context
            context = " ".join(relevant_chunks)

            # Query the Groq model
            prompt = f"Context: {context}\n\nQuestion: {user_query}\n\nAnswer:"
            answer = query_groq(prompt)

            # Display the response
            st.write("### Answer:")
            st.write(answer)
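
# --- Usage (a minimal sketch; assumes this file is saved as app.py) ---
# Install the dependencies, export the key the code above reads via
# os.environ (substitute your own key from the Groq console), then
# launch the Streamlit server:
#
#   pip install streamlit PyPDF2 sentence-transformers faiss-cpu numpy groq
#   export GROQ_API_KEY="..."
#   streamlit run app.py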