|
import os |
|
import streamlit as st |
|
from PyPDF2 import PdfReader |
|
from sentence_transformers import SentenceTransformer |
|
import faiss |
|
import numpy as np |
|
from groq import Groq |
|
|
|
# SECURITY: never hard-code API keys in source — the previous revision embedded
# a live Groq key, which must be considered compromised and rotated.
# The key is now read from the environment (e.g. `export GROQ_API_KEY=...`).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY environment variable is not set.")
    st.stop()

client = Groq(api_key=GROQ_API_KEY)

# Shared sentence-embedding model used for both document chunks and queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# all-MiniLM-L6-v2 produces 384-dimensional vectors; the FAISS index
# dimensionality must match the encoder's output size.
dimension = 384
index = faiss.IndexFlatL2(dimension)
|
|
|
|
|
def extract_text_from_pdf(file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file: A path or binary file-like object accepted by ``PdfReader``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        str: The concatenated text of all pages. Pages with no extractable
        text (e.g. scanned images) contribute an empty string instead of
        raising.
    """
    reader = PdfReader(file)
    # extract_text() may return None for image-only pages; ``or ""`` guards
    # against a TypeError during concatenation. join() also avoids the
    # quadratic cost of repeated string +=.
    return "".join(page.extract_text() or "" for page in reader.pages)
|
|
|
def create_chunks(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The source text to split.
        chunk_size: Maximum number of words per chunk (default 500).

    Returns:
        list[str]: Chunks in document order; empty input yields an empty list.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
|
|
|
def generate_embeddings(chunks):
    """Encode text chunks into dense vectors with the shared sentence model.

    Args:
        chunks: Sequence of text strings to embed.

    Returns:
        A NumPy array of embeddings (one row per chunk), ready for FAISS.
    """
    # convert_to_numpy=True returns an ndarray directly, avoiding a tensor
    # conversion step before insertion into the FAISS index.
    vectors = embedding_model.encode(chunks, convert_to_numpy=True)
    return vectors
|
|
|
def query_groq(prompt):
    """Send *prompt* to the Groq chat API and return the model's reply text.

    Args:
        prompt: The full prompt string (context plus question).

    Returns:
        str: Content of the first choice in the completion response.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
|
|
|
st.title("RAG-Based Document Query Application")
st.write("Upload a PDF, ask questions, and get AI-powered answers!")

uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
# Default so the query section below can safely test for "no document yet";
# previously `chunks` was undefined when a question was asked before upload,
# raising NameError.
chunks = []
if uploaded_file:
    with st.spinner("Processing the PDF..."):
        pdf_text = extract_text_from_pdf(uploaded_file)
    st.success("PDF processed successfully!")

    chunks = create_chunks(pdf_text)
    st.info(f"Document split into {len(chunks)} chunks.")

    embeddings = generate_embeddings(chunks)
    index.add(np.array(embeddings))
    st.success("Embeddings stored in FAISS database.")

user_query = st.text_input("Ask a question:")
if user_query:
    if not chunks:
        # Guard: searching an empty index is meaningless and indexing into
        # an empty chunk list would crash.
        st.warning("Please upload a PDF before asking a question.")
    else:
        with st.spinner("Searching and generating a response..."):
            query_embedding = embedding_model.encode([user_query], convert_to_numpy=True)

            # Clamp k to the number of stored chunks; FAISS pads missing
            # neighbours with index -1, which previously aliased to
            # chunks[-1] and produced duplicate context.
            k = min(5, len(chunks))
            distances, indices = index.search(np.array(query_embedding), k=k)
            relevant_chunks = [chunks[i] for i in indices[0] if i != -1]

            context = " ".join(relevant_chunks)
            prompt = f"Context: {context}\n\nQuestion: {user_query}\n\nAnswer:"
            answer = query_groq(prompt)

        st.write("### Answer:")
        st.write(answer)
|
|