# Spaces: Sleeping  (appears to be page-header residue from where this file was captured; not part of the program)
import os | |
import faiss | |
import streamlit as st | |
from PyPDF2 import PdfReader | |
from sentence_transformers import SentenceTransformer | |
from groq import Groq | |
from dotenv import load_dotenv | |
import requests | |
from io import BytesIO | |
# Google Drive download URLs of the PDFs this app works with
PDF_LINKS = [
    "https://drive.google.com/uc?id=1JPf0XvDhn8QoDOlZDrxCOpu4WzKFESNz",
    # Add more Google Drive links here
]
# Initialize Groq client.
# The API key is taken from the GROQ_API_KEY environment variable (optionally
# populated from a local .env file by load_dotenv) rather than being embedded
# in the source. A key that has been committed to a file is readable by anyone
# who can see that file and should be revoked and replaced.
load_dotenv()
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Load Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize FAISS
dimension = 384  # Embedding size for the Sentence Transformer model
index = faiss.IndexFlatL2(dimension)

# Store chunks globally. Texts are appended in the same order as their
# vectors are added to `index`, so a FAISS result id is also a position here.
stored_chunks = []
# Function to download and extract the PDF content
def download_and_process_pdf(link, chunk_size=500, timeout=30):
    """Download a PDF, split its text into chunks and add them to the index.

    The text of all pages is concatenated and cut into consecutive blocks of
    ``chunk_size`` characters. The blocks are embedded with the module-level
    ``model`` and added to the module-level ``index``; the block texts are
    appended to ``stored_chunks`` in the same order.

    Download failures and PDFs without extractable text are reported with
    ``print`` and leave ``index`` and ``stored_chunks`` untouched, so one bad
    link does not stop the remaining links from being processed.

    Args:
        link: URL of the PDF to fetch.
        chunk_size: Number of characters per chunk (must be positive).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    try:
        # Without a timeout an unresponsive server would block start-up forever.
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download PDF from link: {link} ({exc})")
        return

    if response.status_code != 200:
        print(f"Failed to download PDF from link: {link}")
        return

    pdf_reader = PdfReader(BytesIO(response.content))
    # A page may yield no text (e.g. scanned images); treat that as empty
    # instead of failing on the concatenation.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    if not chunks:
        # Nothing to embed; adding an empty batch to the index would fail.
        print(f"No extractable text in PDF from link: {link}")
        return

    embeddings = model.encode(chunks)
    index.add(embeddings)
    stored_chunks.extend(chunks)
# Ingest each configured PDF, one link at a time
for link in PDF_LINKS:
    download_and_process_pdf(link)
# Function to query FAISS and generate a response
def query_model(query, k=3):
    """Answer ``query`` using the most similar indexed chunks as context.

    The query is embedded with the module-level ``model``, the nearest chunks
    are looked up in ``index``, joined with spaces, and sent together with the
    query to the Groq chat completion API.

    Args:
        query: The user's question.
        k: Maximum number of chunks to retrieve as context.

    Returns:
        The text content of the first completion choice, or a short notice
        when the index holds no chunks.
    """
    # Never request more neighbours than the index holds: FAISS fills missing
    # results with -1, which would silently pick stored_chunks[-1].
    top_k = min(k, index.ntotal)
    if top_k <= 0:
        return "No document content is available to answer this question."

    query_vector = model.encode([query])
    _, indices = index.search(query_vector, k=top_k)
    # Defensive filter in case any padded (-1) ids are still returned.
    response_chunks = [stored_chunks[idx] for idx in indices[0] if idx >= 0]
    context = " ".join(response_chunks)

    # Groq API call
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Context: {context}\n\nQuery: {query}",
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content
# Streamlit page: heading, short notice, and a single question box
st.title("RAG-based PDF Question Answering")
st.write("Preloaded documents from Google Drive are ready for querying.")

query = st.text_input("Ask a question:")

# Nothing is rendered below the input until a non-empty question is entered.
if query:
    # The answer is computed first, so the heading is only shown on success.
    answer = query_model(query)
    st.write("### Answer:")
    st.write(answer)