import streamlit as st
from PyPDF2 import PdfReader
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
import cassio
from dotenv import load_dotenv
import os

# === Environment variables ===
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# AstraDB table that stores the embedded chunks (placeholder name)
TABLE_NAME = "pdf_query_table"

# === Streamlit UI Setup ===
st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
st.title("📄💬 Query PDF using LangChain + AstraDB (Free Hugging Face Models)")

# === File Upload ===
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    st.success("✅ PDF uploaded successfully!")
    process_button = st.button("🔄 Process PDF")

    if process_button:
        # Initialize the AstraDB connection
        cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

        # Read PDF contents page by page
        pdf_reader = PdfReader(uploaded_file)
        raw_text = ""
        for page in pdf_reader.pages:
            content = page.extract_text()
            if content:
                raw_text += content

        # Split text into overlapping chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=800,
            chunk_overlap=200,
            length_function=len,
        )
        texts = text_splitter.split_text(raw_text)

        # === Embeddings ===
        embedding = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # === Hugging Face LLM ===
        llm = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
            huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
            model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
        )

        # === Create vector store and index ===
        vector_store = Cassandra(
            embedding=embedding,
            table_name=TABLE_NAME,
            session=None,   # use the session created by cassio.init
            keyspace=None,  # use the database's default keyspace
        )
        vector_store.add_texts(texts[:50])  # embed at most the first 50 chunks
        st.success(f"📚 {len(texts[:50])} chunks embedded and stored in AstraDB.")

        astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)

        # === Ask Questions ===
        st.header("🤖 Ask a question about your PDF")
        user_question = st.text_input("💬 Type your question here")

        if user_question:
            with st.spinner("Thinking..."):
                answer = astra_vector_index.query(user_question, llm=llm).strip()
                st.markdown(f"### 🧠 Answer:\n{answer}")

                st.markdown("### 🔍 Top Relevant Chunks")
                docs = vector_store.similarity_search_with_score(user_question, k=4)
                for i, (doc, score) in enumerate(docs, 1):
                    st.markdown(f"**Chunk {i}** — Relevance Score: `{score:.4f}`")
                    st.code(doc.page_content[:500], language="markdown")
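
Because the embedded chunks are stored in AstraDB rather than in the Streamlit session, the same table can also be queried from a plain script, outside the app. Below is a minimal sketch, assuming the three environment variables read by the app are set (for example via a .env file) and that the table was already populated by a previous "Process PDF" run; the table name here is the same placeholder used above.

import os

import cassio
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.cassandra import Cassandra

load_dotenv()

# Reuse the same AstraDB connection the Streamlit app initializes.
cassio.init(
    token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
    database_id=os.getenv("ASTRA_DB_ID"),
)

# Same embedding model as the app, so query vectors match the stored vectors.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vector_store = Cassandra(
    embedding=embedding,
    table_name="pdf_query_table",  # must match TABLE_NAME used by the app
    session=None,
    keyspace=None,
)

# Retrieve the chunks most similar to an ad-hoc question.
results = vector_store.similarity_search_with_score("What is the document about?", k=4)
for doc, score in results:
    print(f"{score:.4f}  {doc.page_content[:120]}")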