"""Streamlit app: upload a PDF, index its text in AstraDB, and ask questions.

Flow:
    1. The user uploads a PDF and clicks "Process PDF".
    2. The PDF text is extracted, split into chunks, embedded with a
       Hugging Face embedding model and stored in a fresh AstraDB table.
    3. The user types a question; it is answered by a Hugging Face hosted LLM
       through LangChain's ``VectorStoreIndexWrapper``.

Streamlit re-executes this script on every widget interaction, so everything
that must outlive the "Process PDF" click is kept in ``st.session_state``.
"""
import os
import uuid

import cassio
import streamlit as st
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cassandra import Cassandra
from PyPDF2 import PdfReader

# Upper bound on the number of chunks embedded per PDF.
MAX_CHUNKS = 50

# st.session_state key under which the processed PDF's index is kept.
_INDEX_KEY = "pdf_index"

# Must be the first Streamlit call in the script, hence it precedes the
# secrets check below (which may call st.error).
st.set_page_config(page_title="Query PDF with LangChain", layout="wide")

# Load secrets from the environment (a local .env file is honoured too).
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

_missing_secrets = [
    name
    for name, value in (
        ("ASTRA_DB_APPLICATION_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
        ("ASTRA_DB_ID", ASTRA_DB_ID),
        ("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACEHUB_API_TOKEN),
    )
    if not value
]
if _missing_secrets:
    # Fail with an actionable message instead of passing None downstream.
    st.error(
        "Missing required environment variables: "
        + ", ".join(_missing_secrets)
    )
    st.stop()

# Initialize the AstraDB connection used by the Cassandra vector store.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)


def _extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of *pdf_file*.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)


def _split_into_chunks(raw_text):
    """Split *raw_text* on newlines into overlapping chunks for embedding."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(raw_text)


def _build_vector_store(chunks):
    """Embed *chunks* into a new, uniquely named AstraDB table and return it."""
    embedding = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")
    # One table per processed upload, so documents never mix.
    table_name = "qa_" + str(uuid.uuid4()).replace("-", "_")
    vector_store = Cassandra(
        embedding=embedding,
        table_name=table_name,
        session=None,
        keyspace=None,
    )
    vector_store.add_texts(chunks)
    return vector_store


def _build_llm():
    """Return the Hugging Face Hub LLM used to answer questions."""
    return HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
    )


st.title("📄💬 Query PDF using LangChain + AstraDB (Hugging Face Models)")

# PDF upload
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    st.success("✅ PDF uploaded successfully.")

    # st.button is True only for the single rerun caused by the click, so the
    # results of processing are persisted in st.session_state.
    if st.button("🔄 Process PDF"):
        texts = _split_into_chunks(_extract_pdf_text(uploaded_file))

        if not texts:
            # Drop any previous index so questions are not answered from a
            # different document.
            if _INDEX_KEY in st.session_state:
                del st.session_state[_INDEX_KEY]
            st.error(
                "No extractable text was found in this PDF, "
                "so nothing was indexed."
            )
        else:
            chunks = texts[:MAX_CHUNKS]
            st.session_state[_INDEX_KEY] = {
                "source_name": uploaded_file.name,
                "vector_store": _build_vector_store(chunks),
                "llm": _build_llm(),
            }
            st.success(
                f"📚 {len(chunks)} chunks embedded and stored in AstraDB."
            )
            if len(texts) > MAX_CHUNKS:
                # Make the truncation visible instead of silently dropping text.
                st.info(
                    f"Only the first {MAX_CHUNKS} of {len(texts)} chunks "
                    "were indexed."
                )

    index_state = st.session_state.get(_INDEX_KEY)

    # Only offer Q&A when the stored index belongs to the file currently shown.
    if index_state and index_state["source_name"] == uploaded_file.name:
        vector_store = index_state["vector_store"]
        llm = index_state["llm"]
        astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)

        st.header("🤖 Ask a question about your PDF")
        user_question = st.text_input("💬 Type your question here")

        if user_question:
            with st.spinner("🧠 Thinking..."):
                # UI boundary: report any retrieval/LLM failure to the user
                # rather than crashing the page.
                try:
                    # Retrieval is used only to check that relevant context
                    # exists; the documents themselves are not displayed.
                    retrieved_docs = vector_store.similarity_search(
                        user_question, k=8
                    )
                    if not retrieved_docs:
                        st.warning(
                            "⚠️ No relevant text found. "
                            "Try rephrasing your question."
                        )
                    else:
                        answer = astra_vector_index.query(
                            user_question, llm=llm
                        )
                        if answer.strip():
                            st.markdown("### 🧠 Answer:")
                            st.write(answer.strip())
                        else:
                            st.warning(
                                "⚠️ Model returned an empty response."
                            )
                except Exception as e:
                    st.error(f"🚨 Error: {str(e)}")