kpawargi committed on
Commit
2bc5b24
·
verified ·
1 Parent(s): a310bdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -41
app.py CHANGED
@@ -6,31 +6,32 @@ from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.llms import HuggingFaceHub
7
  from langchain.text_splitter import CharacterTextSplitter
8
  import cassio
9
- from dotenv import load_dotenv
10
  import os
 
 
11
 
 
12
  load_dotenv()
13
-
14
  ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
15
  ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
16
  HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
17
 
18
- # === Streamlit UI Setup ===
19
- st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
20
- st.title("πŸ“„πŸ’¬ Query PDF using LangChain + AstraDB (Free Hugging Face Models)")
21
 
22
- # === File Upload ===
 
 
 
 
23
  uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
24
 
25
  if uploaded_file:
26
- st.success("βœ… PDF uploaded successfully!")
27
  process_button = st.button("πŸ”„ Process PDF")
28
 
29
  if process_button:
30
- # Initialize AstraDB
31
- cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
32
-
33
- # Read PDF contents
34
  pdf_reader = PdfReader(uploaded_file)
35
  raw_text = ""
36
  for page in pdf_reader.pages:
@@ -38,59 +39,56 @@ if uploaded_file:
38
  if content:
39
  raw_text += content
40
 
41
- # Split text into chunks
42
  text_splitter = CharacterTextSplitter(
43
  separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
44
  )
45
  texts = text_splitter.split_text(raw_text)
46
 
47
- # === Embeddings ===
48
- embedding = HuggingFaceEmbeddings(
49
- model_name="sentence-transformers/all-MiniLM-L6-v2"
50
- )
51
 
52
- # === Hugging Face LLM ===
53
  llm = HuggingFaceHub(
54
  repo_id="mistralai/Mistral-7B-Instruct-v0.1",
55
  huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
56
  model_kwargs={"temperature": 0.5, "max_new_tokens": 512}
57
  )
58
 
59
- # === Create vector store and index ===
 
 
 
60
  vector_store = Cassandra(
61
  embedding=embedding,
62
- table_name="qa_mini_demo",
63
  session=None,
64
  keyspace=None,
65
  )
 
66
  vector_store.add_texts(texts[:50])
67
  st.success(f"πŸ“š {len(texts[:50])} chunks embedded and stored in AstraDB.")
68
 
 
69
  astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)
70
 
71
- # === Ask Questions ===
72
  st.header("πŸ€– Ask a question about your PDF")
73
  user_question = st.text_input("πŸ’¬ Type your question here")
74
 
75
  if user_question:
76
- with st.spinner("🧠 Thinking..."):
77
- try:
78
- # Optional: show what documents are retrieved before sending to LLM
79
- retrieved_docs = vector_store.similarity_search(user_question, k=4)
80
- if not retrieved_docs:
81
- st.warning("⚠️ No relevant text chunks found for this question. Try a different question.")
82
- else:
83
- st.markdown("### πŸ” Top Relevant Chunks (raw):")
84
- for i, doc in enumerate(retrieved_docs, 1):
85
- st.code(doc.page_content[:300], language="markdown")
86
-
87
-
88
- answer = astra_vector_index.query(user_question, llm=llm)
89
- if answer.strip():
90
- st.markdown("### 🧠 Answer:")
91
- st.write(answer.strip())
92
- else:
93
- st.warning("⚠️ The model returned an empty response. Try rephrasing the question or check your model/API key.")
94
- except Exception as e:
95
- st.error(f"🚨 Error while generating response:\n\n{str(e)}")
96
-
 
6
  from langchain.llms import HuggingFaceHub
7
  from langchain.text_splitter import CharacterTextSplitter
8
  import cassio
 
9
  import os
10
+ import uuid
11
+ from dotenv import load_dotenv
12
 
13
+ # 🔐 Load secrets from environment (Hugging Face Spaces uses HF Secrets)
14
  load_dotenv()
 
15
  ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
16
  ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
17
  HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
18
 
19
+ # 🧠 Initialize AstraDB
20
+ cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
 
21
 
22
+ # 🎨 Streamlit UI Setup
23
+ st.set_page_config(page_title="Query PDF with LangChain", layout="wide")
24
+ st.title("πŸ“„πŸ’¬ Query PDF using LangChain + AstraDB (Hugging Face Models)")
25
+
26
+ # 📁 PDF Upload
27
  uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
28
 
29
  if uploaded_file:
30
+ st.success("βœ… PDF uploaded successfully.")
31
  process_button = st.button("πŸ”„ Process PDF")
32
 
33
  if process_button:
34
+ # 🧾 Read PDF
 
 
 
35
  pdf_reader = PdfReader(uploaded_file)
36
  raw_text = ""
37
  for page in pdf_reader.pages:
 
39
  if content:
40
  raw_text += content
41
 
42
+ # ✂️ Split into Chunks
43
  text_splitter = CharacterTextSplitter(
44
  separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
45
  )
46
  texts = text_splitter.split_text(raw_text)
47
 
48
+ # 🧠 Embeddings
49
+ embedding = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")
 
 
50
 
51
+ # 🤖 LLM
52
  llm = HuggingFaceHub(
53
  repo_id="mistralai/Mistral-7B-Instruct-v0.1",
54
  huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
55
  model_kwargs={"temperature": 0.5, "max_new_tokens": 512}
56
  )
57
 
58
+ # 🗃️ Unique Table Name for Each PDF Upload
59
+ table_name = "qa_" + str(uuid.uuid4()).replace("-", "_")
60
+
61
+ # 📦 Vector Store Setup
62
  vector_store = Cassandra(
63
  embedding=embedding,
64
+ table_name=table_name,
65
  session=None,
66
  keyspace=None,
67
  )
68
+
69
  vector_store.add_texts(texts[:50])
70
  st.success(f"πŸ“š {len(texts[:50])} chunks embedded and stored in AstraDB.")
71
 
72
+ # 🔍 Setup Index
73
  astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)
74
 
75
+ # 💬 Ask Questions
76
  st.header("πŸ€– Ask a question about your PDF")
77
  user_question = st.text_input("πŸ’¬ Type your question here")
78
 
79
  if user_question:
80
+ with st.spinner("🧠 Thinking..."):
81
+ try:
82
+ # Retrieve relevant context (used internally, not displayed)
83
+ retrieved_docs = vector_store.similarity_search(user_question, k=8)
84
+ if not retrieved_docs:
85
+ st.warning("⚠️ No relevant text found. Try rephrasing your question.")
86
+ else:
87
+ answer = astra_vector_index.query(user_question, llm=llm)
88
+ if answer.strip():
89
+ st.markdown("### 🧠 Answer:")
90
+ st.write(answer.strip())
91
+ else:
92
+ st.warning("⚠️ Model returned an empty response.")
93
+ except Exception as e:
94
+ st.error(f"🚨 Error: {str(e)}")