aaporosh committed on
Commit f513b53 · verified · 1 Parent(s): c9bb3d9

Create app.py

Files changed (1):
  app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
+ import streamlit as st
+ import os
+ import logging
+ from io import BytesIO
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+ from transformers import pipeline  # Local text2text model used for answer generation
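+ # Note: HuggingFaceEmbeddings loads its models through the sentence-transformers
+ # package, so that package must be installed in the Space alongside langchain-community.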
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Check API token
+ if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
+     st.error("HUGGINGFACEHUB_API_TOKEN not set in secrets. Add it in Space settings.")
+     st.stop()
+
+ try:
+     # Function to process PDF
+     def process_pdf(uploaded_file):
+         try:
+             logger.info("Starting PDF processing")
+             pdf_reader = PdfReader(BytesIO(uploaded_file.getvalue()))
+             text = ""
+             for page in pdf_reader.pages:
+                 extracted = page.extract_text()
+                 if extracted:
+                     text += extracted + "\n"
+
+             if not text:
+                 raise ValueError("No text extracted from PDF.")
+
+             # Chunk text (overlap preserves context across chunk boundaries)
+             text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
+             chunks = text_splitter.split_text(text)
+
+             # Embeddings (light model)
+             embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
+
+             # Vector store
+             vector_store = FAISS.from_texts(chunks, embedding=embeddings)
+             logger.info("PDF processed successfully")
+             return vector_store
+         except Exception as e:
+             logger.error(f"PDF processing error: {str(e)}")
+             st.error(f"Error processing PDF: {str(e)}")
+             return None
+
+     # Cache the generation pipeline so the model loads once, not on every question
+     @st.cache_resource
+     def get_qa_pipeline():
+         # Light LLM for faster CPU inference
+         return pipeline("text2text-generation", model="google/flan-t5-base")
+
+     # Function to answer questions
+     def answer_question(vector_store, query):
+         try:
+             logger.info(f"Answering query: {query}")
+             qa_pipeline = get_qa_pipeline()
+
+             # Retrieve top chunks
+             docs = vector_store.similarity_search(query, k=3)
+             context = "\n".join([doc.page_content for doc in docs])
+
+             # Prompt
+             prompt = f"Use this context to answer concisely: {context}\nQuestion: {query}\nAnswer:"
+             response = qa_pipeline(prompt, max_length=256, num_return_sequences=1)[0]['generated_text']
+
+             logger.info("Answer generated")
+             return response.strip()
+         except Exception as e:
+             logger.error(f"Answer generation error: {str(e)}")
+             st.error(f"Error answering: {str(e)}")
+             return "Unable to generate answer."
+
+     # Streamlit UI with chat history
+     st.title("Smart PDF Q&A")
+     st.write("Upload a PDF and ask questions! Chat history is preserved.")
+
+     # Initialize session state
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+     if "vector_store" not in st.session_state:
+         st.session_state.vector_store = None
+
+     # PDF upload and process
+     uploaded_file = st.file_uploader("Upload PDF", type="pdf")
+     if uploaded_file:
+         if st.button("Process PDF"):
+             with st.spinner("Processing..."):
+                 vector_store = process_pdf(uploaded_file)
+             if vector_store:
+                 st.session_state.vector_store = vector_store
+                 st.success("PDF ready! Ask away.")
+                 st.session_state.messages = []  # Reset chat on new PDF
+
+     # Display chat history
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # Question input
+     if st.session_state.vector_store:
+         if prompt := st.chat_input("Ask a question:"):
+             # Add user message
+             st.session_state.messages.append({"role": "user", "content": prompt})
+             with st.chat_message("user"):
+                 st.markdown(prompt)
+
+             # Generate answer
+             with st.chat_message("assistant"):
+                 with st.spinner("Thinking..."):
+                     answer = answer_question(st.session_state.vector_store, prompt)
+                     st.markdown(answer)
+             st.session_state.messages.append({"role": "assistant", "content": answer})
+
+ except Exception as e:
+     logger.error(f"App initialization failed: {str(e)}")
+     st.error(f"Initialization error: {str(e)}. Check logs or try factory reset.")