import streamlit as st
from transformers import pipeline
import pdfplumber
import logging
import pandas as pd
import docx
import pickle
import os
from hashlib import sha256

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Initialize QA pipeline with a pre-trained RoBERTa QA model
@st.cache_resource
def init_qa_model():
    """Load the extractive QA pipeline once per Streamlit session.

    Returns:
        The HuggingFace question-answering pipeline, or None when loading
        fails (the error is logged and shown to the user via st.error).
    """
    try:
        logger.info("Initializing QA model...")
        qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
        logger.info("QA model loaded successfully.")
        return qa_pipeline
    except Exception as e:
        logger.error(f"Error loading QA model: {e}")
        st.error(f"Error loading the QA model: {e}")
        return None


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of an uploaded PDF.

    Pages are joined with newlines: the original concatenated page texts
    with no separator, fusing the last word of one page with the first
    word of the next. Returns a fallback message on empty or failed reads.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
            # Drop pages with no extractable text (extract_text() -> None).
            text = "\n".join(t for t in page_texts if t)
        return text or "No text found in the PDF."
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return "Error extracting text from PDF."


# Function to extract text from TXT files
def extract_text_from_txt(txt_file):
    """Decode an uploaded text file as UTF-8; non-UTF-8 input falls into the
    error path (the decode raises and is caught below)."""
    try:
        return txt_file.getvalue().decode("utf-8") or "No text found in the TXT file."
    except Exception as e:
        logger.error(f"Error extracting text from TXT file: {e}")
        return "Error extracting text from TXT file."


# Function to extract text from CSV files
def extract_text_from_csv(csv_file):
    """Render an uploaded CSV as a plain-text table (no index column)."""
    try:
        df = pd.read_csv(csv_file)
        return df.to_string(index=False) or "No text found in the CSV file."
    except Exception as e:
        logger.error(f"Error extracting text from CSV file: {e}")
        return "Error extracting text from CSV file."


# Function to extract text from DOCX files
def extract_text_from_docx(docx_file):
    """Join the text of every paragraph of an uploaded DOCX, one per line."""
    try:
        doc = docx.Document(docx_file)
        return "\n".join(para.text for para in doc.paragraphs) or "No text found in the DOCX file."
    except Exception as e:
        logger.error(f"Error extracting text from DOCX file: {e}")
        return "Error extracting text from DOCX file."
# Directory holding pickled embedding caches, keyed by context hash.
CACHE_DIR = "embeddings_cache"


# Function to create a unique cache key for the document
def generate_cache_key(text):
    """Return a stable SHA-256 hex digest of *text* (UTF-8) for cache lookup."""
    return sha256(text.encode('utf-8')).hexdigest()


# Function to cache embeddings
def cache_embeddings(embeddings, cache_key):
    """Pickle *embeddings* to embeddings_cache/<cache_key>.pkl.

    Bug fix: the original created '../embeddings_cache' but wrote to
    'embeddings_cache/…', so the write failed whenever the local directory
    did not already exist. Both paths now agree, and creation is race-safe
    via exist_ok.
    """
    try:
        os.makedirs(CACHE_DIR, exist_ok=True)
        cache_path = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
        with open(cache_path, 'wb') as f:
            pickle.dump(embeddings, f)
        logger.info(f"Embeddings cached successfully with key {cache_key}")
    except Exception as e:
        logger.error(f"Error caching embeddings: {e}")


# Function to load cached embeddings
def load_cached_embeddings(cache_key):
    """Return the embeddings pickled under *cache_key*, or None if absent
    or unreadable.

    NOTE(review): pickle.load is only safe because these files are produced
    locally by cache_embeddings; never point CACHE_DIR at untrusted data.
    """
    try:
        cache_path = os.path.join(CACHE_DIR, f"{cache_key}.pkl")
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as f:
                embeddings = pickle.load(f)
            logger.info(f"Embeddings loaded from cache with key {cache_key}")
            return embeddings
        return None
    except Exception as e:
        logger.error(f"Error loading cached embeddings: {e}")
        return None


# Main function for the app
def main():
    """Streamlit entry point: collect documents/manual context, then answer
    user questions against the combined text with the extractive QA model."""
    st.title("Adnan AI Labs QA System")
    st.markdown("Upload documents (PDF, TXT, CSV, or DOCX) or add context manually, and ask questions.")

    uploaded_files = st.file_uploader("Upload Documents", type=["pdf", "txt", "csv", "docx"], accept_multiple_files=True)
    extracted_text_box = st.text_area("Manually add extra context for answering questions", height=200)

    # Initialize QA model
    qa_pipeline = init_qa_model()
    if qa_pipeline is None:
        # Bug fix: the original proceeded and would have called None as a
        # function. init_qa_model already surfaced the error to the user.
        return

    # Extract text from each uploaded file, dispatching on MIME type.
    extractors = {
        "application/pdf": extract_text_from_pdf,
        "text/plain": extract_text_from_txt,
        "application/vnd.ms-excel": extract_text_from_csv,
        "text/csv": extract_text_from_csv,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }
    document_texts = []
    if uploaded_files:
        for uploaded_file in uploaded_files:
            extractor = extractors.get(uploaded_file.type)
            if extractor is not None:
                document_texts.append(extractor(uploaded_file))

    # Combine all extracted texts and manual context
    combined_context = "\n".join(document_texts) + "\n" + extracted_text_box

    # Check if any content is available to answer questions
    user_question = st.text_input("Ask a question:")
    if user_question and combined_context.strip():
        if st.button("Get Answer"):
            with st.spinner('Processing your question...'):
                # Generate a unique cache key for the combined context
                cache_key = generate_cache_key(combined_context)

                # Check for cached embeddings
                cached_embeddings = load_cached_embeddings(cache_key)
                if cached_embeddings is None:
                    # Bug fix: the original cached `cached_embeddings` here,
                    # which is provably None (the embedding generation line
                    # was commented out), poisoning the on-disk cache with
                    # pickled None. Skip caching until embedding generation
                    # is actually implemented.
                    logger.info("No cached embeddings found; embedding generation not yet implemented.")

                # Use the QA pipeline to answer the question
                answer = qa_pipeline(question=user_question, context=combined_context)
                if answer['answer']:
                    st.write("Answer:", answer['answer'])
                else:
                    st.warning("No suitable answer found. Please rephrase your question.")
    else:
        if not user_question:
            st.info("Please enter a question to get an answer.")
        elif not combined_context.strip():
            st.info("Please upload a document or add context manually.")

    # Display Buy Me a Coffee button
    st.markdown("""

If you find this project useful, consider buying me a coffee to support further development! ☕️

Buy Me a Coffee
""", unsafe_allow_html=True)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.critical(f"Critical error: {e}")
        st.error(f"A critical error occurred: {e}")