"""Streamlit app: load PDFs (by URL or upload) and ask questions about them.

Each PDF is split into overlapping text chunks and every chunk is embedded
with DistilBERT.  For a query, the chunks most similar to the query are
placed in the prompt that is sent to the OpenAI LLM, once per loaded PDF.
"""
import http.client
import os
import urllib.error
import urllib.request
from io import BytesIO
from urllib.parse import urlparse

import PyPDF2
import streamlit as st
import torch
from langchain.chains import ConversationChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModel, AutoTokenizer

# Prefer a key that is already set in the environment; fall back to the
# placeholder only when none is present, so a real key is never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "api_key")

# Set up the title and LinkedIn link
st.title("")
st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
st.title("PDF Query Chatbot")


# Load the pre-trained model and tokenizer
@st.cache_resource
def load_model():
    """Load the DistilBERT tokenizer and model once per server process.

    Returns:
        A ``(tokenizer, model)`` tuple for ``distilbert-base-uncased``.
    """
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = AutoModel.from_pretrained('distilbert-base-uncased')
    return tokenizer, model


tokenizer, model = load_model()


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages joined together.  A page whose extraction
        yields nothing contributes an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` guards against a page for which extract_text() returns None.
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum size of a chunk, passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter.

    Returns:
        A list of chunk strings.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)


def get_embeddings(texts, batch_size=16):
    """Embed *texts* with the module-level DistilBERT model.

    The embedding of a text is the mean of its token vectors, weighted by the
    attention mask so that padding tokens do not contribute.  This makes the
    result for a text independent of the other texts in its batch.

    Args:
        texts: A string or a sequence of strings.
        batch_size: Number of texts sent through the model at once.

    Returns:
        A tensor with one row per input text (zero rows for empty input).
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # The tokenizer cannot handle an empty batch; return an empty matrix.
        return torch.empty((0, model.config.hidden_size))

    pooled = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            hidden = model(**inputs).last_hidden_state
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp avoids a division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1)
            pooled.append(token_sum / token_count)
    return torch.cat(pooled, dim=0)


def _download_pdf(url, timeout=30):
    """Download *url* and return its body as an in-memory binary stream.

    Only ``http`` and ``https`` URLs are accepted, because the URL comes from
    user input and ``urlopen`` would otherwise also open other schemes such
    as ``file://``.

    Raises:
        ValueError: If the URL scheme is not http or https.
        urllib.error.URLError: If the request fails.
    """
    scheme = urlparse(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"unsupported URL scheme {scheme!r}; expected http or https")
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return BytesIO(response.read())


def _process_pdf(name, pdf_file, store):
    """Chunk and embed one PDF and record the result in *store* under *name*.

    Returns:
        True when the PDF was stored, False when it contained no text.
    """
    chunks = chunkize_text(extract_text_from_pdf(pdf_file))
    if not chunks:
        st.warning(f"No extractable text found in `{name}`; skipping it.")
        return False
    store[name] = {'chunks': chunks, 'embeddings': get_embeddings(chunks)}
    return True


def _select_context(query_embedding, chunks, chunk_embeddings, top_k=3):
    """Return up to *top_k* chunks most similar to the query, in document order."""
    scores = torch.nn.functional.cosine_similarity(query_embedding, chunk_embeddings, dim=1)
    best = torch.topk(scores, k=min(top_k, len(chunks))).indices.tolist()
    return [chunks[index] for index in sorted(best)]


def _build_prompt(query, context_chunks):
    """Combine the retrieved chunks and the user's question into one prompt."""
    context = "\n\n".join(context_chunks)
    return (
        "Answer the question using only the document excerpts below.\n\n"
        f"Excerpts:\n{context}\n\n"
        f"Question: {query}"
    )


# Sidebar for file upload and link input
st.sidebar.title("Load PDF")
pdf_url = st.sidebar.text_input("Paste PDF link here:")
uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
submit_button = st.sidebar.button("Submit")

# Processed PDFs live in session state: Streamlit re-runs this script on every
# interaction, so a plain module-level dict would be empty again as soon as
# the user types a query.
if "pdf_chunks_embeddings" not in st.session_state:
    st.session_state["pdf_chunks_embeddings"] = {}
pdf_chunks_embeddings = st.session_state["pdf_chunks_embeddings"]

if submit_button:
    if pdf_url:
        try:
            pdf_file = _download_pdf(pdf_url)
        except (OSError, ValueError, http.client.HTTPException) as e:
            # URLError and socket timeouts are OSError subclasses.
            st.error(f"Error loading PDF from URL: {e}")
        else:
            st.write(f"Processing document from URL: {pdf_url}")
            if _process_pdf(pdf_url, pdf_file, pdf_chunks_embeddings):
                st.success("PDF processed successfully!")

    if uploaded_files:
        processed_any = False
        for uploaded_file in uploaded_files:
            pdf_name = uploaded_file.name
            st.write(f"Processing `{pdf_name}`...")
            if _process_pdf(pdf_name, uploaded_file, pdf_chunks_embeddings):
                processed_any = True
        if processed_any:
            st.success("PDF(s) processed successfully!")

# Chatbot section for querying the PDF content
st.write("### PDF Query Chatbot")
if pdf_chunks_embeddings:
    query = st.text_input("Enter your query here:")
    if query:
        llm = OpenAI()
        query_embedding = get_embeddings([query])
        # Generate a response from the chatbot based on the processed PDFs
        for pdf_name, data in pdf_chunks_embeddings.items():
            context_chunks = _select_context(query_embedding, data['chunks'], data['embeddings'])
            # A fresh chain per document keeps one document's excerpts out of
            # the conversation history used for the next document.
            chatbot = ConversationChain(llm=llm)
            response = chatbot.run(_build_prompt(query, context_chunks))
            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
else:
    st.write("No PDFs processed yet. Please submit a PDF to get started.")