File size: 3,690 Bytes
e868234
6535ee7
e868234
 
 
 
2fb99d1
 
 
e868234
6535ee7
4d506bf
6535ee7
2fb99d1
93c6cd0
2fb99d1
 
e868234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fb99d1
 
 
 
 
e868234
2fb99d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e868234
2fb99d1
 
 
 
 
 
 
 
 
e868234
2fb99d1
 
 
 
 
 
 
 
 
 
 
 
e868234
2fb99d1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
from io import BytesIO

import requests
import streamlit as st
import PyPDF2
import torch
from transformers import AutoTokenizer, AutoModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationChain
from langchain.llms import OpenAI
from langchain.embeddings import HuggingFaceEmbeddings

# Set the OpenAI API key directly (or ensure it's set in the environment)
# SECURITY NOTE(review): a hard-coded API key belongs in an environment
# variable or st.secrets, never in source control. "api_key" here is a
# placeholder that must be replaced for the OpenAI LLM below to work.
os.environ["OPENAI_API_KEY"] = "api_key"

# Set up the title and LinkedIn link
# NOTE(review): st.title("") renders an empty heading — likely leftover; the
# real page title is set two lines below.
st.title("")
st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
st.title("PDF Query Chatbot")

# Load the pre-trained model and tokenizer
@st.cache_resource
def load_model():
    """Load and cache the DistilBERT tokenizer and encoder.

    Cached with st.cache_resource so the Hugging Face download and model
    construction happen once per server process, not on every rerun.

    Returns:
        tuple: (tokenizer, model) for 'distilbert-base-uncased'.
    """
    checkpoint = 'distilbert-base-uncased'
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )

tokenizer, model = load_model()

def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page in a PDF.

    Parameters:
        pdf_file: a path or binary file-like object accepted by
            PyPDF2.PdfReader (an uploaded file or an in-memory buffer).

    Returns:
        str: all page text joined together. An empty or image-only PDF
        yields '' instead of raising.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() returns None for pages with no extractable text
    # (e.g. scanned images); `or ''` prevents a TypeError when joining.
    # ''.join over a generator also avoids the quadratic `text +=` pattern.
    return ''.join(page.extract_text() or '' for page in reader.pages)

def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into overlapping chunks for embedding.

    Parameters:
        text (str): the full document text.
        chunk_size (int): maximum characters per chunk.
        chunk_overlap (int): characters shared between adjacent chunks,
            preserving context across chunk boundaries.

    Returns:
        list[str]: the chunked text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)

def get_embeddings(texts):
    """Embed texts with the module-level DistilBERT model.

    Parameters:
        texts: a string or list of strings to embed.

    Returns:
        torch.Tensor of shape (batch, hidden_dim): one mean-pooled
        embedding per input text.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool only over real tokens. A plain .mean(dim=1) would include
    # [PAD] positions, diluting the embeddings of shorter texts whenever
    # the batch is padded to a common length.
    mask = inputs['attention_mask'].unsqueeze(-1).to(outputs.last_hidden_state.dtype)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against divide-by-zero
    return summed / token_counts

# Sidebar for file upload and link input
st.sidebar.title("Load PDF")
pdf_url = st.sidebar.text_input("Paste PDF link here:")
uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
submit_button = st.sidebar.button("Submit")

# Initialize an empty dictionary for storing processed PDFs
# Maps a PDF identifier (URL or filename) -> {'chunks': list[str],
# 'embeddings': torch.Tensor}.
# NOTE(review): Streamlit re-runs this script top-to-bottom on every widget
# interaction, so this dict is reset to {} on each rerun. For the chatbot
# section below to see previously processed PDFs, this should live in
# st.session_state — verify intended behavior.
pdf_chunks_embeddings = {}

# Process inputs only on the rerun triggered by the Submit button.
if submit_button:
    # Branch 1: fetch a PDF from a pasted URL and process it in memory.
    if pdf_url:
        try:
            # NOTE(review): `requests` and `BytesIO` are referenced here but
            # not imported at the top of the file as shown — this raises
            # NameError unless the imports are added.
            response = requests.get(pdf_url)
            response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
            pdf_file = BytesIO(response.content)
            st.write(f"Processing document from URL: {pdf_url}")
            text = extract_text_from_pdf(pdf_file)
            chunks = chunkize_text(text)
            embeddings = get_embeddings(chunks)
            # Keyed by URL so multiple sources can coexist in the dict.
            pdf_chunks_embeddings[pdf_url] = {'chunks': chunks, 'embeddings': embeddings}
            st.success("PDF processed successfully!")
        except requests.exceptions.RequestException as e:
            # Covers connection errors, timeouts, and raise_for_status failures.
            st.error(f"Error loading PDF from URL: {e}")

    # Branch 2: process any directly uploaded files (may run in addition to
    # the URL branch on the same submit).
    if uploaded_files:
        for uploaded_file in uploaded_files:
            pdf_name = uploaded_file.name
            st.write(f"Processing `{pdf_name}`...")
            text = extract_text_from_pdf(uploaded_file)
            chunks = chunkize_text(text)
            embeddings = get_embeddings(chunks)
            pdf_chunks_embeddings[pdf_name] = {'chunks': chunks, 'embeddings': embeddings}
        st.success("PDF(s) processed successfully!")

# Chatbot section for querying the PDF content
st.write("### PDF Query Chatbot")
if pdf_chunks_embeddings:
    # NOTE(review): because pdf_chunks_embeddings is rebuilt empty on every
    # rerun, this branch is only reachable on the same rerun that processed a
    # PDF; typing a query triggers a new rerun where the dict is empty again.
    # Persisting the dict in st.session_state would fix this.
    # NOTE(review): langchain's ConversationChain does not accept an
    # `embedding_model` keyword and has no `add_documents` method — this
    # looks like it was intended to be a retrieval chain (e.g. RetrievalQA
    # over a vector store built from the chunks); verify against the
    # installed langchain version.
    chatbot = ConversationChain(llm=OpenAI(), embedding_model=HuggingFaceEmbeddings())
    
    query = st.text_input("Enter your query here:")
    if query:
        # Generate a response from the chatbot based on the processed PDFs
        for pdf_name, data in pdf_chunks_embeddings.items():
            chatbot.add_documents(data['chunks'])
            response = chatbot.run(query)
            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
else:
    st.write("No PDFs processed yet. Please submit a PDF to get started.")