Spaces:
Sleeping
Sleeping
File size: 3,690 Bytes
e868234 6535ee7 e868234 2fb99d1 e868234 6535ee7 4d506bf 6535ee7 2fb99d1 93c6cd0 2fb99d1 e868234 2fb99d1 e868234 2fb99d1 e868234 2fb99d1 e868234 2fb99d1 e868234 2fb99d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import os
from io import BytesIO

import PyPDF2
import requests
import streamlit as st
import torch
from langchain.chains import ConversationChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
# Fall back to a placeholder OpenAI API key only when none is configured.
# Using setdefault (instead of plain assignment) means a real key supplied
# through the environment is never overwritten by the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "api_key")

# Set up the title and LinkedIn link
st.title("")
st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
st.title("PDF Query Chatbot")
# Load the pre-trained model and tokenizer
@st.cache_resource
def load_model(model_name='distilbert-base-uncased'):
    """Load the tokenizer and encoder used to embed PDF chunks.

    The function is wrapped in ``st.cache_resource`` so the returned objects
    are created once per distinct ``model_name`` and reused afterwards.

    Args:
        model_name: Checkpoint identifier passed to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``. Defaults to the checkpoint the app
            has always used.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModel.from_pretrained(model_name)
    return loaded_tokenizer, loaded_model


# Module-level handles used by get_embeddings().
tokenizer, model = load_model()
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts; this app passes an
            uploaded file object or an in-memory byte stream.

    Returns:
        A single string holding the text of all pages with no separator
        between pages. Empty when no page yields any text.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # `or ''` keeps the join safe should a page yield no text object at all
    # (defensive; pages without extractable text then simply contribute nothing).
    return ''.join(page.extract_text() or '' for page in reader.pages)
def chunkize_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (the chunks, in order).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_text(text)
def get_embeddings(texts, batch_size=32):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    Each text is encoded and its token vectors are averaged into a single
    vector. The average is weighted by the attention mask so padding tokens
    do not contribute; as a result a text's embedding does not depend on
    which other texts it happens to be batched (and padded) with.

    Args:
        texts: A string or a sequence of strings.
        batch_size: Maximum number of texts sent through the model in one
            forward pass, bounding peak memory for long documents.

    Returns:
        A tensor with one row per input text. For empty input, a tensor with
        zero rows and ``model.config.hidden_size`` columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is treated as a one-element batch, matching what the
    # tokenizer itself would do; slicing it below would split it by character.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    if not texts:
        # The tokenizer cannot encode an empty batch, so short-circuit.
        # NOTE(review): relies on the model config exposing `hidden_size`.
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            hidden = model(**inputs).last_hidden_state
            # Expand the mask so it broadcasts over the hidden dimension.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(token_sum / token_count)
    return torch.cat(pooled_batches, dim=0)
# Sidebar for file upload and link input
st.sidebar.title("Load PDF")
pdf_url = st.sidebar.text_input("Paste PDF link here:")
uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
submit_button = st.sidebar.button("Submit")

# Streamlit re-executes this script on every widget interaction, and the
# Submit button is only True during the run triggered by the click. The
# processed PDFs are therefore kept in session state; a plain module-level
# dict would be empty again by the time the user types a query.
if "pdf_chunks_embeddings" not in st.session_state:
    st.session_state["pdf_chunks_embeddings"] = {}
pdf_chunks_embeddings = st.session_state["pdf_chunks_embeddings"]

# Seconds to wait for the remote server before giving up on a PDF URL.
PDF_DOWNLOAD_TIMEOUT = 30


def _index_pdf(source_name, pdf_file):
    """Extract, chunk and embed one PDF and store it under ``source_name``.

    Reports problems to the user via Streamlit instead of raising.

    Returns:
        True when the PDF was stored, False when it could not be parsed or
        contained no extractable text.
    """
    try:
        text = extract_text_from_pdf(pdf_file)
    except PyPDF2.errors.PdfReadError as e:
        st.error(f"Could not read `{source_name}` as a PDF: {e}")
        return False

    chunks = chunkize_text(text)
    if not chunks:
        # Nothing to embed (e.g. the PDF has no text layer).
        st.warning(f"No extractable text found in `{source_name}`.")
        return False

    pdf_chunks_embeddings[source_name] = {
        'chunks': chunks,
        'embeddings': get_embeddings(chunks),
    }
    return True


if submit_button:
    if pdf_url:
        try:
            # Without a timeout an unresponsive host would hang the app.
            response = requests.get(pdf_url, timeout=PDF_DOWNLOAD_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            st.error(f"Error loading PDF from URL: {e}")
        else:
            st.write(f"Processing document from URL: {pdf_url}")
            if _index_pdf(pdf_url, BytesIO(response.content)):
                st.success("PDF processed successfully!")

    if uploaded_files:
        processed_count = 0
        for uploaded_file in uploaded_files:
            pdf_name = uploaded_file.name
            st.write(f"Processing `{pdf_name}`...")
            if _index_pdf(pdf_name, uploaded_file):
                processed_count += 1
        # Only claim success when at least one upload was actually stored.
        if processed_count:
            st.success("PDF(s) processed successfully!")
# Chatbot section for querying the PDF content
st.write("### PDF Query Chatbot")

# Number of most-similar chunks from each PDF that are shown to the LLM.
TOP_K_CHUNKS = 3

if pdf_chunks_embeddings:
    query = st.text_input("Enter your query here:")
    if query:
        # ConversationChain accepts neither an `embedding_model` argument nor
        # an `add_documents` method, so retrieval is done here with the
        # embeddings computed at ingestion time and the LLM is called
        # directly with the retrieved context.
        llm = OpenAI()
        query_embedding = get_embeddings([query])

        # Generate a response from the chatbot based on the processed PDFs
        for pdf_name, data in pdf_chunks_embeddings.items():
            chunks = data['chunks']
            if not chunks:
                continue

            # One similarity score per chunk; the single query row is
            # broadcast against all chunk rows.
            similarity = torch.nn.functional.cosine_similarity(
                data['embeddings'], query_embedding
            )
            top_k = min(TOP_K_CHUNKS, len(chunks))
            # Sort the selected indices so context keeps document order.
            best_indices = sorted(torch.topk(similarity, k=top_k).indices.tolist())
            context = "\n\n".join(chunks[i] for i in best_indices)

            prompt = (
                "Answer the question using only the context below. "
                "If the answer is not in the context, say you don't know.\n\n"
                f"Context:\n{context}\n\n"
                f"Question: {query}\n"
                "Answer:"
            )
            response = llm.invoke(prompt)
            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
else:
    st.write("No PDFs processed yet. Please submit a PDF to get started.")