datascientist22 committed (verified)
Commit 2fb99d1 · Parent(s): 47b3b73

Update app.py

Files changed (1):
  1. app.py  +51 -34
app.py CHANGED
@@ -4,10 +4,14 @@ import PyPDF2
 import torch
 from transformers import AutoTokenizer, AutoModel
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains import ConversationChain
+from langchain.llms import OpenAI
+from langchain.embeddings import HuggingFaceEmbeddings
 
-# Set up the title
-st.title("Engr. Hamesh Raj's PDF Chunking & Embedding Viewer")
-st.markdown("[LinkedIn](https://www.linkedin.com/in/datascientisthameshraj/)")
+# Set up the title and LinkedIn link
+st.markdown("### Engr. Hamesh Raj")
+st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
+st.title("PDF Query Chatbot")
 
 # Load the pre-trained model and tokenizer
 @st.cache_resource
@@ -40,38 +44,51 @@ def get_embeddings(texts):
     embeddings = outputs.last_hidden_state.mean(dim=1)
     return embeddings
 
-# Sidebar for file upload
-st.sidebar.title("Upload PDF")
-uploaded_files = st.sidebar.file_uploader("Choose a PDF file(s)", type="pdf", accept_multiple_files=True)
+# Sidebar for file upload and link input
+st.sidebar.title("Load PDF")
+pdf_url = st.sidebar.text_input("Paste PDF link here:")
+uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
+submit_button = st.sidebar.button("Submit")
 
-if uploaded_files:
-    pdf_chunks_embeddings = {}
-
-    for uploaded_file in uploaded_files:
-        pdf_name = uploaded_file.name
-        st.write(f"### Processing `{pdf_name}`...")
-
-        # Extract text from the uploaded PDF
-        text = extract_text_from_pdf(uploaded_file)
-
-        # Chunkize the extracted text
-        chunks = chunkize_text(text)
-
-        # Generate embeddings for each chunk
-        embeddings = get_embeddings(chunks)
-
-        # Store the chunks and embeddings
-        pdf_chunks_embeddings[pdf_name] = {
-            'chunks': chunks,
-            'embeddings': embeddings
-        }
+# Initialize an empty dictionary for storing processed PDFs
+pdf_chunks_embeddings = {}
+
+if submit_button:
+    if pdf_url:
+        try:
+            response = requests.get(pdf_url)
+            response.raise_for_status()
+            pdf_file = BytesIO(response.content)
+            st.write(f"Processing document from URL: {pdf_url}")
+            text = extract_text_from_pdf(pdf_file)
+            chunks = chunkize_text(text)
+            embeddings = get_embeddings(chunks)
+            pdf_chunks_embeddings[pdf_url] = {'chunks': chunks, 'embeddings': embeddings}
+            st.success("PDF processed successfully!")
+        except requests.exceptions.RequestException as e:
+            st.error(f"Error loading PDF from URL: {e}")
 
-        # Display chunks and embeddings
-        st.write(f"#### Chunks and Embeddings for `{pdf_name}`")
-        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-            st.write(f"**Chunk {i+1}:**\n{chunk}")
-            st.write(f"**Embedding {i+1}:**\n{embedding}\n{'-'*50}")
+    if uploaded_files:
+        for uploaded_file in uploaded_files:
+            pdf_name = uploaded_file.name
+            st.write(f"Processing `{pdf_name}`...")
+            text = extract_text_from_pdf(uploaded_file)
+            chunks = chunkize_text(text)
+            embeddings = get_embeddings(chunks)
+            pdf_chunks_embeddings[pdf_name] = {'chunks': chunks, 'embeddings': embeddings}
+        st.success("PDF(s) processed successfully!")
 
-    st.success("Processing completed!")
+# Chatbot section for querying the PDF content
+st.write("### PDF Query Chatbot")
+if pdf_chunks_embeddings:
+    chatbot = ConversationChain(llm=OpenAI(), embedding_model=HuggingFaceEmbeddings())
+
+    query = st.text_input("Enter your query here:")
+    if query:
+        # Generate a response from the chatbot based on the processed PDFs
+        for pdf_name, data in pdf_chunks_embeddings.items():
+            chatbot.add_documents(data['chunks'])
+            response = chatbot.run(query)
+            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
 else:
-    st.write("Upload a PDF file to get started.")
+    st.write("No PDFs processed yet. Please submit a PDF to get started.")