tokensandcharms committed
Commit 7211b51 · 1 Parent(s): bbb7270

Enhance .gitignore and improve app.py functionality: add more file types to .gitignore, manage upload progress in app.py with Streamlit session state, improve the UI layout, and add documentation for user guidance.

Files changed (6)
  1. .gitignore +9 -1
  2. app.py +44 -63
  3. libgen_scraper.py +482 -0
  4. links.json +17 -0
  5. pages/1_Upload.py +34 -0
  6. pages/2_Chat.py +42 -0
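
The upload-progress handling mentioned in the commit message follows roughly this Streamlit pattern (a minimal, hypothetical sketch — the function name `long_upload` and the dummy item loop are illustrative, not code from this repo): the page creates a progress bar, parks it in `st.session_state`, and the worker function updates it if present.

import time
import streamlit as st

def long_upload(items):
    # Reuse the progress bar the page stored in session state, if any.
    bar = st.session_state.get("upload_progress")
    for i, item in enumerate(items, start=1):
        time.sleep(0.05)  # stand-in for real chunk processing
        if bar is not None:
            bar.progress(i / len(items))

if st.button("Run"):
    st.session_state.upload_progress = st.progress(0)
    long_upload(list(range(20)))
    st.session_state.upload_progress.empty()
    del st.session_state["upload_progress"]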
.gitignore CHANGED
@@ -1 +1,9 @@
-.env
+.env
+downloads/
+pdfs/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz
app.py CHANGED
@@ -57,17 +57,33 @@ def process_upload(upload_type, file_or_link, file_name=None):
     chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]

     vectors = []
+    total_chunks = len(chunks)
+
+    # Use st.session_state to manage progress bar across function calls if needed on the page
+    if 'upload_progress' in st.session_state and hasattr(st.session_state.upload_progress, 'progress'):
+        progress_bar = st.session_state.upload_progress
+    else:
+        # If called outside the context of the upload page button press, handle appropriately
+        # For now, let's assume it's called from the Upload page context where progress is set
+        pass
+
+
     with ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]
+        futures = {executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name): i for i, chunk in enumerate(chunks)}

+        processed_count = 0
         for future in as_completed(futures):
             vectors.append(future.result())
-            # Update progress
-            progress = len(vectors) / len(chunks)
-            st.session_state.upload_progress.progress(progress)
+            processed_count += 1
+            # Update progress if progress_bar exists
+            if 'progress_bar' in locals() and progress_bar:
+                current_progress = processed_count / total_chunks
+                progress_bar.progress(current_progress)
+

     print(f"Generated {len(vectors)} vectors")

+    # Consider batching upserts for very large documents
     index.upsert(vectors=vectors)
     print("Vectors upserted to Pinecone")

@@ -122,7 +138,6 @@ def chat_with_ai(message):
             "doc_name": result['metadata']['doc_name'],
             "chunk_index": result['metadata']['chunk_index'],
             "text": result['metadata']['text'],
-            "type": result['metadata']['type']
         }
         for result in results
     ]
@@ -135,63 +150,29 @@ def clear_database():
     print("Database cleared")
     return "Database cleared successfully."

-# Streamlit UI
-st.set_page_config(layout="wide")
-st.title("Upload and Chat with PDFs")
-
-# Create three columns
-col1, col2, col3 = st.columns([1, 1, 1])
-
-with col1:
-    st.header("Upload")
-
-    # PDF upload
-    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
-
-    if st.button("Process All"):
-        st.session_state.upload_progress = st.progress(0)
-        with st.spinner("Processing uploads..."):
-            results = []
-            if uploaded_file:
-                pdf_result = process_upload("PDF", uploaded_file, uploaded_file.name)
-                results.append(pdf_result)
-
-            if results:
-                for result in results:
-                    st.success(result)
-            else:
-                st.warning("No content uploaded. Please provide at least one input.")
-            st.session_state.upload_progress.empty()
-
-    if st.button("Clear Database"):
-        result = clear_database()
-        st.success(result)
-
-with col2:
-    st.header("Chat")
-    user_input = st.text_input("Ask a question about the uploaded content:")
-    if st.button("Send"):
-        if user_input:
-            print(f"Sending user input: {user_input}")
-            st.session_state.chat_progress = st.progress(0)
-            response, sources = chat_with_ai(user_input)
-            st.session_state.chat_progress.progress(1.0)
-            st.markdown("**You:** " + user_input)
-            st.markdown("**AI:** " + response)
-
-            # Store sources in session state for display in col3
-            st.session_state.sources = sources
-            st.session_state.chat_progress.empty()
-        else:
-            print("Empty user input")
-            st.warning("Please enter a question.")
-
-with col3:
-    st.header("Source Chunks")
-    if 'sources' in st.session_state and st.session_state.sources:
-        for i, source in enumerate(st.session_state.sources, 1):
-            with st.expander(f"Source {i} - {source['type']} ({source['doc_name']})"):
-                st.markdown(f"**Chunk Index:** {source['chunk_index']}")
-                st.text(source['text'])
-    else:
-        st.info("Ask a question to see source chunks here.")
+# Streamlit Main Page
+st.set_page_config(
+    page_title="RAG Chat Home",
+    page_icon="👋",
+)
+
+st.title("Welcome to RAG Chat! 👋")
+
+st.sidebar.success("Select a page above.")
+
+st.markdown(
+    """
+    This application allows you to upload PDF documents and chat with an AI
+    about their content.
+
+    **👈 Select a page from the sidebar** to get started:
+    - **Upload:** Add your PDF documents to the knowledge base.
+    - **Chat:** Ask questions about the documents you've uploaded.
+
+    The AI uses Retrieval-Augmented Generation (RAG) to find relevant sections
+    from your documents and provide informed answers.
+    """
+)
+
+# No UI elements here, just the core logic and initialization above.
+# The pages in the 'pages' directory will handle the UI.
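
On the `# Consider batching upserts for very large documents` note in the hunk above: a helper along these lines would keep each Pinecone request small (a sketch only; it mirrors the `batch_size = 100` loop that `libgen_scraper.py` below already uses, and assumes `index` is the Pinecone index and `vectors` the list built in `process_upload`).

def upsert_in_batches(index, vectors, batch_size=100):
    """Upsert vectors to Pinecone in fixed-size batches instead of one large call."""
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])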
libgen_scraper.py ADDED
@@ -0,0 +1,482 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import os
+import time
+import subprocess
+import uuid
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import urlparse, unquote, parse_qs
+from lxml import html  # Import lxml
+from dotenv import load_dotenv
+from pypdf import PdfReader  # Use pypdf instead of PyPDF2
+from openai import OpenAI
+from pinecone import Pinecone
+# cssselect is used by lxml's .cssselect() method, ensure it's installed
+
+# --- Initialization ---
+load_dotenv()
+
+# Set up OpenAI client
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# Set up Pinecone
+pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+
+index_name = "main"  # Your index name
+try:
+    index = pc.Index(index_name)
+    print(f"Connected to Pinecone index: {index_name}")
+    # Optional: Check index stats
+    # print(index.describe_index_stats())
+except Exception as e:
+    print(f"Error connecting to Pinecone index '{index_name}': {e}")
+    print("Please ensure the index exists and API keys are correct.")
+    exit()
+
+# URL provided by the user
+url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1"
+
+# Headers mimicking a browser request (Removed Host)
+base_headers = {
+    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'accept-encoding': 'gzip, deflate, br, zstd',
+    'accept-language': 'en-US,en;q=0.9',
+    'connection': 'keep-alive',
+    'dnt': '1',
+    'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"macOS"',
+    'sec-fetch-dest': 'document',
+    'sec-fetch-mode': 'navigate',
+    'sec-fetch-site': 'same-origin',  # May need adjustment if navigating between domains
+    'sec-fetch-user': '?1',
+    'upgrade-insecure-requests': '1',
+    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+}
+
+print(f"Attempting to fetch: {url}")
+
+try:
+    response = requests.get(url, headers=base_headers)
+    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
+    print("Successfully fetched page content.")
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Find the main table containing search results (often class='c' in libgen)
+    # Inspecting the source, the table seems to be the 3rd table on the page,
+    # or more reliably, the one with width="100%" and rules="rows"
+    results_table = soup.find('table', {'rules': 'rows', 'width': '100%'})
+
+    book_links = []
+    base_url = "https://libgen.rs/"
+
+    if results_table:
+        print("Found results table. Processing rows...")
+        rows = results_table.find_all('tr')
+        print(f"Found {len(rows) - 1} potential book entries (excluding header).")
+
+        # Skip the header row (index 0)
+        for row in rows[1:]:
+            cells = row.find_all('td')
+            # Ensure the row has enough cells (at least 3 for the link)
+            if len(cells) > 2:
+                link_cell = cells[2]  # The third column usually contains the title link
+                link_tag = link_cell.find('a')
+                if link_tag and link_tag.has_attr('href'):
+                    relative_link = link_tag['href']
+                    # Ensure it's a book link (often starts with 'book/')
+                    if relative_link.startswith('book/'):
+                        full_link = base_url + relative_link
+                        book_links.append(full_link)
+            else:
+                print("Skipping row with insufficient cells.")
+
+        print(f"Extracted {len(book_links)} book links.")
+
+        # Save the links to a JSON file
+        output_filename = 'links.json'
+        with open(output_filename, 'w') as f:
+            json.dump(book_links, f, indent=4)
+        print(f"Successfully saved links to {output_filename}")
+
+    else:
+        print("Could not find the results table. Check the HTML structure or selectors.")
+
+except requests.exceptions.RequestException as e:
+    print(f"Error fetching URL: {e}")
+except Exception as e:
+    print(f"An error occurred: {e}")
+
+# Known download host patterns (check hostname ENDS WITH these)
+DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc']
+
+def get_embedding(text):
+    try:
+        response = client.embeddings.create(input=text, model="text-embedding-3-large")
+        return response.data[0].embedding
+    except Exception as e:
+        print(f"Error getting embedding: {e}")
+        return None
+
+def convert_djvu_to_pdf(djvu_filepath):
+    """Converts a DJVU file to PDF using djvu2pdf command line tool."""
+    pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf"
+    command = ["djvu2pdf", djvu_filepath, pdf_filepath]
+    print(f"Converting {os.path.basename(djvu_filepath)} to PDF...")
+    try:
+        result = subprocess.run(command, check=True, capture_output=True, text=True)
+        print(f"Successfully converted to {os.path.basename(pdf_filepath)}")
+        return pdf_filepath
+    except FileNotFoundError:
+        print(f"Error: 'djvu2pdf' command not found. Please install djvulibre.")
+        return None
+    except subprocess.CalledProcessError as e:
+        print(f"Error during conversion: {e}")
+        print(f"Stderr: {e.stderr}")
+        return None
+    except Exception as e:
+        print(f"An unexpected error occurred during conversion: {e}")
+        return None
+
+def process_and_upsert_pdf(pdf_filepath, original_filename):
+    """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone."""
+    print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}")
+    doc_id = str(uuid.uuid4())
+    try:
+        reader = PdfReader(pdf_filepath)
+        text = ""
+        for page in reader.pages:
+            page_text = page.extract_text()
+            if page_text:  # Add text only if extraction succeeded
+                text += page_text + "\n"
+
+        if not text:
+            print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}")
+            return f"Processed (No Text): {original_filename}"
+
+        content_length = len(text)
+        print(f"Extracted text length: {content_length}")
+
+        # Simple chunking (adjust size as needed)
+        chunk_size = 2000
+        chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
+        print(f"Split into {len(chunks)} chunks.")
+
+        vectors = []
+        for i, chunk in enumerate(chunks):
+            embedding = get_embedding(chunk)
+            if embedding:
+                vectors.append((
+                    f"{doc_id}_{i}",
+                    embedding,
+                    {
+                        "text": chunk,
+                        "type": "PDF",
+                        "doc_id": doc_id,
+                        "doc_name": original_filename,  # Store the original filename
+                        "chunk_index": i
+                    }
+                ))
+            else:
+                print(f"Skipping chunk {i} due to embedding error.")
+
+        if not vectors:
+            print("No vectors generated for upsert.")
+            return f"Processed (No Vectors): {original_filename}"
+
+        # Upsert in batches if necessary (Pinecone recommends batching)
+        batch_size = 100
+        for i in range(0, len(vectors), batch_size):
+            batch = vectors[i:i+batch_size]
+            try:
+                index.upsert(vectors=batch)
+                print(f"Upserted batch {i//batch_size + 1} to Pinecone.")
+            except Exception as e:
+                print(f"Error upserting batch to Pinecone: {e}")
+                # Decide if you want to stop or continue with other batches
+
+        print(f"Successfully processed and upserted {original_filename} to Pinecone.")
+        return f"Upserted: {original_filename}"
+
+    except Exception as e:
+        print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}")
+        return f"Error (Processing): {original_filename}"
+
+def get_final_download_link(intermediate_page_url):
+    """Visits an intermediate page (e.g., books.ms/main/HASH)
+    and scrapes the final download link using the selector #download a.
+    """
+    print(f"Fetching final link from intermediate page: {intermediate_page_url}")
+    try:
+        # Update Host header for the specific request
+        request_headers = base_headers.copy()
+        parsed_url = urlparse(intermediate_page_url)
+        if parsed_url.netloc:
+            request_headers['Host'] = parsed_url.netloc
+
+        response = requests.get(intermediate_page_url, headers=request_headers, timeout=20)
+        response.raise_for_status()
+
+        tree = html.fromstring(response.content)
+        found_link = None
+
+        # --- Attempt the simple, effective selector ---
+        css_selector = "#download a"  # Target first anchor within #download
+        print(f"Attempting CSS selector: {css_selector}")
+        link_elements = tree.cssselect(css_selector)
+        if link_elements:
+            link_tag = link_elements[0]  # Take the first one found
+            href = link_tag.get('href')
+            if href:
+                parsed_href = urlparse(href)
+                # Validation:
+                if (parsed_href.scheme and parsed_href.netloc and
+                        '/main/' in parsed_href.path and
+                        any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)):
+                    print(f"Found final download link via CSS selector: {href}")
+                    found_link = href
+                else:
+                    # If the first link doesn't validate, maybe log it but don't proceed
+                    print(f"Selector '{css_selector}' found link, but failed validation: {href}")
+            else:
+                print(f"Selector '{css_selector}' found link tag, but it has no href.")
+        else:
+            print(f"CSS selector {css_selector} did not find any elements.")
+
+        # --- Return result ---
+        if found_link:
+            return found_link
+        else:
+            # If no valid link was found after checking the first #download a
+            print(f"Could not find valid download link using CSS selector '{css_selector}' on {intermediate_page_url}")
+            return None
+
+    except requests.exceptions.Timeout:
+        print(f"Timeout error fetching intermediate page {intermediate_page_url}")
+        return None
+    except requests.exceptions.RequestException as e:
+        if e.response is not None:
+            print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}")
+        else:
+            print(f"Error fetching intermediate page {intermediate_page_url}: {e}")
+        return None
+    except Exception as e:
+        print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}")
+        return None
+
+def download_file_and_process(download_url, download_dir):
+    """Downloads file, converts DJVU to PDF if needed, and triggers Pinecone upsert.
+    Returns a status message.
+    """
+    if not download_url:
+        return "Skipped: No download URL provided."
+
+    processing_status = ""  # To store the outcome of PDF processing/upserting
+    original_filename = "Unknown"
+    final_filepath = None  # Path to the file to be processed (PDF)
+    djvu_filepath_to_delete = None
+
+    try:
+        # --- Downloading ---
+        parsed_url = urlparse(download_url)
+        path_parts = [part for part in parsed_url.path.split('/') if part]
+        filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}"
+        original_filename = filename_base  # Keep original name for metadata
+
+        print(f"Attempting to download: {download_url}")
+        response = requests.get(download_url, headers=base_headers, stream=True, timeout=120)  # Increased timeout
+        response.raise_for_status()
+
+        # --- Determine File Path and Extension ---
+        content_disposition = response.headers.get('Content-Disposition')
+        extension = '.pdf'  # Default
+        if content_disposition:
+            if 'filename=' in content_disposition:
+                disp_filename = content_disposition.split('filename=')[-1].strip('"\'')
+                if '.' in disp_filename:
+                    extension = os.path.splitext(disp_filename)[1].lower()
+        else:
+            # Check extension from URL path if no content-disposition
+            if '.' in filename_base:
+                url_ext = os.path.splitext(filename_base)[1].lower()
+                if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']:  # Add other relevant types if needed
+                    extension = url_ext
+
+        filename = filename_base
+        if not filename.lower().endswith(extension):
+            filename += extension
+
+        download_filepath = os.path.join(download_dir, filename)
+
+        # --- Save File ---
+        if os.path.exists(download_filepath):
+            print(f"File already exists: {filename}")
+            # Decide if we should still process it for Pinecone
+            if download_filepath.lower().endswith('.pdf'):
+                final_filepath = download_filepath
+            elif download_filepath.lower().endswith('.djvu'):
+                # Check if corresponding PDF exists from previous run
+                pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf"
+                if os.path.exists(pdf_equiv):
+                    print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}")
+                    final_filepath = pdf_equiv
+                else:
+                    # Convert existing DJVU
+                    print("DJVU exists but PDF doesn't. Converting...")
+                    converted_pdf = convert_djvu_to_pdf(download_filepath)
+                    if converted_pdf:
+                        final_filepath = converted_pdf
+            else:
+                return f"Skipped (Exists, Non-PDF/DJVU): {filename}"
+        else:
+            print(f"Downloading to: {download_filepath}")
+            with open(download_filepath, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            print(f"Successfully downloaded: {filename}")
+
+            # --- Post-Download Processing ---
+            if download_filepath.lower().endswith('.pdf'):
+                final_filepath = download_filepath
+            elif download_filepath.lower().endswith('.djvu'):
+                converted_pdf = convert_djvu_to_pdf(download_filepath)
+                if converted_pdf:
+                    final_filepath = converted_pdf
+                    djvu_filepath_to_delete = download_filepath  # Mark original for deletion
+            else:
+                print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.")
+                return f"Success (DL Only): {filename}"
+
+        # --- Pinecone Upsert Trigger ---
+        if final_filepath and os.path.exists(final_filepath):
+            processing_status = process_and_upsert_pdf(final_filepath, original_filename)
+
+            # Optional: Delete original DJVU after successful conversion and processing
+            if djvu_filepath_to_delete and 'Error' not in processing_status:
+                try:
+                    os.remove(djvu_filepath_to_delete)
+                    print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}")
+                except Exception as e:
+                    print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}")

+        else:
+            processing_status = "Skipped Upsert (No PDF)"
+
+        return f"Download OK. Status: {processing_status}"
+
+    except requests.exceptions.Timeout:
+        print(f"Timeout error downloading {download_url}")
+        return f"Error (Timeout): {original_filename}"
+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading {download_url}: {e}")
+        return f"Error (RequestException): {original_filename}"
+    except Exception as e:
+        print(f"An unexpected error occurred during download/process of {original_filename}: {e}")
+        return f"Error (Unexpected): {original_filename}"
+
+# --- Main Execution ---
+
+input_filename = 'links.json'
+download_dir = 'downloads'
+max_workers = 3  # Reduce workers slightly due to processing load
+
+# Create download directory if it doesn't exist
+if not os.path.exists(download_dir):
+    os.makedirs(download_dir)
+    print(f"Created directory: {download_dir}")
+
+# --- Read original libgen.rs book page URLs ---
+try:
+    with open(input_filename, 'r') as f:
+        # Load all URLs as originally intended
+        libgen_book_page_urls = json.load(f)
+    print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}")
+
+except FileNotFoundError:
+    print(f"Error: {input_filename} not found. Please run the initial link scraping part first.")
+    exit()
+except json.JSONDecodeError:
+    print(f"Error: Could not decode JSON from {input_filename}.")
+    exit()
+
+if not libgen_book_page_urls:
+    print("No book page URLs found in the file. Exiting.")
+    exit()
+
+# --- Stage 1: Construct intermediate URLs and get final download links ---
+final_download_links = []
+intermediate_urls_to_try = []
+
+print("\n--- Constructing Intermediate URLs ---")
+# Process all URLs again
+for url in libgen_book_page_urls:
+    try:
+        parsed_libgen_url = urlparse(url)
+        query_params = parse_qs(parsed_libgen_url.query)
+        md5_list = query_params.get('md5')
+        if md5_list:
+            md5 = md5_list[0]
+            intermediate_url = f"http://books.ms/main/{md5}"
+            intermediate_urls_to_try.append(intermediate_url)
+            # Maybe remove verbose printing for full run
+            # print(f"Constructed: {intermediate_url} from {url}")
+        else:
+            print(f"Could not extract MD5 from {url}")
+    except Exception as e:
+        print(f"Error processing libgen URL {url}: {e}")
+
+print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---")
+if intermediate_urls_to_try:
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Use the renamed function get_final_download_link
+        future_to_url = {executor.submit(get_final_download_link, intermediate_url): intermediate_url for intermediate_url in intermediate_urls_to_try}
+        for future in as_completed(future_to_url):
+            intermediate_url = future_to_url[future]
+            try:
+                result = future.result()
+                if result:
+                    final_download_links.append(result)
+            except Exception as exc:
+                print(f'Fetching final download link for {intermediate_url} generated an exception: {exc}')
+
+print(f"\nFound {len(final_download_links)} final download links.")
+
+# --- Stage 2: Download, Convert, and Process files concurrently ---
+print("\n--- Downloading, Converting, Processing Files ---")
+download_process_results = []
+if final_download_links:
+    # Use the new function that handles download, conversion, and upsert trigger
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_download = {executor.submit(download_file_and_process, link, download_dir): link for link in final_download_links}
+        for future in as_completed(future_to_download):
+            link = future_to_download[future]
+            try:
+                result_message = future.result()
+                download_process_results.append(result_message)
+                print(f"Overall Result for {link}: {result_message}")
+            except Exception as exc:
+                print(f'Download/Processing {link} generated an exception: {exc}')
+                download_process_results.append(f"Error (Exception): {link}")
+else:
+    print("No final download links found, skipping download/process stage.")
+
+# --- Final Summary ---
+print("\n--- Final Summary ---")
+# Analyze the results strings for a more detailed summary (optional)
+success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted'))
+success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Success (DL Only)'))
+success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)'))
+skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped'))
+error_count = len(download_process_results) - success_upsert_count - success_dl_only_count - success_no_text_count - skipped_dl_count
+
+print(f"Total final links attempted: {len(final_download_links)}")
+print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}")
+print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}")
+print(f"Skipped Download (e.g., already exists): {skipped_dl_count}")
+print(f"Errors (Download/Conversion/Process/Upsert): {error_count}")
+
+print(f"\nDownloads attempted in the '{download_dir}' directory.")
+
+# --- End Main Execution ---
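
After a scraping run, the commented-out `describe_index_stats()` call near the top of the script can double as a quick sanity check that vectors actually landed in the index (a small sketch reusing the same client setup; run it separately or append it to the script):

import os
from dotenv import load_dotenv
from pinecone import Pinecone

load_dotenv()
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("main")
print(index.describe_index_stats())  # e.g. dimension and total vector count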
links.json ADDED
@@ -0,0 +1,17 @@
+[
+    "https://libgen.rs/book/index.php?md5=372E34D136FBF39DCE00460D9E8F1F52",
+    "https://libgen.rs/book/index.php?md5=EB252D785B9D104EC533CF5326D89DEF",
+    "https://libgen.rs/book/index.php?md5=824483B0D731CBB3221C722455E3CAC8",
+    "https://libgen.rs/book/index.php?md5=CB207C615844574D6384B210090C46D0",
+    "https://libgen.rs/book/index.php?md5=270015C07518D5C2293CD613D3B75F9D",
+    "https://libgen.rs/book/index.php?md5=560ECD8FBD2BA7D757B5A51DA042B50F",
+    "https://libgen.rs/book/index.php?md5=3E9400AA7C4C99881ED7EDA013A27C0E",
+    "https://libgen.rs/book/index.php?md5=8558FC34C2C407926FF051201EFEDD50",
+    "https://libgen.rs/book/index.php?md5=1D4206FF2F3B3EB3D7E15A95D31E5B18",
+    "https://libgen.rs/book/index.php?md5=C7202A81EFF198F776DB05CB8EFBA0BD",
+    "https://libgen.rs/book/index.php?md5=4AEE828F0332D2FBC34210AA79602FB3",
+    "https://libgen.rs/book/index.php?md5=3EC35852AD8E56F72EF977EF636070DC",
+    "https://libgen.rs/book/index.php?md5=C5A01443018DD3AFA28CBF8CF0AF8CE8",
+    "https://libgen.rs/book/index.php?md5=0D5CB47FF53DD63D764ACEE476B3C2AB",
+    "https://libgen.rs/book/index.php?md5=E4D3667D48E2DF2FE6491EE88FCFBB79"
+]
pages/1_Upload.py ADDED
@@ -0,0 +1,34 @@
+import streamlit as st
+from app import process_upload, clear_database
+
+st.title("Upload PDFs")
+
+st.header("Upload")
+
+# PDF upload
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+
+if st.button("Process All"):
+    if 'upload_progress' not in st.session_state:
+        st.session_state.upload_progress = st.progress(0)
+
+    with st.spinner("Processing uploads..."):
+        results = []
+        if uploaded_file:
+            pdf_result = process_upload("PDF", uploaded_file, uploaded_file.name)
+            results.append(pdf_result)
+
+        if results:
+            for result in results:
+                st.success(result)
+        else:
+            st.warning("No content uploaded. Please provide at least one input.")
+
+    # Check if the progress bar exists before trying to empty it
+    if hasattr(st.session_state, 'upload_progress'):
+        st.session_state.upload_progress.empty()
+        del st.session_state.upload_progress  # Clean up state
+
+if st.button("Clear Database"):
+    result = clear_database()
+    st.success(result)
pages/2_Chat.py ADDED
@@ -0,0 +1,42 @@
+import streamlit as st
+from app import chat_with_ai
+
+st.title("Chat with PDFs")
+
+# Initialize session state for sources if it doesn't exist
+if 'sources' not in st.session_state:
+    st.session_state.sources = None
+
+# Create two columns for chat and sources
+col1, col2 = st.columns([2, 1])
+
+with col1:
+    st.header("Chat")
+    user_input = st.text_input("Ask a question about the uploaded content:")
+    if st.button("Send"):
+        if user_input:
+            print(f"Sending user input: {user_input}")
+            st.session_state.chat_progress = st.progress(0)
+            response, sources = chat_with_ai(user_input)
+            st.session_state.chat_progress.progress(1.0)
+            st.markdown("**You:** " + user_input)
+            st.markdown("**AI:** " + response)
+
+            # Store sources in session state for display in col2
+            st.session_state.sources = sources
+            st.session_state.chat_progress.empty()
+            # Clean up state
+            del st.session_state.chat_progress
+        else:
+            print("Empty user input")
+            st.warning("Please enter a question.")
+
+with col2:
+    st.header("Source Chunks")
+    if st.session_state.sources:
+        for i, source in enumerate(st.session_state.sources, 1):
+            with st.expander(f"Source {i} - {source['doc_name']}"):
+                st.markdown(f"**Chunk Index:** {source['chunk_index']}")
+                st.text(source['text'])
+    else:
+        st.info("Ask a question to see source chunks here.")