# rag-chat / libgen_scraper.py
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote, parse_qs
from lxml import html # Import lxml
from dotenv import load_dotenv
from pypdf import PdfReader # Use pypdf instead of PyPDF2
from openai import OpenAI
from pinecone import Pinecone
# cssselect is used by lxml's .cssselect() method, ensure it's installed
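# Third-party dependencies (assumed package names, roughly):
#   pip install requests beautifulsoup4 lxml cssselect python-dotenv pypdf openai pinecone
# (older releases of the Pinecone SDK ship as 'pinecone-client'.)
# DJVU conversion additionally needs the djvulibre tools ('djvu2pdf') on the PATH.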
# --- Initialization ---
load_dotenv()
# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "main" # Your index name
try:
    index = pc.Index(index_name)
    print(f"Connected to Pinecone index: {index_name}")
    # Optional: Check index stats
    # print(index.describe_index_stats())
except Exception as e:
    print(f"Error connecting to Pinecone index '{index_name}': {e}")
    print("Please ensure the index exists and API keys are correct.")
    exit()
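# Optional sanity check -- a sketch, assuming the index was created for
# text-embedding-3-large, whose default embedding size is 3072:
# desc = pc.describe_index(index_name)
# if desc.dimension != 3072:
#     print(f"Warning: index dimension {desc.dimension} does not match the embedding model.")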
# URL provided by the user
url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1"
# Headers mimicking a browser request (Removed Host)
base_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9',
    'connection': 'keep-alive',
    'dnt': '1',
    'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',  # May need adjustment if navigating between domains
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
}
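# The same headers are sent with every request below; a requests.Session would
# reuse them (and the underlying connections) automatically. A minimal sketch,
# not wired into the rest of the script:
# session = requests.Session()
# session.headers.update(base_headers)
# response = session.get(url, timeout=20)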
print(f"Attempting to fetch: {url}")
try:
    response = requests.get(url, headers=base_headers)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    print("Successfully fetched page content.")
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main table containing search results (often class='c' in libgen).
    # Inspecting the source, the table seems to be the 3rd table on the page,
    # or more reliably, the one with width="100%" and rules="rows".
    results_table = soup.find('table', {'rules': 'rows', 'width': '100%'})

    book_links = []
    base_url = "https://libgen.rs/"

    if results_table:
        print("Found results table. Processing rows...")
        rows = results_table.find_all('tr')
        print(f"Found {len(rows) - 1} potential book entries (excluding header).")

        # Skip the header row (index 0)
        for row in rows[1:]:
            cells = row.find_all('td')
            # Ensure the row has enough cells (at least 3 for the link)
            if len(cells) > 2:
                link_cell = cells[2]  # The third column usually contains the title link
                link_tag = link_cell.find('a')
                if link_tag and link_tag.has_attr('href'):
                    relative_link = link_tag['href']
                    # Ensure it's a book link (often starts with 'book/')
                    if relative_link.startswith('book/'):
                        full_link = base_url + relative_link
                        book_links.append(full_link)
            else:
                print("Skipping row with insufficient cells.")

        print(f"Extracted {len(book_links)} book links.")

        # Save the links to a JSON file
        output_filename = 'links.json'
        with open(output_filename, 'w') as f:
            json.dump(book_links, f, indent=4)
        print(f"Successfully saved links to {output_filename}")
    else:
        print("Could not find the results table. Check the HTML structure or selectors.")
except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
# Known download host patterns (check hostname ENDS WITH these)
DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc']
def get_embedding(text):
    try:
        response = client.embeddings.create(input=text, model="text-embedding-3-large")
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None
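# Optional: embedding calls most often fail on transient rate limits. A minimal
# retry wrapper with exponential backoff (a sketch; not called elsewhere in
# this script):
def get_embedding_with_retry(text, retries=3):
    for attempt in range(retries):
        embedding = get_embedding(text)
        if embedding is not None:
            return embedding
        time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return None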
def convert_djvu_to_pdf(djvu_filepath):
    """Converts a DJVU file to PDF using the djvu2pdf command line tool."""
    pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf"
    command = ["djvu2pdf", djvu_filepath, pdf_filepath]
    print(f"Converting {os.path.basename(djvu_filepath)} to PDF...")
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
        print(f"Successfully converted to {os.path.basename(pdf_filepath)}")
        return pdf_filepath
    except FileNotFoundError:
        print("Error: 'djvu2pdf' command not found. Please install djvulibre.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        print(f"Stderr: {e.stderr}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during conversion: {e}")
        return None
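# Optional: the external tool's presence can be checked once at startup instead
# of failing per file. A sketch using only the standard library (not enabled here):
# import shutil
# if shutil.which("djvu2pdf") is None:
#     print("Warning: 'djvu2pdf' not found; DJVU files will be downloaded but not converted.")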
def process_and_upsert_pdf(pdf_filepath, original_filename):
    """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone."""
    print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}")
    doc_id = str(uuid.uuid4())
    try:
        reader = PdfReader(pdf_filepath)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # Add text only if extraction succeeded
                text += page_text + "\n"

        if not text:
            print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}")
            return f"Processed (No Text): {original_filename}"

        content_length = len(text)
        print(f"Extracted text length: {content_length}")

        # Simple chunking (adjust size as needed)
        chunk_size = 2000
        chunks = [text[i:i + chunk_size] for i in range(0, content_length, chunk_size)]
        print(f"Split into {len(chunks)} chunks.")

        vectors = []
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            if embedding:
                vectors.append((
                    f"{doc_id}_{i}",
                    embedding,
                    {
                        "text": chunk,
                        "type": "PDF",
                        "doc_id": doc_id,
                        "doc_name": original_filename,  # Store the original filename
                        "chunk_index": i
                    }
                ))
            else:
                print(f"Skipping chunk {i} due to embedding error.")

        if not vectors:
            print("No vectors generated for upsert.")
            return f"Processed (No Vectors): {original_filename}"

        # Upsert in batches (Pinecone recommends batching)
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            try:
                index.upsert(vectors=batch)
                print(f"Upserted batch {i // batch_size + 1} to Pinecone.")
            except Exception as e:
                print(f"Error upserting batch to Pinecone: {e}")
                # Decide if you want to stop or continue with other batches

        print(f"Successfully processed and upserted {original_filename} to Pinecone.")
        return f"Upserted: {original_filename}"
    except Exception as e:
        print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}")
        return f"Error (Processing): {original_filename}"
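# Note: the chunking above is contiguous with no overlap, so sentences that
# straddle a 2000-character boundary end up split across vectors. An overlapping
# variant of the list comprehension inside process_and_upsert_pdf (a sketch):
# overlap = 200
# chunks = [text[i:i + chunk_size]
#           for i in range(0, content_length, chunk_size - overlap)]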
def get_final_download_link(intermediate_page_url):
    """Visits an intermediate page (e.g., books.ms/main/HASH)
    and scrapes the final download link using the selector #download a.
    """
    print(f"Fetching final link from intermediate page: {intermediate_page_url}")
    try:
        # Update Host header for the specific request
        request_headers = base_headers.copy()
        parsed_url = urlparse(intermediate_page_url)
        if parsed_url.netloc:
            request_headers['Host'] = parsed_url.netloc

        response = requests.get(intermediate_page_url, headers=request_headers, timeout=20)
        response.raise_for_status()

        tree = html.fromstring(response.content)
        found_link = None

        # --- Attempt the simple, effective selector ---
        css_selector = "#download a"  # Target the first anchor within #download
        print(f"Attempting CSS selector: {css_selector}")
        link_elements = tree.cssselect(css_selector)

        if link_elements:
            link_tag = link_elements[0]  # Take the first one found
            href = link_tag.get('href')
            if href:
                parsed_href = urlparse(href)
                # Validation:
                if (parsed_href.scheme and parsed_href.netloc and
                        '/main/' in parsed_href.path and
                        any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)):
                    print(f"Found final download link via CSS selector: {href}")
                    found_link = href
                else:
                    # If the first link doesn't validate, log it but don't proceed
                    print(f"Selector '{css_selector}' found a link, but it failed validation: {href}")
            else:
                print(f"Selector '{css_selector}' found a link tag, but it has no href.")
        else:
            print(f"CSS selector {css_selector} did not find any elements.")

        # --- Return result ---
        if found_link:
            return found_link
        else:
            # No valid link was found after checking the first '#download a'
            print(f"Could not find a valid download link using CSS selector '{css_selector}' on {intermediate_page_url}")
            return None
    except requests.exceptions.Timeout:
        print(f"Timeout error fetching intermediate page {intermediate_page_url}")
        return None
    except requests.exceptions.RequestException as e:
        if e.response is not None:
            print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}")
        else:
            print(f"Error fetching intermediate page {intermediate_page_url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}")
        return None
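# Note: only the single '#download a' anchor on the books.ms page is followed.
# The suffixes in DOWNLOAD_HOST_PATTERNS suggest other mirrors expose the same
# /main/<md5> layout, so a fallback intermediate URL (e.g. on library.lol) could
# be tried when books.ms is unreachable; that fallback is not implemented here.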
def download_file_and_process(download_url, download_dir):
    """Downloads a file, converts DJVU to PDF if needed, and triggers the Pinecone upsert.
    Returns a status message.
    """
    if not download_url:
        return "Skipped: No download URL provided."

    processing_status = ""  # To store the outcome of PDF processing/upserting
    original_filename = "Unknown"
    final_filepath = None  # Path to the file to be processed (PDF)
    djvu_filepath_to_delete = None

    try:
        # --- Downloading ---
        parsed_url = urlparse(download_url)
        path_parts = [part for part in parsed_url.path.split('/') if part]
        filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}"
        original_filename = filename_base  # Keep original name for metadata

        print(f"Attempting to download: {download_url}")
        response = requests.get(download_url, headers=base_headers, stream=True, timeout=120)  # Increased timeout
        response.raise_for_status()

        # --- Determine File Path and Extension ---
        content_disposition = response.headers.get('Content-Disposition')
        extension = '.pdf'  # Default
        if content_disposition:
            if 'filename=' in content_disposition:
                disp_filename = content_disposition.split('filename=')[-1].strip('"\'')
                if '.' in disp_filename:
                    extension = os.path.splitext(disp_filename)[1].lower()
        else:
            # Check extension from URL path if no Content-Disposition
            if '.' in filename_base:
                url_ext = os.path.splitext(filename_base)[1].lower()
                if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']:  # Add other relevant types if needed
                    extension = url_ext

        filename = filename_base
        if not filename.lower().endswith(extension):
            filename += extension
        download_filepath = os.path.join(download_dir, filename)

        # --- Save File ---
        if os.path.exists(download_filepath):
            print(f"File already exists: {filename}")
            # Decide if we should still process it for Pinecone
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                # Check if a corresponding PDF exists from a previous run
                pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf"
                if os.path.exists(pdf_equiv):
                    print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}")
                    final_filepath = pdf_equiv
                else:
                    # Convert existing DJVU
                    print("DJVU exists but PDF doesn't. Converting...")
                    converted_pdf = convert_djvu_to_pdf(download_filepath)
                    if converted_pdf:
                        final_filepath = converted_pdf
            else:
                return f"Skipped (Exists, Non-PDF/DJVU): {filename}"
        else:
            print(f"Downloading to: {download_filepath}")
            with open(download_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {filename}")

            # --- Post-Download Processing ---
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                converted_pdf = convert_djvu_to_pdf(download_filepath)
                if converted_pdf:
                    final_filepath = converted_pdf
                    djvu_filepath_to_delete = download_filepath  # Mark original for deletion
            else:
                print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.")
                return f"Success (DL Only): {filename}"

        # --- Pinecone Upsert Trigger ---
        if final_filepath and os.path.exists(final_filepath):
            processing_status = process_and_upsert_pdf(final_filepath, original_filename)
            # Optional: Delete the original DJVU after successful conversion and processing
            if djvu_filepath_to_delete and 'Error' not in processing_status:
                try:
                    os.remove(djvu_filepath_to_delete)
                    print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}")
                except Exception as e:
                    print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}")
        else:
            processing_status = "Skipped Upsert (No PDF)"

        return f"Download OK. Status: {processing_status}"
    except requests.exceptions.Timeout:
        print(f"Timeout error downloading {download_url}")
        return f"Error (Timeout): {original_filename}"
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {download_url}: {e}")
        return f"Error (RequestException): {original_filename}"
    except Exception as e:
        print(f"An unexpected error occurred during download/process of {original_filename}: {e}")
        return f"Error (Unexpected): {original_filename}"
# --- Main Execution ---
input_filename = 'links.json'
download_dir = 'downloads'
max_workers = 3 # Reduce workers slightly due to processing load
# Create download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

# --- Read original libgen.rs book page URLs ---
try:
    with open(input_filename, 'r') as f:
        # Load all URLs as originally intended
        libgen_book_page_urls = json.load(f)
    print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}")
except FileNotFoundError:
    print(f"Error: {input_filename} not found. Please run the initial link scraping part first.")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {input_filename}.")
    exit()

if not libgen_book_page_urls:
    print("No book page URLs found in the file. Exiting.")
    exit()
# --- Stage 1: Construct intermediate URLs and get final download links ---
final_download_links = []
intermediate_urls_to_try = []
print("\n--- Constructing Intermediate URLs ---")
# Process all book page URLs
for url in libgen_book_page_urls:
    try:
        parsed_libgen_url = urlparse(url)
        query_params = parse_qs(parsed_libgen_url.query)
        md5_list = query_params.get('md5')
        if md5_list:
            md5 = md5_list[0]
            intermediate_url = f"http://books.ms/main/{md5}"
            intermediate_urls_to_try.append(intermediate_url)
            # Verbose per-URL printing is kept commented out for full runs
            # print(f"Constructed: {intermediate_url} from {url}")
        else:
            print(f"Could not extract MD5 from {url}")
    except Exception as e:
        print(f"Error processing libgen URL {url}: {e}")

print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---")
if intermediate_urls_to_try:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Resolve each intermediate URL to a final download link concurrently
        future_to_url = {
            executor.submit(get_final_download_link, intermediate_url): intermediate_url
            for intermediate_url in intermediate_urls_to_try
        }
        for future in as_completed(future_to_url):
            intermediate_url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    final_download_links.append(result)
            except Exception as exc:
                print(f"Fetching final download link for {intermediate_url} generated an exception: {exc}")

print(f"\nFound {len(final_download_links)} final download links.")
# --- Stage 2: Download, Convert, and Process files concurrently ---
print("\n--- Downloading, Converting, Processing Files ---")
download_process_results = []
if final_download_links:
    # download_file_and_process handles download, conversion, and the upsert trigger
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_download = {
            executor.submit(download_file_and_process, link, download_dir): link
            for link in final_download_links
        }
        for future in as_completed(future_to_download):
            link = future_to_download[future]
            try:
                result_message = future.result()
                download_process_results.append(result_message)
                print(f"Overall Result for {link}: {result_message}")
            except Exception as exc:
                print(f"Download/Processing {link} generated an exception: {exc}")
                download_process_results.append(f"Error (Exception): {link}")
else:
    print("No final download links found, skipping download/process stage.")
# --- Final Summary ---
print("\n--- Final Summary ---")
# Analyze the results strings for a more detailed summary (optional)
success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted'))
success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Success (DL Only)'))
success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)'))
skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped'))
error_count = len(download_process_results) - success_upsert_count - success_dl_only_count - success_no_text_count - skipped_dl_count
print(f"Total final links attempted: {len(final_download_links)}")
print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}")
print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}")
print(f"Skipped Download (e.g., already exists): {skipped_dl_count}")
print(f"Errors (Download/Conversion/Process/Upsert): {error_count}")
print(f"\nDownloads attempted in the '{download_dir}' directory.")
# --- End Main Execution ---
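# The upserted chunks are presumably consumed by the rag-chat app; a minimal
# retrieval sketch against the same index (hypothetical query text, run separately):
# query_embedding = get_embedding("an example question about the downloaded books")
# if query_embedding:
#     results = index.query(vector=query_embedding, top_k=5, include_metadata=True)
#     for match in results.matches:
#         print(match.score, match.metadata.get("doc_name"))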