# rag-chat / libgen_scraper.py
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote, parse_qs
from lxml import html # Import lxml
from dotenv import load_dotenv
from pypdf import PdfReader # Use pypdf instead of PyPDF2
from openai import OpenAI
from pinecone import Pinecone
# cssselect is used by lxml's .cssselect() method, ensure it's installed
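# Third-party dependencies (assumed package names, roughly):
#   pip install requests beautifulsoup4 lxml cssselect python-dotenv pypdf openai pinecone
# (older releases of the Pinecone SDK ship as 'pinecone-client'.)
# DJVU conversion additionally needs the djvulibre tools ('djvu2pdf') on the PATH.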
# --- Initialization ---
load_dotenv()
# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "main" # Your index name
try:
    index = pc.Index(index_name)
    print(f"Connected to Pinecone index: {index_name}")
    # Optional: Check index stats
    # print(index.describe_index_stats())
except Exception as e:
    print(f"Error connecting to Pinecone index '{index_name}': {e}")
    print("Please ensure the index exists and API keys are correct.")
    exit()
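# Optional sanity check -- a sketch, assuming the index was created for
# text-embedding-3-large, whose default embedding size is 3072:
# desc = pc.describe_index(index_name)
# if desc.dimension != 3072:
#     print(f"Warning: index dimension {desc.dimension} does not match the embedding model.")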
# URL provided by the user
url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1"
# Headers mimicking a browser request (Removed Host)
base_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9',
    'connection': 'keep-alive',
    'dnt': '1',
    'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',  # May need adjustment if navigating between domains
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
}
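# The same headers are sent with every request below; a requests.Session would
# reuse them (and the underlying connections) automatically. A minimal sketch,
# not wired into the rest of the script:
# session = requests.Session()
# session.headers.update(base_headers)
# response = session.get(url, timeout=20)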
print(f"Attempting to fetch: {url}")
try:
    response = requests.get(url, headers=base_headers)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    print("Successfully fetched page content.")
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main table containing search results (often class='c' in libgen).
    # Inspecting the source, the table seems to be the 3rd table on the page,
    # or more reliably, the one with width="100%" and rules="rows".
    results_table = soup.find('table', {'rules': 'rows', 'width': '100%'})

    book_links = []
    base_url = "https://libgen.rs/"

    if results_table:
        print("Found results table. Processing rows...")
        rows = results_table.find_all('tr')
        print(f"Found {len(rows) - 1} potential book entries (excluding header).")

        # Skip the header row (index 0)
        for row in rows[1:]:
            cells = row.find_all('td')
            # Ensure the row has enough cells (at least 3 for the link)
            if len(cells) > 2:
                link_cell = cells[2]  # The third column usually contains the title link
                link_tag = link_cell.find('a')
                if link_tag and link_tag.has_attr('href'):
                    relative_link = link_tag['href']
                    # Ensure it's a book link (often starts with 'book/')
                    if relative_link.startswith('book/'):
                        full_link = base_url + relative_link
                        book_links.append(full_link)
            else:
                print("Skipping row with insufficient cells.")

        print(f"Extracted {len(book_links)} book links.")

        # Save the links to a JSON file
        output_filename = 'links.json'
        with open(output_filename, 'w') as f:
            json.dump(book_links, f, indent=4)
        print(f"Successfully saved links to {output_filename}")
    else:
        print("Could not find the results table. Check the HTML structure or selectors.")
except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
# Known download host patterns (check hostname ENDS WITH these)
DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc']
def get_embedding(text):
    try:
        response = client.embeddings.create(input=text, model="text-embedding-3-large")
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None
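# Optional: embedding calls most often fail on transient rate limits. A minimal
# retry wrapper with exponential backoff (a sketch; not called elsewhere in
# this script):
def get_embedding_with_retry(text, retries=3):
    for attempt in range(retries):
        embedding = get_embedding(text)
        if embedding is not None:
            return embedding
        time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return None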
def convert_djvu_to_pdf(djvu_filepath):
    """Converts a DJVU file to PDF using the djvu2pdf command line tool."""
    pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf"
    command = ["djvu2pdf", djvu_filepath, pdf_filepath]
    print(f"Converting {os.path.basename(djvu_filepath)} to PDF...")
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
        print(f"Successfully converted to {os.path.basename(pdf_filepath)}")
        return pdf_filepath
    except FileNotFoundError:
        print("Error: 'djvu2pdf' command not found. Please install djvulibre.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        print(f"Stderr: {e.stderr}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during conversion: {e}")
        return None
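# Optional: the external tool's presence can be checked once at startup instead
# of failing per file. A sketch using only the standard library (not enabled here):
# import shutil
# if shutil.which("djvu2pdf") is None:
#     print("Warning: 'djvu2pdf' not found; DJVU files will be downloaded but not converted.")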
def process_and_upsert_pdf(pdf_filepath, original_filename):
    """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone."""
    print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}")
    doc_id = str(uuid.uuid4())
    try:
        reader = PdfReader(pdf_filepath)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # Add text only if extraction succeeded
                text += page_text + "\n"

        if not text:
            print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}")
            return f"Processed (No Text): {original_filename}"

        content_length = len(text)
        print(f"Extracted text length: {content_length}")

        # Simple chunking (adjust size as needed)
        chunk_size = 2000
        chunks = [text[i:i + chunk_size] for i in range(0, content_length, chunk_size)]
        print(f"Split into {len(chunks)} chunks.")

        vectors = []
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            if embedding:
                vectors.append((
                    f"{doc_id}_{i}",
                    embedding,
                    {
                        "text": chunk,
                        "type": "PDF",
                        "doc_id": doc_id,
                        "doc_name": original_filename,  # Store the original filename
                        "chunk_index": i
                    }
                ))
            else:
                print(f"Skipping chunk {i} due to embedding error.")

        if not vectors:
            print("No vectors generated for upsert.")
            return f"Processed (No Vectors): {original_filename}"

        # Upsert in batches (Pinecone recommends batching)
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            try:
                index.upsert(vectors=batch)
                print(f"Upserted batch {i // batch_size + 1} to Pinecone.")
            except Exception as e:
                print(f"Error upserting batch to Pinecone: {e}")
                # Decide if you want to stop or continue with other batches

        print(f"Successfully processed and upserted {original_filename} to Pinecone.")
        return f"Upserted: {original_filename}"
    except Exception as e:
        print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}")
        return f"Error (Processing): {original_filename}"
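# Note: the chunking above is contiguous with no overlap, so sentences that
# straddle a 2000-character boundary end up split across vectors. An overlapping
# variant of the list comprehension inside process_and_upsert_pdf (a sketch):
# overlap = 200
# chunks = [text[i:i + chunk_size]
#           for i in range(0, content_length, chunk_size - overlap)]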
def get_final_download_link(intermediate_page_url):
    """Visits an intermediate page (e.g., books.ms/main/HASH)
    and scrapes the final download link using the selector #download a.
    """
    print(f"Fetching final link from intermediate page: {intermediate_page_url}")
    try:
        # Update Host header for the specific request
        request_headers = base_headers.copy()
        parsed_url = urlparse(intermediate_page_url)
        if parsed_url.netloc:
            request_headers['Host'] = parsed_url.netloc

        response = requests.get(intermediate_page_url, headers=request_headers, timeout=20)
        response.raise_for_status()

        tree = html.fromstring(response.content)
        found_link = None

        # --- Attempt the simple, effective selector ---
        css_selector = "#download a"  # Target the first anchor within #download
        print(f"Attempting CSS selector: {css_selector}")
        link_elements = tree.cssselect(css_selector)

        if link_elements:
            link_tag = link_elements[0]  # Take the first one found
            href = link_tag.get('href')
            if href:
                parsed_href = urlparse(href)
                # Validation:
                if (parsed_href.scheme and parsed_href.netloc and
                        '/main/' in parsed_href.path and
                        any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)):
                    print(f"Found final download link via CSS selector: {href}")
                    found_link = href
                else:
                    # If the first link doesn't validate, log it but don't proceed
                    print(f"Selector '{css_selector}' found a link, but it failed validation: {href}")
            else:
                print(f"Selector '{css_selector}' found a link tag, but it has no href.")
        else:
            print(f"CSS selector {css_selector} did not find any elements.")

        # --- Return result ---
        if found_link:
            return found_link
        else:
            # No valid link was found after checking the first '#download a'
            print(f"Could not find a valid download link using CSS selector '{css_selector}' on {intermediate_page_url}")
            return None
    except requests.exceptions.Timeout:
        print(f"Timeout error fetching intermediate page {intermediate_page_url}")
        return None
    except requests.exceptions.RequestException as e:
        if e.response is not None:
            print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}")
        else:
            print(f"Error fetching intermediate page {intermediate_page_url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}")
        return None
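# Note: only the single '#download a' anchor on the books.ms page is followed.
# The suffixes in DOWNLOAD_HOST_PATTERNS suggest other mirrors expose the same
# /main/<md5> layout, so a fallback intermediate URL (e.g. on library.lol) could
# be tried when books.ms is unreachable; that fallback is not implemented here.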
def download_file_and_process(download_url, download_dir):
    """Downloads a file, converts DJVU to PDF if needed, and triggers the Pinecone upsert.
    Returns a status message.
    """
    if not download_url:
        return "Skipped: No download URL provided."

    processing_status = ""  # To store the outcome of PDF processing/upserting
    original_filename = "Unknown"
    final_filepath = None  # Path to the file to be processed (PDF)
    djvu_filepath_to_delete = None

    try:
        # --- Downloading ---
        parsed_url = urlparse(download_url)
        path_parts = [part for part in parsed_url.path.split('/') if part]
        filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}"
        original_filename = filename_base  # Keep original name for metadata

        print(f"Attempting to download: {download_url}")
        response = requests.get(download_url, headers=base_headers, stream=True, timeout=120)  # Increased timeout
        response.raise_for_status()

        # --- Determine File Path and Extension ---
        content_disposition = response.headers.get('Content-Disposition')
        extension = '.pdf'  # Default
        if content_disposition:
            if 'filename=' in content_disposition:
                disp_filename = content_disposition.split('filename=')[-1].strip('"\'')
                if '.' in disp_filename:
                    extension = os.path.splitext(disp_filename)[1].lower()
        else:
            # Check extension from URL path if no Content-Disposition
            if '.' in filename_base:
                url_ext = os.path.splitext(filename_base)[1].lower()
                if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']:  # Add other relevant types if needed
                    extension = url_ext

        filename = filename_base
        if not filename.lower().endswith(extension):
            filename += extension
        download_filepath = os.path.join(download_dir, filename)

        # --- Save File ---
        if os.path.exists(download_filepath):
            print(f"File already exists: {filename}")
            # Decide if we should still process it for Pinecone
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                # Check if a corresponding PDF exists from a previous run
                pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf"
                if os.path.exists(pdf_equiv):
                    print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}")
                    final_filepath = pdf_equiv
                else:
                    # Convert existing DJVU
                    print("DJVU exists but PDF doesn't. Converting...")
                    converted_pdf = convert_djvu_to_pdf(download_filepath)
                    if converted_pdf:
                        final_filepath = converted_pdf
            else:
                return f"Skipped (Exists, Non-PDF/DJVU): {filename}"
        else:
            print(f"Downloading to: {download_filepath}")
            with open(download_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {filename}")

            # --- Post-Download Processing ---
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                converted_pdf = convert_djvu_to_pdf(download_filepath)
                if converted_pdf:
                    final_filepath = converted_pdf
                    djvu_filepath_to_delete = download_filepath  # Mark original for deletion
            else:
                print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.")
                return f"Success (DL Only): {filename}"

        # --- Pinecone Upsert Trigger ---
        if final_filepath and os.path.exists(final_filepath):
            processing_status = process_and_upsert_pdf(final_filepath, original_filename)
            # Optional: Delete the original DJVU after successful conversion and processing
            if djvu_filepath_to_delete and 'Error' not in processing_status:
                try:
                    os.remove(djvu_filepath_to_delete)
                    print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}")
                except Exception as e:
                    print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}")
        else:
            processing_status = "Skipped Upsert (No PDF)"

        return f"Download OK. Status: {processing_status}"
    except requests.exceptions.Timeout:
        print(f"Timeout error downloading {download_url}")
        return f"Error (Timeout): {original_filename}"
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {download_url}: {e}")
        return f"Error (RequestException): {original_filename}"
    except Exception as e:
        print(f"An unexpected error occurred during download/process of {original_filename}: {e}")
        return f"Error (Unexpected): {original_filename}"
# --- Main Execution ---
input_filename = 'links.json'
download_dir = 'downloads'
max_workers = 3 # Reduce workers slightly due to processing load
# Create download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

# --- Read original libgen.rs book page URLs ---
try:
    with open(input_filename, 'r') as f:
        # Load all URLs as originally intended
        libgen_book_page_urls = json.load(f)
    print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}")
except FileNotFoundError:
    print(f"Error: {input_filename} not found. Please run the initial link scraping part first.")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {input_filename}.")
    exit()

if not libgen_book_page_urls:
    print("No book page URLs found in the file. Exiting.")
    exit()
# --- Stage 1: Construct intermediate URLs and get final download links ---
final_download_links = []
intermediate_urls_to_try = []
print("\n--- Constructing Intermediate URLs ---")
# Process all book page URLs
for url in libgen_book_page_urls:
    try:
        parsed_libgen_url = urlparse(url)
        query_params = parse_qs(parsed_libgen_url.query)
        md5_list = query_params.get('md5')
        if md5_list:
            md5 = md5_list[0]
            intermediate_url = f"http://books.ms/main/{md5}"
            intermediate_urls_to_try.append(intermediate_url)
            # Verbose per-URL printing is kept commented out for full runs
            # print(f"Constructed: {intermediate_url} from {url}")
        else:
            print(f"Could not extract MD5 from {url}")
    except Exception as e:
        print(f"Error processing libgen URL {url}: {e}")

print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---")
if intermediate_urls_to_try:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Resolve each intermediate URL to a final download link concurrently
        future_to_url = {
            executor.submit(get_final_download_link, intermediate_url): intermediate_url
            for intermediate_url in intermediate_urls_to_try
        }
        for future in as_completed(future_to_url):
            intermediate_url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    final_download_links.append(result)
            except Exception as exc:
                print(f"Fetching final download link for {intermediate_url} generated an exception: {exc}")

print(f"\nFound {len(final_download_links)} final download links.")
# --- Stage 2: Download, Convert, and Process files concurrently ---
print("\n--- Downloading, Converting, Processing Files ---")
download_process_results = []
if final_download_links:
    # download_file_and_process handles download, conversion, and the upsert trigger
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_download = {
            executor.submit(download_file_and_process, link, download_dir): link
            for link in final_download_links
        }
        for future in as_completed(future_to_download):
            link = future_to_download[future]
            try:
                result_message = future.result()
                download_process_results.append(result_message)
                print(f"Overall Result for {link}: {result_message}")
            except Exception as exc:
                print(f"Download/Processing {link} generated an exception: {exc}")
                download_process_results.append(f"Error (Exception): {link}")
else:
    print("No final download links found, skipping download/process stage.")
# --- Final Summary ---
print("\n--- Final Summary ---")
# Analyze the results strings for a more detailed summary (optional)
success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted'))
success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Success (DL Only)'))
success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)'))
skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped'))
error_count = len(download_process_results) - success_upsert_count - success_dl_only_count - success_no_text_count - skipped_dl_count
print(f"Total final links attempted: {len(final_download_links)}")
print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}")
print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}")
print(f"Skipped Download (e.g., already exists): {skipped_dl_count}")
print(f"Errors (Download/Conversion/Process/Upsert): {error_count}")
print(f"\nDownloads attempted in the '{download_dir}' directory.")
# --- End Main Execution ---
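# The upserted chunks are presumably consumed by the rag-chat app; a minimal
# retrieval sketch against the same index (hypothetical query text, run separately):
# query_embedding = get_embedding("an example question about the downloaded books")
# if query_embedding:
#     results = index.query(vector=query_embedding, top_k=5, include_metadata=True)
#     for match in results.matches:
#         print(match.score, match.metadata.get("doc_name"))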