import requests from bs4 import BeautifulSoup import json import os import time import subprocess import uuid from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import urlparse, unquote, parse_qs from lxml import html # Import lxml from dotenv import load_dotenv from pypdf import PdfReader # Use pypdf instead of PyPDF2 from openai import OpenAI from pinecone import Pinecone # cssselect is used by lxml's .cssselect() method, ensure it's installed # --- Initialization --- load_dotenv() # Set up OpenAI client client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) # Set up Pinecone pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY")) index_name = "main" # Your index name try: index = pc.Index(index_name) print(f"Connected to Pinecone index: {index_name}") # Optional: Check index stats # print(index.describe_index_stats()) except Exception as e: print(f"Error connecting to Pinecone index '{index_name}': {e}") print("Please ensure the index exists and API keys are correct.") exit() # URL provided by the user url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1" # Headers mimicking a browser request (Removed Host) base_headers = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'accept-encoding': 'gzip, deflate, br, zstd', 'accept-language': 'en-US,en;q=0.9', 'connection': 'keep-alive', 'dnt': '1', 'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"macOS"', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'same-origin', # May need adjustment if navigating between domains 'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36', } print(f"Attempting to fetch: {url}") try: response = requests.get(url, headers=base_headers) response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx) print("Successfully fetched page content.") soup = BeautifulSoup(response.text, 'html.parser') # Find the main table containing search results (often class='c' in libgen) # Inspecting the source, the table seems to be the 3rd table on the page, # or more reliably, the one with width="100%" and rules="rows" results_table = soup.find('table', {'rules': 'rows', 'width': '100%'}) book_links = [] base_url = "https://libgen.rs/" if results_table: print("Found results table. Processing rows...") rows = results_table.find_all('tr') print(f"Found {len(rows) - 1} potential book entries (excluding header).") # Skip the header row (index 0) for row in rows[1:]: cells = row.find_all('td') # Ensure the row has enough cells (at least 3 for the link) if len(cells) > 2: link_cell = cells[2] # The third column usually contains the title link link_tag = link_cell.find('a') if link_tag and link_tag.has_attr('href'): relative_link = link_tag['href'] # Ensure it's a book link (often starts with 'book/') if relative_link.startswith('book/'): full_link = base_url + relative_link book_links.append(full_link) else: print("Skipping row with insufficient cells.") print(f"Extracted {len(book_links)} book links.") # Save the links to a JSON file output_filename = 'links.json' with open(output_filename, 'w') as f: json.dump(book_links, f, indent=4) print(f"Successfully saved links to {output_filename}") else: print("Could not find the results table. Check the HTML structure or selectors.") except requests.exceptions.RequestException as e: print(f"Error fetching URL: {e}") except Exception as e: print(f"An error occurred: {e}") # Known download host patterns (check hostname ENDS WITH these) DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc'] def get_embedding(text): try: response = client.embeddings.create(input=text, model="text-embedding-3-large") return response.data[0].embedding except Exception as e: print(f"Error getting embedding: {e}") return None def convert_djvu_to_pdf(djvu_filepath): """Converts a DJVU file to PDF using djvu2pdf command line tool.""" pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf" command = ["djvu2pdf", djvu_filepath, pdf_filepath] print(f"Converting {os.path.basename(djvu_filepath)} to PDF...") try: result = subprocess.run(command, check=True, capture_output=True, text=True) print(f"Successfully converted to {os.path.basename(pdf_filepath)}") return pdf_filepath except FileNotFoundError: print(f"Error: 'djvu2pdf' command not found. Please install djvulibre.") return None except subprocess.CalledProcessError as e: print(f"Error during conversion: {e}") print(f"Stderr: {e.stderr}") return None except Exception as e: print(f"An unexpected error occurred during conversion: {e}") return None def process_and_upsert_pdf(pdf_filepath, original_filename): """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone.""" print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}") doc_id = str(uuid.uuid4()) try: reader = PdfReader(pdf_filepath) text = "" for page in reader.pages: page_text = page.extract_text() if page_text: # Add text only if extraction succeeded text += page_text + "\n" if not text: print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}") return f"Processed (No Text): {original_filename}" content_length = len(text) print(f"Extracted text length: {content_length}") # Simple chunking (adjust size as needed) chunk_size = 2000 chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size)] print(f"Split into {len(chunks)} chunks.") vectors = [] for i, chunk in enumerate(chunks): embedding = get_embedding(chunk) if embedding: vectors.append(( f"{doc_id}_{i}", embedding, { "text": chunk, "type": "PDF", "doc_id": doc_id, "doc_name": original_filename, # Store the original filename "chunk_index": i } )) else: print(f"Skipping chunk {i} due to embedding error.") if not vectors: print("No vectors generated for upsert.") return f"Processed (No Vectors): {original_filename}" # Upsert in batches if necessary (Pinecone recommends batching) batch_size = 100 for i in range(0, len(vectors), batch_size): batch = vectors[i:i+batch_size] try: index.upsert(vectors=batch) print(f"Upserted batch {i//batch_size + 1} to Pinecone.") except Exception as e: print(f"Error upserting batch to Pinecone: {e}") # Decide if you want to stop or continue with other batches print(f"Successfully processed and upserted {original_filename} to Pinecone.") return f"Upserted: {original_filename}" except Exception as e: print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}") return f"Error (Processing): {original_filename}" def get_final_download_link(intermediate_page_url): """Visits an intermediate page (e.g., books.ms/main/HASH) and scrapes the final download link using the selector #download a. """ print(f"Fetching final link from intermediate page: {intermediate_page_url}") try: # Update Host header for the specific request request_headers = base_headers.copy() parsed_url = urlparse(intermediate_page_url) if parsed_url.netloc: request_headers['Host'] = parsed_url.netloc response = requests.get(intermediate_page_url, headers=request_headers, timeout=20) response.raise_for_status() tree = html.fromstring(response.content) found_link = None # --- Attempt the simple, effective selector --- css_selector = "#download a" # Target first anchor within #download print(f"Attempting CSS selector: {css_selector}") link_elements = tree.cssselect(css_selector) if link_elements: link_tag = link_elements[0] # Take the first one found href = link_tag.get('href') if href: parsed_href = urlparse(href) # Validation: if (parsed_href.scheme and parsed_href.netloc and '/main/' in parsed_href.path and any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)): print(f"Found final download link via CSS selector: {href}") found_link = href else: # If the first link doesn't validate, maybe log it but don't proceed print(f"Selector '{css_selector}' found link, but failed validation: {href}") else: print(f"Selector '{css_selector}' found link tag, but it has no href.") else: print(f"CSS selector {css_selector} did not find any elements.") # --- Return result --- if found_link: return found_link else: # If no valid link was found after checking the first #download a print(f"Could not find valid download link using CSS selector '{css_selector}' on {intermediate_page_url}") return None except requests.exceptions.Timeout: print(f"Timeout error fetching intermediate page {intermediate_page_url}") return None except requests.exceptions.RequestException as e: if e.response is not None: print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}") else: print(f"Error fetching intermediate page {intermediate_page_url}: {e}") return None except Exception as e: print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}") return None def download_file_and_process(download_url, download_dir): """Downloads file, converts DJVU to PDF if needed, and triggers Pinecone upsert. Returns a status message. """ if not download_url: return "Skipped: No download URL provided." processing_status = "" # To store the outcome of PDF processing/upserting original_filename = "Unknown" final_filepath = None # Path to the file to be processed (PDF) djvu_filepath_to_delete = None try: # --- Downloading --- parsed_url = urlparse(download_url) path_parts = [part for part in parsed_url.path.split('/') if part] filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}" original_filename = filename_base # Keep original name for metadata print(f"Attempting to download: {download_url}") response = requests.get(download_url, headers=base_headers, stream=True, timeout=120) # Increased timeout response.raise_for_status() # --- Determine File Path and Extension --- content_disposition = response.headers.get('Content-Disposition') extension = '.pdf' # Default if content_disposition: if 'filename=' in content_disposition: disp_filename = content_disposition.split('filename=')[-1].strip('"\'') if '.' in disp_filename: extension = os.path.splitext(disp_filename)[1].lower() else: # Check extension from URL path if no content-disposition if '.' in filename_base: url_ext = os.path.splitext(filename_base)[1].lower() if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']: # Add other relevant types if needed extension = url_ext filename = filename_base if not filename.lower().endswith(extension): filename += extension download_filepath = os.path.join(download_dir, filename) # --- Save File --- if os.path.exists(download_filepath): print(f"File already exists: {filename}") # Decide if we should still process it for Pinecone if download_filepath.lower().endswith('.pdf'): final_filepath = download_filepath elif download_filepath.lower().endswith('.djvu'): # Check if corresponding PDF exists from previous run pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf" if os.path.exists(pdf_equiv): print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}") final_filepath = pdf_equiv else: # Convert existing DJVU print("DJVU exists but PDF doesn't. Converting...") converted_pdf = convert_djvu_to_pdf(download_filepath) if converted_pdf: final_filepath = converted_pdf else: return f"Skipped (Exists, Non-PDF/DJVU): {filename}" else: print(f"Downloading to: {download_filepath}") with open(download_filepath, 'wb') as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) print(f"Successfully downloaded: {filename}") # --- Post-Download Processing --- if download_filepath.lower().endswith('.pdf'): final_filepath = download_filepath elif download_filepath.lower().endswith('.djvu'): converted_pdf = convert_djvu_to_pdf(download_filepath) if converted_pdf: final_filepath = converted_pdf djvu_filepath_to_delete = download_filepath # Mark original for deletion else: print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.") return f"Success (DL Only): {filename}" # --- Pinecone Upsert Trigger --- if final_filepath and os.path.exists(final_filepath): processing_status = process_and_upsert_pdf(final_filepath, original_filename) # Optional: Delete original DJVU after successful conversion and processing if djvu_filepath_to_delete and 'Error' not in processing_status: try: os.remove(djvu_filepath_to_delete) print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}") except Exception as e: print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}") else: processing_status = "Skipped Upsert (No PDF)" return f"Download OK. Status: {processing_status}" except requests.exceptions.Timeout: print(f"Timeout error downloading {download_url}") return f"Error (Timeout): {original_filename}" except requests.exceptions.RequestException as e: print(f"Error downloading {download_url}: {e}") return f"Error (RequestException): {original_filename}" except Exception as e: print(f"An unexpected error occurred during download/process of {original_filename}: {e}") return f"Error (Unexpected): {original_filename}" # --- Main Execution --- input_filename = 'links.json' download_dir = 'downloads' max_workers = 3 # Reduce workers slightly due to processing load # Create download directory if it doesn't exist if not os.path.exists(download_dir): os.makedirs(download_dir) print(f"Created directory: {download_dir}") # --- Read original libgen.rs book page URLs --- try: with open(input_filename, 'r') as f: # Load all URLs as originally intended libgen_book_page_urls = json.load(f) print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}") except FileNotFoundError: print(f"Error: {input_filename} not found. Please run the initial link scraping part first.") exit() except json.JSONDecodeError: print(f"Error: Could not decode JSON from {input_filename}.") exit() if not libgen_book_page_urls: print("No book page URLs found in the file. Exiting.") exit() # --- Stage 1: Construct intermediate URLs and get final download links --- final_download_links = [] intermediate_urls_to_try = [] print("\n--- Constructing Intermediate URLs ---") # Process all URLs again for url in libgen_book_page_urls: try: parsed_libgen_url = urlparse(url) query_params = parse_qs(parsed_libgen_url.query) md5_list = query_params.get('md5') if md5_list: md5 = md5_list[0] intermediate_url = f"http://books.ms/main/{md5}" intermediate_urls_to_try.append(intermediate_url) # Maybe remove verbose printing for full run # print(f"Constructed: {intermediate_url} from {url}") else: print(f"Could not extract MD5 from {url}") except Exception as e: print(f"Error processing libgen URL {url}: {e}") print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---") if intermediate_urls_to_try: with ThreadPoolExecutor(max_workers=max_workers) as executor: # Use the renamed function get_final_download_link future_to_url = {executor.submit(get_final_download_link, intermediate_url): intermediate_url for intermediate_url in intermediate_urls_to_try} for future in as_completed(future_to_url): intermediate_url = future_to_url[future] try: result = future.result() if result: final_download_links.append(result) except Exception as exc: print(f'Fetching final download link for {intermediate_url} generated an exception: {exc}') print(f"\nFound {len(final_download_links)} final download links.") # --- Stage 2: Download, Convert, and Process files concurrently --- print("\n--- Downloading, Converting, Processing Files ---") download_process_results = [] if final_download_links: # Use the new function that handles download, conversion, and upsert trigger with ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_download = {executor.submit(download_file_and_process, link, download_dir): link for link in final_download_links} for future in as_completed(future_to_download): link = future_to_download[future] try: result_message = future.result() download_process_results.append(result_message) print(f"Overall Result for {link}: {result_message}") except Exception as exc: print(f'Download/Processing {link} generated an exception: {exc}') download_process_results.append(f"Error (Exception): {link}") else: print("No final download links found, skipping download/process stage.") # --- Final Summary --- print("\n--- Final Summary ---") # Analyze the results strings for a more detailed summary (optional) success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted')) success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Success (DL Only)')) success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)')) skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped')) error_count = len(download_process_results) - success_upsert_count - success_dl_only_count - success_no_text_count - skipped_dl_count print(f"Total final links attempted: {len(final_download_links)}") print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}") print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}") print(f"Skipped Download (e.g., already exists): {skipped_dl_count}") print(f"Errors (Download/Conversion/Process/Upsert): {error_count}") print(f"\nDownloads attempted in the '{download_dir}' directory.") # --- End Main Execution ---