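"""Scrape a Library Genesis search results page, save the book page links to
links.json, resolve each link through a mirror page to a direct download URL,
download the files (converting DJVU to PDF where needed), then chunk, embed,
and upsert the extracted text into a Pinecone index.
"""
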
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote, parse_qs
from lxml import html # Import lxml
from dotenv import load_dotenv
from pypdf import PdfReader # Use pypdf instead of PyPDF2
from openai import OpenAI
from pinecone import Pinecone
# cssselect is used by lxml's .cssselect() method, ensure it's installed

# --- Initialization --- 
load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "main"  # Your index name
try:
    index = pc.Index(index_name)
    print(f"Connected to Pinecone index: {index_name}")
    # Optional: Check index stats
    # print(index.describe_index_stats())
except Exception as e:
    print(f"Error connecting to Pinecone index '{index_name}': {e}")
    print("Please ensure the index exists and API keys are correct.")
    exit()

# URL provided by the user
url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1"

# Headers mimicking a browser request (Host is set per-request where needed)
base_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9',
    'connection': 'keep-alive',
    'dnt': '1',
    'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin', # May need adjustment if navigating between domains
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
}

print(f"Attempting to fetch: {url}")

try:
    response = requests.get(url, headers=base_headers, timeout=30)  # timeout prevents an unresponsive server from hanging the script
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    print("Successfully fetched page content.")

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main table containing search results (often class='c' in libgen)
    # Inspecting the source, the table seems to be the 3rd table on the page, 
    # or more reliably, the one with width="100%" and rules="rows"
    results_table = soup.find('table', {'rules': 'rows', 'width': '100%'}) 

    book_links = []
    base_url = "https://libgen.rs/"

    if results_table:
        print("Found results table. Processing rows...")
        rows = results_table.find_all('tr')
        print(f"Found {len(rows) - 1} potential book entries (excluding header).")
        
        # Skip the header row (index 0)
        for row in rows[1:]:
            cells = row.find_all('td')
            # Ensure the row has enough cells (at least 3 for the link)
            if len(cells) > 2: 
                link_cell = cells[2] # The third column usually contains the title link
                link_tag = link_cell.find('a')
                if link_tag and link_tag.has_attr('href'):
                    relative_link = link_tag['href']
                    # Ensure it's a book link (often starts with 'book/')
                    if relative_link.startswith('book/'):
                        full_link = base_url + relative_link
                        book_links.append(full_link)
            else:
                print("Skipping row with insufficient cells.")

        print(f"Extracted {len(book_links)} book links.")

        # Save the links to a JSON file
        output_filename = 'links.json'
        with open(output_filename, 'w') as f:
            json.dump(book_links, f, indent=4)
        print(f"Successfully saved links to {output_filename}")

    else:
        print("Could not find the results table. Check the HTML structure or selectors.")

except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

# Known download host patterns (check hostname ENDS WITH these)
DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc'] 

def get_embedding(text):
    try:
        response = client.embeddings.create(input=text, model="text-embedding-3-large")
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None
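
# Optional hardening (illustrative sketch, not wired into the pipeline below):
# embedding calls can fail transiently, so an exponential-backoff retry around
# get_embedding() avoids dropping chunks on one-off API errors.
def get_embedding_with_retry(text, retries=3, base_delay=2):
    """Retry get_embedding() with exponential backoff; returns None if all attempts fail."""
    for attempt in range(retries):
        embedding = get_embedding(text)
        if embedding is not None:
            return embedding
        if attempt < retries - 1:
            time.sleep(base_delay * (2 ** attempt))
    return None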

def convert_djvu_to_pdf(djvu_filepath):
    """Converts a DJVU file to PDF using djvu2pdf command line tool."""
    pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf"
    command = ["djvu2pdf", djvu_filepath, pdf_filepath]
    print(f"Converting {os.path.basename(djvu_filepath)} to PDF...")
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
        print(f"Successfully converted to {os.path.basename(pdf_filepath)}")
        return pdf_filepath
    except FileNotFoundError:
        print(f"Error: 'djvu2pdf' command not found. Please install djvulibre.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        print(f"Stderr: {e.stderr}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during conversion: {e}")
        return None

def process_and_upsert_pdf(pdf_filepath, original_filename):
    """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone."""
    print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}")
    doc_id = str(uuid.uuid4())
    try:
        reader = PdfReader(pdf_filepath)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text: # Add text only if extraction succeeded
                text += page_text + "\n"
        
        if not text:
            print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}")
            return f"Processed (No Text): {original_filename}"

        content_length = len(text)
        print(f"Extracted text length: {content_length}")
        
        # Simple chunking (adjust size as needed)
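        # Note: fixed-size character chunks can split sentences mid-word; an
        # overlapping variant (illustrative, with an assumed overlap value) would
        # step by chunk_size - overlap instead:
        #   overlap = 200
        #   chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size - overlap)]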
        chunk_size = 2000
        chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
        print(f"Split into {len(chunks)} chunks.")

        vectors = []
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            if embedding:
                vectors.append((
                    f"{doc_id}_{i}", 
                    embedding, 
                    {
                        "text": chunk, 
                        "type": "PDF",
                        "doc_id": doc_id,
                        "doc_name": original_filename, # Store the original filename
                        "chunk_index": i
                    }
                ))
            else:
                print(f"Skipping chunk {i} due to embedding error.")

        if not vectors:
            print("No vectors generated for upsert.")
            return f"Processed (No Vectors): {original_filename}"
        
        # Upsert in batches if necessary (Pinecone recommends batching)
        batch_size = 100 
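        # Pinecone recommends small upsert batches (around 100 vectors) and caps
        # request payload size, so keep batches modest since each vector carries
        # the full chunk text in its metadata.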
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i+batch_size]
            try:
                index.upsert(vectors=batch)
                print(f"Upserted batch {i//batch_size + 1} to Pinecone.")
            except Exception as e:
                print(f"Error upserting batch to Pinecone: {e}")
                # Decide if you want to stop or continue with other batches

        print(f"Successfully processed and upserted {original_filename} to Pinecone.")
        return f"Upserted: {original_filename}"

    except Exception as e:
        print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}")
        return f"Error (Processing): {original_filename}"

def get_final_download_link(intermediate_page_url):
    """Visits an intermediate page (e.g., books.ms/main/HASH)
    and scrapes the final download link using the selector #download a.
    """
    print(f"Fetching final link from intermediate page: {intermediate_page_url}")
    try:
        # Update Host header for the specific request
        request_headers = base_headers.copy()
        parsed_url = urlparse(intermediate_page_url)
        if parsed_url.netloc:
            request_headers['Host'] = parsed_url.netloc

        response = requests.get(intermediate_page_url, headers=request_headers, timeout=20)
        response.raise_for_status()
        
        tree = html.fromstring(response.content)
        found_link = None

        # --- Attempt the simple, effective selector --- 
        css_selector = "#download a" # Target first anchor within #download
        print(f"Attempting CSS selector: {css_selector}")
        link_elements = tree.cssselect(css_selector)
        if link_elements:
            link_tag = link_elements[0] # Take the first one found
            href = link_tag.get('href')
            if href:
                parsed_href = urlparse(href)
                # Validation:
                if (parsed_href.scheme and parsed_href.netloc and 
                    '/main/' in parsed_href.path and 
                    any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)):
                     print(f"Found final download link via CSS selector: {href}")
                     found_link = href
                else:
                    # If the first link doesn't validate, maybe log it but don't proceed
                    print(f"Selector '{css_selector}' found link, but failed validation: {href}") 
            else:
                print(f"Selector '{css_selector}' found link tag, but it has no href.")
        else:
            print(f"CSS selector '{css_selector}' did not find any elements.")
             
        # --- Return result --- 
        if found_link:
            return found_link
        else:
            # If no valid link was found after checking the first #download a
            print(f"Could not find valid download link using CSS selector '{css_selector}' on {intermediate_page_url}")
            return None
            
    except requests.exceptions.Timeout:
        print(f"Timeout error fetching intermediate page {intermediate_page_url}")
        return None
    except requests.exceptions.RequestException as e:
        if e.response is not None:
            print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}")
        else:
            print(f"Error fetching intermediate page {intermediate_page_url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}")
        return None

def download_file_and_process(download_url, download_dir):
    """Downloads file, converts DJVU to PDF if needed, and triggers Pinecone upsert.
       Returns a status message.
    """
    if not download_url:
        return "Skipped: No download URL provided."

    processing_status = "" # To store the outcome of PDF processing/upserting
    original_filename = "Unknown"
    final_filepath = None # Path to the file to be processed (PDF)
    djvu_filepath_to_delete = None

    try:
        # --- Downloading --- 
        parsed_url = urlparse(download_url)
        path_parts = [part for part in parsed_url.path.split('/') if part]
        filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}"
        original_filename = filename_base # Keep original name for metadata
        
        print(f"Attempting to download: {download_url}")
        response = requests.get(download_url, headers=base_headers, stream=True, timeout=120) # Increased timeout
        response.raise_for_status()

        # --- Determine File Path and Extension --- 
        content_disposition = response.headers.get('Content-Disposition')
        extension = '.pdf' # Default
        if content_disposition and 'filename=' in content_disposition:
            disp_filename = content_disposition.split('filename=')[-1].strip().strip('"\'')
            if '.' in disp_filename:
                extension = os.path.splitext(disp_filename)[1].lower()
        elif '.' in filename_base:
            # Fall back to the extension in the URL path when the header is missing or unusable
            url_ext = os.path.splitext(filename_base)[1].lower()
            if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']: # Add other relevant types if needed
                extension = url_ext

        filename = filename_base
        if not filename.lower().endswith(extension):
            filename += extension
             
        download_filepath = os.path.join(download_dir, filename)

        # --- Save File --- 
        if os.path.exists(download_filepath):
            print(f"File already exists: {filename}")
            # Decide if we should still process it for Pinecone
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                # Check if corresponding PDF exists from previous run
                pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf"
                if os.path.exists(pdf_equiv):
                    print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}")
                    final_filepath = pdf_equiv
                else:
                    # Convert existing DJVU
                    print("DJVU exists but PDF doesn't. Converting...")
                    converted_pdf = convert_djvu_to_pdf(download_filepath)
                    if converted_pdf:
                        final_filepath = converted_pdf
            else:
                return f"Skipped (Exists, Non-PDF/DJVU): {filename}"
        else:
            print(f"Downloading to: {download_filepath}")
            with open(download_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {filename}")
            
            # --- Post-Download Processing --- 
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                converted_pdf = convert_djvu_to_pdf(download_filepath)
                if converted_pdf:
                    final_filepath = converted_pdf
                    djvu_filepath_to_delete = download_filepath # Mark original for deletion
            else:
                print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.")
                return f"Success (DL Only): {filename}"

        # --- Pinecone Upsert Trigger --- 
        if final_filepath and os.path.exists(final_filepath): 
            processing_status = process_and_upsert_pdf(final_filepath, original_filename)
            
            # Optional: Delete original DJVU after successful conversion and processing
            if djvu_filepath_to_delete and 'Error' not in processing_status:
                try:
                    os.remove(djvu_filepath_to_delete)
                    print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}")
                except Exception as e:
                    print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}")

        else:
            processing_status = "Skipped Upsert (No PDF)"
             
        return f"Download OK. Status: {processing_status}"

    except requests.exceptions.Timeout:
        print(f"Timeout error downloading {download_url}")
        return f"Error (Timeout): {original_filename}"
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {download_url}: {e}")
        return f"Error (RequestException): {original_filename}"
    except Exception as e:
        print(f"An unexpected error occurred during download/process of {original_filename}: {e}")
        return f"Error (Unexpected): {original_filename}"

# --- Main Execution --- 

input_filename = 'links.json' 
download_dir = 'downloads'
max_workers = 3 # Reduce workers slightly due to processing load

# Create download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

# --- Read original libgen.rs book page URLs --- 
try:
    with open(input_filename, 'r') as f:
        # Load all URLs as originally intended
        libgen_book_page_urls = json.load(f)
    print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}")
    
except FileNotFoundError:
    print(f"Error: {input_filename} not found. Please run the initial link scraping part first.")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {input_filename}.")
    exit()

if not libgen_book_page_urls: 
    print("No book page URLs found in the file. Exiting.")
    exit()

# --- Stage 1: Construct intermediate URLs and get final download links --- 
final_download_links = []
intermediate_urls_to_try = []

print("\n--- Constructing Intermediate URLs ---")
# Extract the MD5 hash from each book page URL and build a mirror "main" page URL
for url in libgen_book_page_urls:
    try:
        parsed_libgen_url = urlparse(url)
        query_params = parse_qs(parsed_libgen_url.query)
        md5_list = query_params.get('md5')
        if md5_list:
            md5 = md5_list[0]
            intermediate_url = f"http://books.ms/main/{md5}"
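            # books.ms is just one mirror (cf. DOWNLOAD_HOST_PATTERNS); swap the
            # host here if it becomes unreachable.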
            intermediate_urls_to_try.append(intermediate_url)
            # Maybe remove verbose printing for full run
            # print(f"Constructed: {intermediate_url} from {url}") 
        else:
            print(f"Could not extract MD5 from {url}")
    except Exception as e:
        print(f"Error processing libgen URL {url}: {e}")

print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---")
if intermediate_urls_to_try:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Resolve each intermediate mirror page to its final download link concurrently
        future_to_url = {executor.submit(get_final_download_link, intermediate_url): intermediate_url for intermediate_url in intermediate_urls_to_try}
        for future in as_completed(future_to_url):
            intermediate_url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    final_download_links.append(result)
            except Exception as exc:
                print(f'Fetching final download link for {intermediate_url} generated an exception: {exc}')

print(f"\nFound {len(final_download_links)} final download links.")

# --- Stage 2: Download, Convert, and Process files concurrently --- 
print("\n--- Downloading, Converting, Processing Files ---")
download_process_results = []
if final_download_links:
    # Each task downloads the file, converts DJVU to PDF if needed, and triggers the Pinecone upsert
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_download = {executor.submit(download_file_and_process, link, download_dir): link for link in final_download_links}
        for future in as_completed(future_to_download):
            link = future_to_download[future]
            try:
                result_message = future.result()
                download_process_results.append(result_message)
                print(f"Overall Result for {link}: {result_message}") 
            except Exception as exc:
                print(f'Download/Processing {link} generated an exception: {exc}')
                download_process_results.append(f"Error (Exception): {link}")
else:
    print("No final download links found, skipping download/process stage.")

# --- Final Summary --- 
print("\n--- Final Summary ---")
# Analyze the results strings for a more detailed summary (optional)
success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted'))
success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Success (DL Only)'))
success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)'))
skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped'))
error_count = len(download_process_results) - success_upsert_count - success_dl_only_count - success_no_text_count - skipped_dl_count

print(f"Total final links attempted: {len(final_download_links)}")
print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}")
print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}")
print(f"Skipped Download (e.g., already exists): {skipped_dl_count}")
print(f"Errors (Download/Conversion/Process/Upsert): {error_count}")

print(f"\nDownloads attempted in the '{download_dir}' directory.")

# --- End Main Execution ---