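"""Scrape a Library Genesis search results page, save the book page links to
links.json, resolve each link through a mirror page to a direct download URL,
download the files (converting DJVU to PDF where needed), then chunk, embed,
and upsert the extracted text into a Pinecone index.
"""
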
import requests
from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote, parse_qs
from lxml import html # Import lxml
from dotenv import load_dotenv
from pypdf import PdfReader # Use pypdf instead of PyPDF2
from openai import OpenAI
from pinecone import Pinecone
# cssselect is used by lxml's .cssselect() method, ensure it's installed

# --- Initialization --- 
load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "main"  # Your index name
try:
    index = pc.Index(index_name)
    print(f"Connected to Pinecone index: {index_name}")
    # Optional: Check index stats
    # print(index.describe_index_stats())
except Exception as e:
    print(f"Error connecting to Pinecone index '{index_name}': {e}")
    print("Please ensure the index exists and API keys are correct.")
    exit()

# URL provided by the user
url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1"

# Headers mimicking a browser request (Host is set per-request where needed)
base_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9',
    'connection': 'keep-alive',
    'dnt': '1',
    'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin', # May need adjustment if navigating between domains
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
}

print(f"Attempting to fetch: {url}")

try:
    response = requests.get(url, headers=base_headers, timeout=30)  # timeout prevents an unresponsive server from hanging the script
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    print("Successfully fetched page content.")

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the main table containing search results (often class='c' in libgen)
    # Inspecting the source, the table seems to be the 3rd table on the page, 
    # or more reliably, the one with width="100%" and rules="rows"
    results_table = soup.find('table', {'rules': 'rows', 'width': '100%'}) 

    book_links = []
    base_url = "https://libgen.rs/"

    if results_table:
        print("Found results table. Processing rows...")
        rows = results_table.find_all('tr')
        print(f"Found {len(rows) - 1} potential book entries (excluding header).")
        
        # Skip the header row (index 0)
        for row in rows[1:]:
            cells = row.find_all('td')
            # Ensure the row has enough cells (at least 3 for the link)
            if len(cells) > 2: 
                link_cell = cells[2] # The third column usually contains the title link
                link_tag = link_cell.find('a')
                if link_tag and link_tag.has_attr('href'):
                    relative_link = link_tag['href']
                    # Ensure it's a book link (often starts with 'book/')
                    if relative_link.startswith('book/'):
                        full_link = base_url + relative_link
                        book_links.append(full_link)
            else:
                print("Skipping row with insufficient cells.")

        print(f"Extracted {len(book_links)} book links.")

        # Save the links to a JSON file
        output_filename = 'links.json'
        with open(output_filename, 'w') as f:
            json.dump(book_links, f, indent=4)
        print(f"Successfully saved links to {output_filename}")

    else:
        print("Could not find the results table. Check the HTML structure or selectors.")

except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
except Exception as e:
    print(f"An error occurred: {e}")

# Known download host patterns (check hostname ENDS WITH these)
DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc'] 

def get_embedding(text):
    try:
        response = client.embeddings.create(input=text, model="text-embedding-3-large")
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None
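
# Optional hardening (illustrative sketch, not wired into the pipeline below):
# embedding calls can fail transiently, so an exponential-backoff retry around
# get_embedding() avoids dropping chunks on one-off API errors.
def get_embedding_with_retry(text, retries=3, base_delay=2):
    """Retry get_embedding() with exponential backoff; returns None if all attempts fail."""
    for attempt in range(retries):
        embedding = get_embedding(text)
        if embedding is not None:
            return embedding
        if attempt < retries - 1:
            time.sleep(base_delay * (2 ** attempt))
    return None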

def convert_djvu_to_pdf(djvu_filepath):
    """Converts a DJVU file to PDF using djvu2pdf command line tool."""
    pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf"
    command = ["djvu2pdf", djvu_filepath, pdf_filepath]
    print(f"Converting {os.path.basename(djvu_filepath)} to PDF...")
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
        print(f"Successfully converted to {os.path.basename(pdf_filepath)}")
        return pdf_filepath
    except FileNotFoundError:
        print(f"Error: 'djvu2pdf' command not found. Please install djvulibre.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        print(f"Stderr: {e.stderr}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during conversion: {e}")
        return None

def process_and_upsert_pdf(pdf_filepath, original_filename):
    """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone."""
    print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}")
    doc_id = str(uuid.uuid4())
    try:
        reader = PdfReader(pdf_filepath)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text: # Add text only if extraction succeeded
                text += page_text + "\n"
        
        if not text:
            print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}")
            return f"Processed (No Text): {original_filename}"

        content_length = len(text)
        print(f"Extracted text length: {content_length}")
        
        # Simple chunking (adjust size as needed)
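        # Note: fixed-size character chunks can split sentences mid-word; an
        # overlapping variant (illustrative, with an assumed overlap value) would
        # step by chunk_size - overlap instead:
        #   overlap = 200
        #   chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size - overlap)]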
        chunk_size = 2000
        chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
        print(f"Split into {len(chunks)} chunks.")

        vectors = []
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            if embedding:
                vectors.append((
                    f"{doc_id}_{i}", 
                    embedding, 
                    {
                        "text": chunk, 
                        "type": "PDF",
                        "doc_id": doc_id,
                        "doc_name": original_filename, # Store the original filename
                        "chunk_index": i
                    }
                ))
            else:
                print(f"Skipping chunk {i} due to embedding error.")

        if not vectors:
            print("No vectors generated for upsert.")
            return f"Processed (No Vectors): {original_filename}"
        
        # Upsert in batches if necessary (Pinecone recommends batching)
        batch_size = 100 
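        # Pinecone recommends small upsert batches (around 100 vectors) and caps
        # request payload size, so keep batches modest since each vector carries
        # the full chunk text in its metadata.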
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i+batch_size]
            try:
                index.upsert(vectors=batch)
                print(f"Upserted batch {i//batch_size + 1} to Pinecone.")
            except Exception as e:
                print(f"Error upserting batch to Pinecone: {e}")
                # Decide if you want to stop or continue with other batches

        print(f"Successfully processed and upserted {original_filename} to Pinecone.")
        return f"Upserted: {original_filename}"

    except Exception as e:
        print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}")
        return f"Error (Processing): {original_filename}"

def get_final_download_link(intermediate_page_url):
    """Visits an intermediate page (e.g., books.ms/main/HASH)
    and scrapes the final download link using the selector #download a.
    """
    print(f"Fetching final link from intermediate page: {intermediate_page_url}")
    try:
        # Update Host header for the specific request
        request_headers = base_headers.copy()
        parsed_url = urlparse(intermediate_page_url)
        if parsed_url.netloc:
            request_headers['Host'] = parsed_url.netloc

        response = requests.get(intermediate_page_url, headers=request_headers, timeout=20)
        response.raise_for_status()
        
        tree = html.fromstring(response.content)
        found_link = None

        # --- Attempt the simple, effective selector --- 
        css_selector = "#download a" # Target first anchor within #download
        print(f"Attempting CSS selector: {css_selector}")
        link_elements = tree.cssselect(css_selector)
        if link_elements:
            link_tag = link_elements[0] # Take the first one found
            href = link_tag.get('href')
            if href:
                parsed_href = urlparse(href)
                # Validation:
                if (parsed_href.scheme and parsed_href.netloc and 
                    '/main/' in parsed_href.path and 
                    any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)):
                     print(f"Found final download link via CSS selector: {href}")
                     found_link = href
                else:
                    # If the first link doesn't validate, maybe log it but don't proceed
                    print(f"Selector '{css_selector}' found link, but failed validation: {href}") 
            else:
                print(f"Selector '{css_selector}' found link tag, but it has no href.")
        else:
            print(f"CSS selector '{css_selector}' did not find any elements.")
             
        # --- Return result --- 
        if found_link:
            return found_link
        else:
            # If no valid link was found after checking the first #download a
            print(f"Could not find valid download link using CSS selector '{css_selector}' on {intermediate_page_url}")
            return None
            
    except requests.exceptions.Timeout:
        print(f"Timeout error fetching intermediate page {intermediate_page_url}")
        return None
    except requests.exceptions.RequestException as e:
        if e.response is not None:
            print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}")
        else:
            print(f"Error fetching intermediate page {intermediate_page_url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}")
        return None

def download_file_and_process(download_url, download_dir):
    """Downloads file, converts DJVU to PDF if needed, and triggers Pinecone upsert.
       Returns a status message.
    """
    if not download_url:
        return "Skipped: No download URL provided."

    processing_status = "" # To store the outcome of PDF processing/upserting
    original_filename = "Unknown"
    final_filepath = None # Path to the file to be processed (PDF)
    djvu_filepath_to_delete = None

    try:
        # --- Downloading --- 
        parsed_url = urlparse(download_url)
        path_parts = [part for part in parsed_url.path.split('/') if part]
        filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}"
        original_filename = filename_base # Keep original name for metadata
        
        print(f"Attempting to download: {download_url}")
        response = requests.get(download_url, headers=base_headers, stream=True, timeout=120) # Increased timeout
        response.raise_for_status()

        # --- Determine File Path and Extension --- 
        content_disposition = response.headers.get('Content-Disposition')
        extension = '.pdf' # Default
        if content_disposition and 'filename=' in content_disposition:
            disp_filename = content_disposition.split('filename=')[-1].strip().strip('"\'')
            if '.' in disp_filename:
                extension = os.path.splitext(disp_filename)[1].lower()
        elif '.' in filename_base:
            # Fall back to the extension in the URL path when the header is missing or unusable
            url_ext = os.path.splitext(filename_base)[1].lower()
            if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']: # Add other relevant types if needed
                extension = url_ext

        filename = filename_base
        if not filename.lower().endswith(extension):
            filename += extension
             
        download_filepath = os.path.join(download_dir, filename)

        # --- Save File --- 
        if os.path.exists(download_filepath):
            print(f"File already exists: {filename}")
            # Decide if we should still process it for Pinecone
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                # Check if corresponding PDF exists from previous run
                pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf"
                if os.path.exists(pdf_equiv):
                    print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}")
                    final_filepath = pdf_equiv
                else:
                    # Convert existing DJVU
                    print("DJVU exists but PDF doesn't. Converting...")
                    converted_pdf = convert_djvu_to_pdf(download_filepath)
                    if converted_pdf:
                        final_filepath = converted_pdf
            else:
                return f"Skipped (Exists, Non-PDF/DJVU): {filename}"
        else:
            print(f"Downloading to: {download_filepath}")
            with open(download_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {filename}")
            
            # --- Post-Download Processing --- 
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                converted_pdf = convert_djvu_to_pdf(download_filepath)
                if converted_pdf:
                    final_filepath = converted_pdf
                    djvu_filepath_to_delete = download_filepath # Mark original for deletion
            else:
                print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.")
                return f"Success (DL Only): {filename}"

        # --- Pinecone Upsert Trigger --- 
        if final_filepath and os.path.exists(final_filepath): 
            processing_status = process_and_upsert_pdf(final_filepath, original_filename)
            
            # Optional: Delete original DJVU after successful conversion and processing
            if djvu_filepath_to_delete and 'Error' not in processing_status:
                try:
                    os.remove(djvu_filepath_to_delete)
                    print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}")
                except Exception as e:
                    print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}")

        else:
            processing_status = "Skipped Upsert (No PDF)"
             
        return f"Download OK. Status: {processing_status}"

    except requests.exceptions.Timeout:
        print(f"Timeout error downloading {download_url}")
        return f"Error (Timeout): {original_filename}"
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {download_url}: {e}")
        return f"Error (RequestException): {original_filename}"
    except Exception as e:
        print(f"An unexpected error occurred during download/process of {original_filename}: {e}")
        return f"Error (Unexpected): {original_filename}"

# --- Main Execution --- 

input_filename = 'links.json' 
download_dir = 'downloads'
max_workers = 3 # Reduce workers slightly due to processing load

# Create download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

# --- Read original libgen.rs book page URLs --- 
try:
    with open(input_filename, 'r') as f:
        # Load all URLs as originally intended
        libgen_book_page_urls = json.load(f)
    print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}")
    
except FileNotFoundError:
    print(f"Error: {input_filename} not found. Please run the initial link scraping part first.")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {input_filename}.")
    exit()

if not libgen_book_page_urls: 
    print("No book page URLs found in the file. Exiting.")
    exit()

# --- Stage 1: Construct intermediate URLs and get final download links --- 
final_download_links = []
intermediate_urls_to_try = []

print("\n--- Constructing Intermediate URLs ---")
# Extract the MD5 hash from each book page URL and build a mirror "main" page URL
for url in libgen_book_page_urls:
    try:
        parsed_libgen_url = urlparse(url)
        query_params = parse_qs(parsed_libgen_url.query)
        md5_list = query_params.get('md5')
        if md5_list:
            md5 = md5_list[0]
            intermediate_url = f"http://books.ms/main/{md5}"
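            # books.ms is just one mirror (cf. DOWNLOAD_HOST_PATTERNS); swap the
            # host here if it becomes unreachable.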
            intermediate_urls_to_try.append(intermediate_url)
            # Maybe remove verbose printing for full run
            # print(f"Constructed: {intermediate_url} from {url}") 
        else:
            print(f"Could not extract MD5 from {url}")
    except Exception as e:
        print(f"Error processing libgen URL {url}: {e}")

print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---")
if intermediate_urls_to_try:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Resolve each intermediate mirror page to its final download link concurrently
        future_to_url = {executor.submit(get_final_download_link, intermediate_url): intermediate_url for intermediate_url in intermediate_urls_to_try}
        for future in as_completed(future_to_url):
            intermediate_url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    final_download_links.append(result)
            except Exception as exc:
                print(f'Fetching final download link for {intermediate_url} generated an exception: {exc}')

print(f"\nFound {len(final_download_links)} final download links.")

# --- Stage 2: Download, Convert, and Process files concurrently --- 
print("\n--- Downloading, Converting, Processing Files ---")
download_process_results = []
if final_download_links:
    # Each task downloads the file, converts DJVU to PDF if needed, and triggers the Pinecone upsert
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_download = {executor.submit(download_file_and_process, link, download_dir): link for link in final_download_links}
        for future in as_completed(future_to_download):
            link = future_to_download[future]
            try:
                result_message = future.result()
                download_process_results.append(result_message)
                print(f"Overall Result for {link}: {result_message}") 
            except Exception as exc:
                print(f'Download/Processing {link} generated an exception: {exc}')
                download_process_results.append(f"Error (Exception): {link}")
else:
    print("No final download links found, skipping download/process stage.")

# --- Final Summary --- 
print("\n--- Final Summary ---")
# Analyze the results strings for a more detailed summary (optional)
success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted'))
success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Success (DL Only)'))
success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)'))
skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped'))
error_count = len(download_process_results) - success_upsert_count - success_dl_only_count - success_no_text_count - skipped_dl_count

print(f"Total final links attempted: {len(final_download_links)}")
print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}")
print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}")
print(f"Skipped Download (e.g., already exists): {skipped_dl_count}")
print(f"Errors (Download/Conversion/Process/Upsert): {error_count}")

print(f"\nDownloads attempted in the '{download_dir}' directory.")

# --- End Main Execution ---