import requests
from bs4 import BeautifulSoup
import json
import os
import time
import subprocess
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote, parse_qs
from lxml import html  # Import lxml
from dotenv import load_dotenv
from pypdf import PdfReader  # Use pypdf instead of PyPDF2
from openai import OpenAI
from pinecone import Pinecone

# cssselect is used by lxml's .cssselect() method, ensure it's installed
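# NOTE (assumption): the third-party packages implied by the imports above can likely be
# installed with something along these lines; exact distribution names/versions may differ:
#   pip install requests beautifulsoup4 lxml cssselect python-dotenv pypdf openai pinecone
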
# --- Initialization ---
load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "main"  # Your index name
try:
    index = pc.Index(index_name)
    print(f"Connected to Pinecone index: {index_name}")
    # Optional: Check index stats
    # print(index.describe_index_stats())
except Exception as e:
    print(f"Error connecting to Pinecone index '{index_name}': {e}")
    print("Please ensure the index exists and API keys are correct.")
    exit()
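
# NOTE (assumption): load_dotenv() is expected to pick up a .env file next to this script
# providing the two keys read above, e.g.:
#   OPENAI_API_KEY=...
#   PINECONE_API_KEY=...
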
# URL provided by the user
url = "https://libgen.rs/search.php?&req=topicid147&phrase=1&view=simple&column=topic&sort=def&sortmode=ASC&page=1"

# Headers mimicking a browser request (Removed Host)
base_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9',
    'connection': 'keep-alive',
    'dnt': '1',
    'sec-ch-ua': '"Not:A-Brand";v="24", "Chromium";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',  # May need adjustment if navigating between domains
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
}
print(f"Attempting to fetch: {url}") | |
try: | |
response = requests.get(url, headers=base_headers) | |
response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx) | |
print("Successfully fetched page content.") | |
soup = BeautifulSoup(response.text, 'html.parser') | |
# Find the main table containing search results (often class='c' in libgen) | |
# Inspecting the source, the table seems to be the 3rd table on the page, | |
# or more reliably, the one with width="100%" and rules="rows" | |
results_table = soup.find('table', {'rules': 'rows', 'width': '100%'}) | |
book_links = [] | |
base_url = "https://libgen.rs/" | |
if results_table: | |
print("Found results table. Processing rows...") | |
rows = results_table.find_all('tr') | |
print(f"Found {len(rows) - 1} potential book entries (excluding header).") | |
# Skip the header row (index 0) | |
for row in rows[1:]: | |
cells = row.find_all('td') | |
# Ensure the row has enough cells (at least 3 for the link) | |
if len(cells) > 2: | |
link_cell = cells[2] # The third column usually contains the title link | |
link_tag = link_cell.find('a') | |
if link_tag and link_tag.has_attr('href'): | |
relative_link = link_tag['href'] | |
# Ensure it's a book link (often starts with 'book/') | |
if relative_link.startswith('book/'): | |
full_link = base_url + relative_link | |
book_links.append(full_link) | |
else: | |
print("Skipping row with insufficient cells.") | |
print(f"Extracted {len(book_links)} book links.") | |
# Save the links to a JSON file | |
output_filename = 'links.json' | |
with open(output_filename, 'w') as f: | |
json.dump(book_links, f, indent=4) | |
print(f"Successfully saved links to {output_filename}") | |
else: | |
print("Could not find the results table. Check the HTML structure or selectors.") | |
except requests.exceptions.RequestException as e: | |
print(f"Error fetching URL: {e}") | |
except Exception as e: | |
print(f"An error occurred: {e}") | |

# Known download host patterns (check hostname ENDS WITH these)
DOWNLOAD_HOST_PATTERNS = ['.library.lol', '.books.ms', '.libstc.cc']
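# Illustrative only (not taken from real data): a final link of the form
# "https://download.library.lol/main/<...>/<file>.pdf" would pass the checks in
# get_final_download_link() below (hostname ends with a known pattern, path contains '/main/').
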
def get_embedding(text):
    """Returns the embedding vector for `text` via OpenAI, or None if the request fails."""
    try:
        response = client.embeddings.create(input=text, model="text-embedding-3-large")
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None
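
# Usage sketch (assumes text-embedding-3-large's default dimensionality of 3072):
#   vec = get_embedding("some chunk of text")  # -> list[float] of length 3072, or None on error
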
def convert_djvu_to_pdf(djvu_filepath):
    """Converts a DJVU file to PDF using the djvu2pdf command-line tool."""
    pdf_filepath = os.path.splitext(djvu_filepath)[0] + ".pdf"
    command = ["djvu2pdf", djvu_filepath, pdf_filepath]
    print(f"Converting {os.path.basename(djvu_filepath)} to PDF...")
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print(f"Successfully converted to {os.path.basename(pdf_filepath)}")
        return pdf_filepath
    except FileNotFoundError:
        print("Error: 'djvu2pdf' command not found. Please install djvulibre.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        print(f"Stderr: {e.stderr}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during conversion: {e}")
        return None
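
# Roughly equivalent shell invocation wrapped above (assumes djvu2pdf is on PATH):
#   djvu2pdf input.djvu input.pdf
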
def process_and_upsert_pdf(pdf_filepath, original_filename):
    """Reads PDF, chunks text, gets embeddings, and upserts to Pinecone."""
    print(f"Processing PDF for Pinecone: {os.path.basename(pdf_filepath)}")
    doc_id = str(uuid.uuid4())
    try:
        reader = PdfReader(pdf_filepath)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # Add text only if extraction succeeded
                text += page_text + "\n"

        if not text:
            print(f"Warning: No text extracted from {os.path.basename(pdf_filepath)}")
            return f"Processed (No Text): {original_filename}"

        content_length = len(text)
        print(f"Extracted text length: {content_length}")

        # Simple chunking (adjust size as needed)
        chunk_size = 2000
        chunks = [text[i:i+chunk_size] for i in range(0, content_length, chunk_size)]
        print(f"Split into {len(chunks)} chunks.")

        vectors = []
        for i, chunk in enumerate(chunks):
            embedding = get_embedding(chunk)
            if embedding:
                vectors.append((
                    f"{doc_id}_{i}",
                    embedding,
                    {
                        "text": chunk,
                        "type": "PDF",
                        "doc_id": doc_id,
                        "doc_name": original_filename,  # Store the original filename
                        "chunk_index": i
                    }
                ))
            else:
                print(f"Skipping chunk {i} due to embedding error.")

        if not vectors:
            print("No vectors generated for upsert.")
            return f"Processed (No Vectors): {original_filename}"

        # Upsert in batches if necessary (Pinecone recommends batching)
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i+batch_size]
            try:
                index.upsert(vectors=batch)
                print(f"Upserted batch {i//batch_size + 1} to Pinecone.")
            except Exception as e:
                print(f"Error upserting batch to Pinecone: {e}")
                # Decide if you want to stop or continue with other batches

        print(f"Successfully processed and upserted {original_filename} to Pinecone.")
        return f"Upserted: {original_filename}"
    except Exception as e:
        print(f"Error processing PDF {os.path.basename(pdf_filepath)}: {e}")
        return f"Error (Processing): {original_filename}"
def get_final_download_link(intermediate_page_url):
    """Visits an intermediate page (e.g., books.ms/main/HASH)
    and scrapes the final download link using the selector '#download a'.
    """
    print(f"Fetching final link from intermediate page: {intermediate_page_url}")
    try:
        # Update Host header for the specific request
        request_headers = base_headers.copy()
        parsed_url = urlparse(intermediate_page_url)
        if parsed_url.netloc:
            request_headers['Host'] = parsed_url.netloc

        response = requests.get(intermediate_page_url, headers=request_headers, timeout=20)
        response.raise_for_status()

        tree = html.fromstring(response.content)
        found_link = None

        # --- Attempt the simple, effective selector ---
        css_selector = "#download a"  # Target first anchor within #download
        print(f"Attempting CSS selector: {css_selector}")
        link_elements = tree.cssselect(css_selector)

        if link_elements:
            link_tag = link_elements[0]  # Take the first one found
            href = link_tag.get('href')
            if href:
                parsed_href = urlparse(href)
                # Validation:
                if (parsed_href.scheme and parsed_href.netloc and
                        '/main/' in parsed_href.path and
                        any(parsed_href.hostname.endswith(pattern) for pattern in DOWNLOAD_HOST_PATTERNS)):
                    print(f"Found final download link via CSS selector: {href}")
                    found_link = href
                else:
                    # If the first link doesn't validate, log it but don't proceed
                    print(f"Selector '{css_selector}' found link, but failed validation: {href}")
            else:
                print(f"Selector '{css_selector}' found link tag, but it has no href.")
        else:
            print(f"CSS selector {css_selector} did not find any elements.")

        # --- Return result ---
        if found_link:
            return found_link
        else:
            # No valid link was found after checking the first '#download a'
            print(f"Could not find valid download link using CSS selector '{css_selector}' on {intermediate_page_url}")
            return None

    except requests.exceptions.Timeout:
        print(f"Timeout error fetching intermediate page {intermediate_page_url}")
        return None
    except requests.exceptions.RequestException as e:
        if e.response is not None:
            print(f"Error fetching intermediate page {intermediate_page_url}: Status {e.response.status_code}")
        else:
            print(f"Error fetching intermediate page {intermediate_page_url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing intermediate page {intermediate_page_url} with lxml: {e}")
        return None
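
# Usage sketch (<MD5> is a placeholder; the main section below builds these URLs):
#   final_url = get_final_download_link("http://books.ms/main/<MD5>")
#   # -> a direct file URL on one of the DOWNLOAD_HOST_PATTERNS hosts, or None
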
def download_file_and_process(download_url, download_dir):
    """Downloads file, converts DJVU to PDF if needed, and triggers Pinecone upsert.

    Returns a status message.
    """
    if not download_url:
        return "Skipped: No download URL provided."

    processing_status = ""  # To store the outcome of PDF processing/upserting
    original_filename = "Unknown"
    final_filepath = None  # Path to the file to be processed (PDF)
    djvu_filepath_to_delete = None

    try:
        # --- Downloading ---
        parsed_url = urlparse(download_url)
        path_parts = [part for part in parsed_url.path.split('/') if part]
        filename_base = unquote(path_parts[-1]) if path_parts else f"download_{int(time.time())}"
        original_filename = filename_base  # Keep original name for metadata

        print(f"Attempting to download: {download_url}")
        response = requests.get(download_url, headers=base_headers, stream=True, timeout=120)  # Increased timeout
        response.raise_for_status()

        # --- Determine File Path and Extension ---
        content_disposition = response.headers.get('Content-Disposition')
        extension = '.pdf'  # Default
        if content_disposition:
            if 'filename=' in content_disposition:
                disp_filename = content_disposition.split('filename=')[-1].strip('"\'')
                if '.' in disp_filename:
                    extension = os.path.splitext(disp_filename)[1].lower()
        else:
            # Check extension from URL path if no Content-Disposition
            if '.' in filename_base:
                url_ext = os.path.splitext(filename_base)[1].lower()
                if url_ext in ['.pdf', '.djvu', '.epub', '.mobi']:  # Add other relevant types if needed
                    extension = url_ext

        filename = filename_base
        if not filename.lower().endswith(extension):
            filename += extension
        download_filepath = os.path.join(download_dir, filename)

        # --- Save File ---
        if os.path.exists(download_filepath):
            print(f"File already exists: {filename}")
            # Decide if we should still process it for Pinecone
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                # Check if corresponding PDF exists from previous run
                pdf_equiv = os.path.splitext(download_filepath)[0] + ".pdf"
                if os.path.exists(pdf_equiv):
                    print(f"Corresponding PDF already exists: {os.path.basename(pdf_equiv)}")
                    final_filepath = pdf_equiv
                else:
                    # Convert existing DJVU
                    print("DJVU exists but PDF doesn't. Converting...")
                    converted_pdf = convert_djvu_to_pdf(download_filepath)
                    if converted_pdf:
                        final_filepath = converted_pdf
            else:
                return f"Skipped (Exists, Non-PDF/DJVU): {filename}"
        else:
            print(f"Downloading to: {download_filepath}")
            with open(download_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Successfully downloaded: {filename}")

            # --- Post-Download Processing ---
            if download_filepath.lower().endswith('.pdf'):
                final_filepath = download_filepath
            elif download_filepath.lower().endswith('.djvu'):
                converted_pdf = convert_djvu_to_pdf(download_filepath)
                if converted_pdf:
                    final_filepath = converted_pdf
                    djvu_filepath_to_delete = download_filepath  # Mark original for deletion
            else:
                print(f"Downloaded non-PDF/DJVU file: {filename}. Skipping Pinecone process.")
                return f"Success (DL Only): {filename}"

        # --- Pinecone Upsert Trigger ---
        if final_filepath and os.path.exists(final_filepath):
            processing_status = process_and_upsert_pdf(final_filepath, original_filename)
            # Optional: Delete original DJVU after successful conversion and processing
            if djvu_filepath_to_delete and 'Error' not in processing_status:
                try:
                    os.remove(djvu_filepath_to_delete)
                    print(f"Deleted original DJVU: {os.path.basename(djvu_filepath_to_delete)}")
                except Exception as e:
                    print(f"Error deleting DJVU file {os.path.basename(djvu_filepath_to_delete)}: {e}")
        else:
            processing_status = "Skipped Upsert (No PDF)"

        return f"Download OK. Status: {processing_status}"

    except requests.exceptions.Timeout:
        print(f"Timeout error downloading {download_url}")
        return f"Error (Timeout): {original_filename}"
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {download_url}: {e}")
        return f"Error (RequestException): {original_filename}"
    except Exception as e:
        print(f"An unexpected error occurred during download/process of {original_filename}: {e}")
        return f"Error (Unexpected): {original_filename}"

# --- Main Execution ---
input_filename = 'links.json'
download_dir = 'downloads'
max_workers = 3  # Reduce workers slightly due to processing load

# Create download directory if it doesn't exist
if not os.path.exists(download_dir):
    os.makedirs(download_dir)
    print(f"Created directory: {download_dir}")

# --- Read original libgen.rs book page URLs ---
try:
    with open(input_filename, 'r') as f:
        # Load all URLs as originally intended
        libgen_book_page_urls = json.load(f)
    print(f"Loaded {len(libgen_book_page_urls)} libgen.rs book page URLs from {input_filename}")
except FileNotFoundError:
    print(f"Error: {input_filename} not found. Please run the initial link scraping part first.")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {input_filename}.")
    exit()

if not libgen_book_page_urls:
    print("No book page URLs found in the file. Exiting.")
    exit()

# --- Stage 1: Construct intermediate URLs and get final download links ---
final_download_links = []
intermediate_urls_to_try = []

print("\n--- Constructing Intermediate URLs ---")
# Process all URLs again
for url in libgen_book_page_urls:
    try:
        parsed_libgen_url = urlparse(url)
        query_params = parse_qs(parsed_libgen_url.query)
        md5_list = query_params.get('md5')
        if md5_list:
            md5 = md5_list[0]
            intermediate_url = f"http://books.ms/main/{md5}"
            intermediate_urls_to_try.append(intermediate_url)
            # Maybe remove verbose printing for the full run
            # print(f"Constructed: {intermediate_url} from {url}")
        else:
            print(f"Could not extract MD5 from {url}")
    except Exception as e:
        print(f"Error processing libgen URL {url}: {e}")
print(f"\n--- Fetching Final Download Links from {len(intermediate_urls_to_try)} intermediate URLs ---") | |
if intermediate_urls_to_try: | |
with ThreadPoolExecutor(max_workers=max_workers) as executor: | |
# Use the renamed function get_final_download_link | |
future_to_url = {executor.submit(get_final_download_link, intermediate_url): intermediate_url for intermediate_url in intermediate_urls_to_try} | |
for future in as_completed(future_to_url): | |
intermediate_url = future_to_url[future] | |
try: | |
result = future.result() | |
if result: | |
final_download_links.append(result) | |
except Exception as exc: | |
print(f'Fetching final download link for {intermediate_url} generated an exception: {exc}') | |
print(f"\nFound {len(final_download_links)} final download links.") | |

# --- Stage 2: Download, Convert, and Process files concurrently ---
print("\n--- Downloading, Converting, Processing Files ---")
download_process_results = []
if final_download_links:
    # Use the new function that handles download, conversion, and upsert trigger
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_download = {executor.submit(download_file_and_process, link, download_dir): link
                              for link in final_download_links}
        for future in as_completed(future_to_download):
            link = future_to_download[future]
            try:
                result_message = future.result()
                download_process_results.append(result_message)
                print(f"Overall Result for {link}: {result_message}")
            except Exception as exc:
                print(f'Download/Processing {link} generated an exception: {exc}')
                download_process_results.append(f"Error (Exception): {link}")
else:
    print("No final download links found, skipping download/process stage.")

# --- Final Summary ---
print("\n--- Final Summary ---")
# Analyze the result strings for a more detailed summary (optional).
# Note: "Success (DL Only)" is returned unwrapped by download_file_and_process,
# so it is matched without the "Download OK. Status: " prefix.
success_upsert_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Upserted'))
success_dl_only_count = sum(1 for r in download_process_results if r.startswith('Success (DL Only)'))
success_no_text_count = sum(1 for r in download_process_results if r.startswith('Download OK. Status: Processed (No Text)'))
skipped_dl_count = sum(1 for r in download_process_results if r.startswith('Skipped'))
error_count = (len(download_process_results) - success_upsert_count - success_dl_only_count
               - success_no_text_count - skipped_dl_count)

print(f"Total final links attempted: {len(final_download_links)}")
print(f"Successfully Downloaded & Upserted to Pinecone: {success_upsert_count}")
print(f"Successfully Downloaded (Non-PDF/DJVU or Skipped Upsert): {success_dl_only_count + success_no_text_count}")
print(f"Skipped Download (e.g., already exists): {skipped_dl_count}")
print(f"Errors (Download/Conversion/Process/Upsert): {error_count}")
print(f"\nDownloads attempted in the '{download_dir}' directory.")
# --- End Main Execution ---