import gdown
import os
import numpy as np
import torch
import onnxruntime
from urllib.parse import urlparse, parse_qs, urljoin
import requests
from bs4 import BeautifulSoup
import time
from collections import deque
def download_pdf_from_gdrive(url, output_path=None):
    """
    Download a PDF file from Google Drive using the provided sharing URL.

    Parameters:
        url (str): The Google Drive sharing URL of the PDF file
        output_path (str, optional): The path where the PDF should be saved.
            If not provided, saves in the current directory.

    Returns:
        str: Path to the downloaded file if successful, None if failed

    Note:
        Errors (including invalid URLs) are caught and printed; the function
        returns None instead of raising.
    """
    try:
        # Check that a URL was provided
        if not url:
            raise ValueError("URL cannot be empty")

        # Handle different types of Google Drive URLs
        if 'drive.google.com' not in url:
            raise ValueError("Not a valid Google Drive URL")

        # Extract the file ID from the URL
        if '/file/d/' in url:
            file_id = url.split('/file/d/')[1].split('/')[0]
        elif 'id=' in url:
            file_id = parse_qs(urlparse(url).query)['id'][0]
        else:
            raise ValueError("Could not extract file ID from the URL")

        # Set a default output path if none is provided
        if not output_path:
            output_path = 'downloaded_file.pdf'

        # Ensure the output path ends with .pdf
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'

        # Create the output directory if it doesn't exist
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)

        # Download the file
        output = gdown.download(id=file_id, output=output_path, quiet=False)
        if output is None:
            raise ValueError("Download failed - file might be inaccessible or not exist")
        return output
    except Exception as e:
        print(f"Error downloading PDF: {str(e)}")
        return None
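

# Example usage (a sketch; the Drive file ID and output path below are
# hypothetical placeholders, not part of this project):
#   pdf_path = download_pdf_from_gdrive(
#       "https://drive.google.com/file/d/FILE_ID/view?usp=sharing",
#       output_path="downloads/report.pdf",
#   )
#   if pdf_path:
#       print(f"Saved to {pdf_path}")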
def merge_strings_with_prefix(strings):
    """Merges strings in a list, starting a new item at each bullet ("•") prefix.

    Args:
        strings: A list of strings.

    Returns:
        A single string containing the merged items, joined by spaces.
    """
    result = []
    current_merged_string = ""
    for string in strings:
        if string.startswith("•"):
            # A bullet marks the start of a new item; flush the previous one
            if current_merged_string:
                result.append(current_merged_string)
            current_merged_string = string
        else:
            # Continuation line: append it to the current item
            current_merged_string += string
    if current_merged_string:
        result.append(current_merged_string)
    return ' '.join(result)
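

# Example (illustrative input, not from the project): bullet items that were
# split across extracted lines get merged back into complete items.
#   merge_strings_with_prefix(["• First point,", " continued text", "• Second point"])
#   -> "• First point, continued text • Second point"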
def scrape_website(start_url, delay=1):
    """
    Scrapes all pages of a website and returns their content as a single string.

    Args:
        start_url (str): The starting URL of the website
        delay (int): Delay between requests in seconds, to be polite

    Returns:
        str: Combined text content from all scraped pages
    """
    # Track visited pages and restrict crawling to the starting domain
    visited_urls = set()
    domain = urlparse(start_url).netloc
    queue = deque([start_url])
    all_content = []

    def is_valid_url(url):
        """Check if a URL belongs to the same domain and points to a webpage"""
        parsed = urlparse(url)
        return (
            parsed.netloc == domain and
            parsed.path.split('.')[-1] not in ['pdf', 'jpg', 'png', 'gif', 'jpeg'] and
            '#' not in url
        )

    def extract_text_content(soup):
        """Extract meaningful text content from a BeautifulSoup object"""
        # Remove script, style, and navigation elements
        for script in soup(["script", "style", "header", "footer", "nav"]):
            script.decompose()
        # Get the text content
        text = soup.get_text(separator=' ', strip=True)
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text

    def get_links(soup, base_url):
        """Extract all valid same-domain links from a page"""
        links = []
        for a_tag in soup.find_all('a', href=True):
            url = urljoin(base_url, a_tag['href'])
            if is_valid_url(url):
                links.append(url)
        return links

    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    # Main scraping loop: breadth-first crawl over the site
    while queue:
        url = queue.popleft()
        if url in visited_urls:
            continue
        try:
            print(f"Scraping: {url}")
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the page's text content
            content = extract_text_content(soup)
            all_content.append(f"URL: {url}\n{content}\n")

            # Add newly discovered links to the queue
            links = get_links(soup, url)
            for link in links:
                if link not in visited_urls:
                    queue.append(link)

            visited_urls.add(url)
            time.sleep(delay)  # Be polite
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            continue

    # Combine all page content into a single string
    combined_content = "\n\n".join(all_content)
    return combined_content