import gdown
import os
import numpy as np
import torch
import onnxruntime
from urllib.parse import urlparse, parse_qs, urljoin
import requests
from bs4 import BeautifulSoup
import time
from collections import deque

def download_pdf_from_gdrive(url, output_path=None):
    """
    Download a PDF file from Google Drive using the provided sharing URL.

    Parameters:
    url (str): The Google Drive sharing URL of the PDF file
    output_path (str, optional): The path where the PDF should be saved.
                                If not provided, saves in current directory.

    Returns:
    str: Path to the downloaded file if successful, None if failed

    Raises:
    ValueError: If the URL is invalid or doesn't point to a Google Drive file
    """
    try:
        # Check if URL is provided
        if not url:
            raise ValueError("URL cannot be empty")

        # Handle different types of Google Drive URLs
        if 'drive.google.com' not in url:
            raise ValueError("Not a valid Google Drive URL")

        # Extract file ID from the URL
        if '/file/d/' in url:
            file_id = url.split('/file/d/')[1].split('/')[0]
        elif 'id=' in url:
            file_id = parse_qs(urlparse(url).query)['id'][0]
        else:
            raise ValueError("Could not extract file ID from the URL")

        # Set default output path if none provided
        if not output_path:
            output_path = 'downloaded_file.pdf'

        # Ensure the output path ends with .pdf
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'

        # Create the directory if it doesn't exist
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)

        # Download the file
        output = gdown.download(id=file_id, output=output_path, quiet=False)

        if output is None:
            raise ValueError("Download failed - the file may be inaccessible or may not exist")

        return output

    except Exception as e:
        print(f"Error downloading PDF: {str(e)}")
        return None

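# Illustrative usage sketch (the sharing URL and output path below are placeholders,
# not values taken from this project):
#
#     pdf_path = download_pdf_from_gdrive(
#         "https://drive.google.com/file/d/FILE_ID/view?usp=sharing",
#         output_path="downloads/report.pdf",
#     )
#     if pdf_path:
#         print(f"Saved PDF to {pdf_path}")
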
def merge_strings_with_prefix(strings):
    """Merges a list of strings into bullet-point entries.

    Each string that starts with the bullet prefix "•" begins a new entry;
    strings that do not start with "•" are appended to the current entry.

    Args:
    strings: A list of strings.

    Returns:
    A single string containing the merged entries, joined with spaces.
    """

    result = []
    current_merged_string = ""

    for string in strings:
        if string.startswith("•"):
            if current_merged_string:
                result.append(current_merged_string)
            current_merged_string = string
        else:
            current_merged_string += string

    if current_merged_string:
        result.append(current_merged_string)

    return ' '.join(result)

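# Illustrative example (made-up input, not from this project): fragments that do not
# start with "•" are folded into the preceding bullet entry, and the merged entries
# are then joined with spaces into a single string.
#
#     merge_strings_with_prefix(["• First", " item", "• Second"])
#     # -> "• First item • Second"
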
def scrape_website(start_url, delay=1):
    """
    Scrapes all pages of a website and returns their content as a single string.
    
    Args:
        start_url (str): The starting URL of the website
        delay (int): Delay between requests in seconds to be polite
    
    Returns:
        str: Combined content from all pages
    """
    # Initialize sets for tracking
    visited_urls = set()
    domain = urlparse(start_url).netloc
    queue = deque([start_url])
    all_content = []
    
    def is_valid_url(url):
        """Check if URL belongs to the same domain and looks like an HTML page"""
        parsed = urlparse(url)
        return (
            parsed.netloc == domain and
            # Case-insensitive extension check so links such as ".PDF" are also skipped
            parsed.path.split('.')[-1].lower() not in ['pdf', 'jpg', 'png', 'gif', 'jpeg'] and
            '#' not in url
        )
    
    def extract_text_content(soup):
        """Extract meaningful text content from a BeautifulSoup object"""
        # Remove script and style elements
        for script in soup(["script", "style", "header", "footer", "nav"]):
            script.decompose()
            
        # Get text content
        text = soup.get_text(separator=' ', strip=True)
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        return text
    
    def get_links(soup, base_url):
        """Extract all valid links from a page"""
        links = []
        for a_tag in soup.find_all('a', href=True):
            url = urljoin(base_url, a_tag['href'])
            if is_valid_url(url):
                links.append(url)
        return links
    
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    
    # Main scraping loop
    while queue:
        url = queue.popleft()
        if url in visited_urls:
            continue
        # Mark the URL as visited up front so pages that fail to download are not
        # re-queued and retried every time another page links to them.
        visited_urls.add(url)

        try:
            print(f"Scraping: {url}")
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract content
            content = extract_text_content(soup)
            all_content.append(f"URL: {url}\n{content}\n")
            
            # Add new links to queue
            links = get_links(soup, url)
            for link in links:
                if link not in visited_urls:
                    queue.append(link)
            
            time.sleep(delay)  # Be polite between requests
            
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            continue
    
    # Combine all content into a single string
    combined_content = "\n\n".join(all_content)
    return combined_content
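
# Minimal demo sketch, guarded so importing this module has no side effects. The
# target URL below is a placeholder chosen for illustration, not part of the
# original project.
if __name__ == "__main__":
    # Crawl a small site and print the start of the combined text.
    site_text = scrape_website("https://example.com", delay=1)
    print(site_text[:500])

    # To fetch a PDF from Google Drive, pass a sharing URL (placeholder shown):
    # download_pdf_from_gdrive("https://drive.google.com/file/d/FILE_ID/view")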