import gdown
import os
import numpy as np
import torch
import onnxruntime
from urllib.parse import urlparse, parse_qs, urljoin
import requests
from bs4 import BeautifulSoup
import time
from collections import deque
def download_pdf_from_gdrive(url, output_path=None):
    """
    Download a PDF file from Google Drive using the provided sharing URL.

    Parameters:
        url (str): The Google Drive sharing URL of the PDF file
        output_path (str, optional): The path where the PDF should be saved.
            If not provided, saves in the current directory.

    Returns:
        str: Path to the downloaded file if successful, None if failed

    Note:
        Errors (including invalid URLs) are caught and printed; the function
        returns None instead of raising.
    """
    try:
        # Check that a URL was provided
        if not url:
            raise ValueError("URL cannot be empty")

        # Handle different types of Google Drive URLs
        if 'drive.google.com' not in url:
            raise ValueError("Not a valid Google Drive URL")

        # Extract the file ID from the URL
        if '/file/d/' in url:
            file_id = url.split('/file/d/')[1].split('/')[0]
        elif 'id=' in url:
            file_id = parse_qs(urlparse(url).query)['id'][0]
        else:
            raise ValueError("Could not extract file ID from the URL")

        # Set a default output path if none is provided
        if not output_path:
            output_path = 'downloaded_file.pdf'

        # Ensure the output path ends with .pdf
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'

        # Create the output directory if it doesn't exist
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)

        # Download the file
        output = gdown.download(id=file_id, output=output_path, quiet=False)
        if output is None:
            raise ValueError("Download failed - file might be inaccessible or not exist")
        return output
    except Exception as e:
        print(f"Error downloading PDF: {str(e)}")
        return None
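

# Example usage (a sketch; the Drive file ID and output path below are
# hypothetical placeholders, not part of this project):
#   pdf_path = download_pdf_from_gdrive(
#       "https://drive.google.com/file/d/FILE_ID/view?usp=sharing",
#       output_path="downloads/report.pdf",
#   )
#   if pdf_path:
#       print(f"Saved to {pdf_path}")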
def merge_strings_with_prefix(strings):
    """Merges strings in a list, starting a new item at each bullet ("•") prefix.

    Args:
        strings: A list of strings.

    Returns:
        A single string containing the merged items, joined by spaces.
    """
    result = []
    current_merged_string = ""
    for string in strings:
        if string.startswith("•"):
            # A bullet marks the start of a new item; flush the previous one
            if current_merged_string:
                result.append(current_merged_string)
            current_merged_string = string
        else:
            # Continuation line: append it to the current item
            current_merged_string += string
    if current_merged_string:
        result.append(current_merged_string)
    return ' '.join(result)
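

# Example (illustrative input, not from the project): bullet items that were
# split across extracted lines get merged back into complete items.
#   merge_strings_with_prefix(["• First point,", " continued text", "• Second point"])
#   -> "• First point, continued text • Second point"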
def scrape_website(start_url, delay=1):
    """
    Scrapes all pages of a website and returns their content as a single string.

    Args:
        start_url (str): The starting URL of the website
        delay (int): Delay between requests in seconds, to be polite

    Returns:
        str: Combined text content from all scraped pages
    """
    # Track visited pages and restrict crawling to the starting domain
    visited_urls = set()
    domain = urlparse(start_url).netloc
    queue = deque([start_url])
    all_content = []

    def is_valid_url(url):
        """Check if a URL belongs to the same domain and points to a webpage"""
        parsed = urlparse(url)
        return (
            parsed.netloc == domain and
            parsed.path.split('.')[-1] not in ['pdf', 'jpg', 'png', 'gif', 'jpeg'] and
            '#' not in url
        )

    def extract_text_content(soup):
        """Extract meaningful text content from a BeautifulSoup object"""
        # Remove script, style, and navigation elements
        for script in soup(["script", "style", "header", "footer", "nav"]):
            script.decompose()
        # Get the text content
        text = soup.get_text(separator=' ', strip=True)
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text

    def get_links(soup, base_url):
        """Extract all valid same-domain links from a page"""
        links = []
        for a_tag in soup.find_all('a', href=True):
            url = urljoin(base_url, a_tag['href'])
            if is_valid_url(url):
                links.append(url)
        return links

    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    # Main scraping loop: breadth-first crawl over the site
    while queue:
        url = queue.popleft()
        if url in visited_urls:
            continue
        try:
            print(f"Scraping: {url}")
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract the page's text content
            content = extract_text_content(soup)
            all_content.append(f"URL: {url}\n{content}\n")

            # Add newly discovered links to the queue
            links = get_links(soup, url)
            for link in links:
                if link not in visited_urls:
                    queue.append(link)

            visited_urls.add(url)
            time.sleep(delay)  # Be polite
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            continue

    # Combine all page content into a single string
    combined_content = "\n\n".join(all_content)
    return combined_content