# NOTE(review): removed non-Python page residue ("Spaces:" / "Sleeping") left over
# from extraction; it would be a syntax error in this module.
from urllib.parse import quote_plus, unquote

from bs4 import BeautifulSoup
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
def start_requests(self):
    """Prompt for a search query, scrape Google result links via headless
    Chrome, and yield a scrapy ``Request`` for each extracted URL.

    Collects at most ``self.max_scrapes`` result URLs. Logs an error and
    yields nothing if no URLs could be extracted.

    Yields:
        Request: one request per extracted result URL, parsed by ``self.parse``.
    """
    query = input("Enter your search query: ")
    # URL-encode the query so spaces/special characters don't break the URL.
    google_search_url = f"https://www.google.com/search?q={quote_plus(query)}"

    # Set up Selenium with a headless Chrome instance.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    try:
        driver.get(google_search_url)
        page_source = driver.page_source
    finally:
        # Always release the browser process, even if the page load raises.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    urls = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Google wraps organic result links as /url?q=<target>&sa=...
        if not href.startswith('/url?q='):
            continue
        # The wrapped target is percent-encoded; decode it before use.
        url = unquote(href.split('/url?q=')[1].split('&')[0])
        if not url.startswith('http'):
            continue
        urls.append(url)
        if len(urls) == self.max_scrapes:
            break

    if not urls:
        self.logger.error("No URLs extracted from Google search results.")
        return
    self.logger.info(f"Extracted URLs: {urls}")
    for url in urls:
        yield Request(url, callback=self.parse)