colombia_birds_quiz / scrapper.py
import requests
from bs4 import BeautifulSoup
import re


def get_image_url(regionCode, taxonCode):
    """Return the Macaulay Library asset ID of the top-rated photo for the
    given region and taxon codes, or None if the request fails or no asset
    ID is found in the page's embedded scripts."""
    url = f"https://search.macaulaylibrary.org/catalog?regionCode={regionCode}&taxonCode={taxonCode}&sort=rating_rank_desc&mediaType=photo"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # The catalog page embeds its results in <script> tags; grab the first assetId
        script_str = soup.find_all('script')
        match = re.search(r'assetId:(\d+),', str(script_str))
        if match:
            asset_id = match.group(1)
            return asset_id
        else:
            return None
    else:
        return None
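

# Usage sketch (not called in the original script): turn the returned asset ID
# into a viewable photo URL. The CDN URL pattern below is an assumption about
# the Macaulay Library media host, not something this file confirms.
#
#     asset_id = get_image_url("CO-ANT", "bertin1")
#     if asset_id:
#         image_url = f"https://cdn.download.ams.birds.cornell.edu/api/v1/asset/{asset_id}/1200"
#         print(image_url)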
# Set the URL of the webpage to be scraped
url = "https://search.macaulaylibrary.org/catalog?regionCode=CO-ANT&taxonCode=bertin1&sort=rating_rank_desc&mediaType=photo"
# Send a GET request to the URL
response = requests.get(url)
# Check if the request was successful (status code 200)
if response.status_code == 200:
    # print('ok')
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Dump the page's <script> tags to a file for inspection
    script_str = soup.find_all('script')
    with open('prueba4.html', 'w', encoding='utf-8') as f:
        f.write(str(script_str))

    # Extract the results gallery rows from the HTML content
    # enlace = soup.find('meta', {'property': 'og:image'})['content']
    enlace = soup.find_all('div', {'id': 'ResultsGallery-row'})

    # Print the gallery rows
    print(enlace)

    # with open('prueba2.html', 'w', encoding='utf-8') as f:
    #     f.write(str(soup))
    # print(soup.prettify())

    # Search the embedded <script> tags for the first asset ID
    # prueba = soup.find_all('div', {'class': 'ResultsGallery'})
    prueba = soup.find_all('script')
    # print(str(prueba))
    match = re.search(r'assetId:(\d+),', str(prueba))
    if match:
        asset_id = match.group(1)
        print(f"Asset ID: {asset_id}")
    else:
        print("Asset ID not found.")

    # ---------------
    # with open('prueba.html', 'w', encoding='utf-8') as f:
    #     f.write(str(prueba))
    # # Now you can use BeautifulSoup to extract information from the HTML
    # # For example, let's extract all image URLs
    # image_urls = [img['src'] for img in soup.find_all('data-asset-id')]
    # # Print the extracted image URLs
    # for i, url in enumerate(image_urls, start=1):
    #     print(f"Image {i}: {url}")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")