colombia_birds_quiz / scrapper.py
import requests
from bs4 import BeautifulSoup
import re


def get_image_url(regionCode, taxonCode):
    """Return the Macaulay Library asset ID of the top-rated photo for the
    given region and taxon codes, or None if the request fails or no asset
    ID is found in the page's embedded scripts."""
    url = f"https://search.macaulaylibrary.org/catalog?regionCode={regionCode}&taxonCode={taxonCode}&sort=rating_rank_desc&mediaType=photo"
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # The catalog page embeds its results in <script> tags; grab the first assetId
        script_str = soup.find_all('script')
        match = re.search(r'assetId:(\d+),', str(script_str))
        if match:
            asset_id = match.group(1)
            return asset_id
        else:
            return None
    else:
        return None
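

# Usage sketch (not called in the original script): turn the returned asset ID
# into a viewable photo URL. The CDN URL pattern below is an assumption about
# the Macaulay Library media host, not something this file confirms.
#
#     asset_id = get_image_url("CO-ANT", "bertin1")
#     if asset_id:
#         image_url = f"https://cdn.download.ams.birds.cornell.edu/api/v1/asset/{asset_id}/1200"
#         print(image_url)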
# Set the URL of the webpage to be scraped
url = "https://search.macaulaylibrary.org/catalog?regionCode=CO-ANT&taxonCode=bertin1&sort=rating_rank_desc&mediaType=photo"
# Send a GET request to the URL
response = requests.get(url)
# Check if the request was successful (status code 200)
if response.status_code == 200:
    # print('ok')
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Dump the page's <script> tags to a file for inspection
    script_str = soup.find_all('script')
    with open('prueba4.html', 'w', encoding='utf-8') as f:
        f.write(str(script_str))

    # Extract the results gallery rows from the HTML content
    # enlace = soup.find('meta', {'property': 'og:image'})['content']
    enlace = soup.find_all('div', {'id': 'ResultsGallery-row'})

    # Print the gallery rows
    print(enlace)

    # with open('prueba2.html', 'w', encoding='utf-8') as f:
    #     f.write(str(soup))
    # print(soup.prettify())

    # Search the embedded <script> tags for the first asset ID
    # prueba = soup.find_all('div', {'class': 'ResultsGallery'})
    prueba = soup.find_all('script')
    # print(str(prueba))
    match = re.search(r'assetId:(\d+),', str(prueba))
    if match:
        asset_id = match.group(1)
        print(f"Asset ID: {asset_id}")
    else:
        print("Asset ID not found.")

    # ---------------
    # with open('prueba.html', 'w', encoding='utf-8') as f:
    #     f.write(str(prueba))
    # # Now you can use BeautifulSoup to extract information from the HTML
    # # For example, let's extract all image URLs
    # image_urls = [img['src'] for img in soup.find_all('data-asset-id')]
    # # Print the extracted image URLs
    # for i, url in enumerate(image_urls, start=1):
    #     print(f"Image {i}: {url}")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")