Spaces:
Runtime error
Runtime error
import requests | |
from bs4 import BeautifulSoup | |
import re | |
def get_image_url(regionCode, taxonCode, timeout=10):
    """Return the first asset ID found on a Macaulay Library catalog search page.

    Despite its name, this function returns an asset ID, not an image URL.
    It requests the photo catalog for the given region and taxon, sorted
    with ``rating_rank_desc``, and extracts the first ``assetId:<digits>,``
    occurrence from the page's ``<script>`` tags.

    Args:
        regionCode: Value for the ``regionCode`` query parameter
            (e.g. ``"CO-ANT"``).
        taxonCode: Value for the ``taxonCode`` query parameter
            (e.g. ``"bertin1"``).
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the caller forever.

    Returns:
        The matched digits as a ``str``, or ``None`` when the request fails,
        the response status is not 200, or no asset ID is present.
    """
    try:
        # Let requests build the query string so that values containing
        # reserved characters are percent-encoded instead of corrupting it.
        response = requests.get(
            "https://search.macaulaylibrary.org/catalog",
            params={
                "regionCode": regionCode,
                "taxonCode": taxonCode,
                "sort": "rating_rank_desc",
                "mediaType": "photo",
            },
            timeout=timeout,
        )
    except requests.RequestException:
        # Network failures are reported like any other failure: None.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # The asset IDs live in inline JavaScript, so search the text of all
    # script tags rather than the rendered markup.
    match = re.search(r'assetId:(\d+),', str(soup.find_all('script')))
    return match.group(1) if match else None
# Ad-hoc scrape of one fixed catalog search page: dump its script tags to a
# file, print the gallery row container, and print the first asset ID found.
url = "https://search.macaulaylibrary.org/catalog?regionCode=CO-ANT&taxonCode=bertin1&sort=rating_rank_desc&mediaType=photo"

# Bounded wait so an unresponsive server cannot hang the script indefinitely.
response = requests.get(url, timeout=10)

# Only parse the page when the request succeeded (status code 200).
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    script_str = soup.find_all('script')

    # Save the script tags to disk for manual inspection.
    with open('prueba4.html', 'w', encoding='utf-8') as f:
        f.write(str(script_str))

    # Print whatever elements carry the gallery row id in the fetched HTML.
    enlace = soup.find_all('div', {'id': 'ResultsGallery-row'})
    print(enlace)

    # NOTE: earlier commented-out experiments (og:image meta tag,
    # ResultsGallery div class, data-asset-id lookup) were removed.

    # Reuse the script tags collected above instead of querying the soup a
    # second time; the asset ID is embedded in the inline JavaScript.
    prueba = script_str
    match = re.search(r'assetId:(\d+),', str(prueba))
    if match:
        asset_id = match.group(1)
        print(f"Asset ID: {asset_id}")
    else:
        print("Asset ID not found.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")