import re

import requests
from bs4 import BeautifulSoup

# Endpoint of the Macaulay Library catalog search page.
_CATALOG_URL = "https://search.macaulaylibrary.org/catalog"

# Matches the first `assetId:<digits>,` occurrence in the page's script text.
_ASSET_ID_RE = re.compile(r'assetId:(\d+),')

# Seconds to wait for the server; without a timeout `requests` can block
# indefinitely on a stalled connection.
_REQUEST_TIMEOUT = 10


def get_image_url(regionCode, taxonCode, timeout=_REQUEST_TIMEOUT):
    """Return the first asset ID found on a catalog search results page.

    Requests the catalog page for the given region and taxon with
    ``sort=rating_rank_desc`` and ``mediaType=photo``, then searches the
    text of the page's ``<script>`` tags for ``assetId:<digits>,``.

    Note: despite the function's name, the value returned is the asset ID
    (a string of digits), not a URL.

    Args:
        regionCode: Value sent as the ``regionCode`` query parameter.
        taxonCode: Value sent as the ``taxonCode`` query parameter.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The asset ID as a string, or ``None`` when the response status is
        not 200 or no asset ID is present in the page.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    # Let `requests` build the query string so that values are URL-encoded
    # instead of being interpolated raw into the URL.
    response = requests.get(
        _CATALOG_URL,
        params={
            "regionCode": regionCode,
            "taxonCode": taxonCode,
            "sort": "rating_rank_desc",
            "mediaType": "photo",
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    match = _ASSET_ID_RE.search(str(soup.find_all('script')))
    return match.group(1) if match else None


# ---------------------------------------------------------------------------
# Exploratory scrape of one hard-coded catalog page.
# NOTE(review): these statements run at import time as well as when the file
# is executed as a script; consider moving them behind
# `if __name__ == "__main__":` once no importer relies on that.
# ---------------------------------------------------------------------------

# URL of the webpage to be scraped.
url = "https://search.macaulaylibrary.org/catalog?regionCode=CO-ANT&taxonCode=bertin1&sort=rating_rank_desc&mediaType=photo"

# Send a GET request to the URL.
response = requests.get(url, timeout=_REQUEST_TIMEOUT)

# Only parse the page when the request succeeded (status code 200).
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    script_str = soup.find_all('script')

    # Dump the script tags to disk for manual inspection.
    with open('prueba4.html', 'w', encoding='utf-8') as f:
        f.write(str(script_str))

    # Print the gallery row elements found in the static HTML.
    enlace = soup.find_all('div', {'id': 'ResultsGallery-row'})
    print(enlace)

    # Look for the first asset ID in the script text collected above.
    match = _ASSET_ID_RE.search(str(script_str))
    if match:
        asset_id = match.group(1)
        print(f"Asset ID: {asset_id}")
    else:
        print("Asset ID not found.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")