 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import requests
from bs4 import BeautifulSoup
import re


def get_image_url(regionCode, taxonCode):
    """Return the top-rated photo asset ID for a taxon in a region, or None.

    Scrapes the Macaulay Library catalog search page and pulls the first
    ``assetId`` embedded in the page's inline <script> blobs.  Results are
    requested sorted by rating, so the first match is the best-rated photo.

    Parameters
    ----------
    regionCode : str
        eBird region code, e.g. ``"CO-ANT"``.
    taxonCode : str
        eBird taxon code, e.g. ``"bertin1"``.

    Returns
    -------
    str | None
        The asset ID digits as a string, or None when the request fails
        (non-200 status, network error) or no asset ID is found.
    """
    url = "https://search.macaulaylibrary.org/catalog"
    # Let requests build/encode the query string instead of an f-string,
    # so codes containing reserved characters are escaped correctly.
    params = {
        "regionCode": regionCode,
        "taxonCode": taxonCode,
        "sort": "rating_rank_desc",
        "mediaType": "photo",
    }
    try:
        # Timeout so a stalled connection cannot hang the caller forever.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException:
        # Follow the function's own contract: failures yield None.
        return None
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.content, 'html.parser')
    # The asset IDs live inside inline <script> content, not in the DOM,
    # so search the stringified script tags with a regex.
    scripts = str(soup.find_all('script'))
    match = re.search(r'assetId:(\d+),', scripts)
    return match.group(1) if match else None


# Set the URL of the webpage to be scraped (top-rated photos for taxon
# "bertin1" in region CO-ANT).

url = "https://search.macaulaylibrary.org/catalog?regionCode=CO-ANT&taxonCode=bertin1&sort=rating_rank_desc&mediaType=photo"

# Send a GET request to the URL.  Timeout so the script cannot hang forever
# on a stalled connection.
response = requests.get(url, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # The asset IDs are embedded in inline <script> blobs; stringify them
    # once and reuse the result (previously find_all('script') was run and
    # stringified twice).
    script_str = str(soup.find_all('script'))

    # Dump the scripts to disk for offline inspection/debugging.
    with open('prueba4.html', 'w', encoding='utf-8') as f:
        f.write(script_str)

    # Gallery rows from the server-rendered page (may be empty when the
    # gallery is populated client-side).
    enlace = soup.find_all('div', {'id': 'ResultsGallery-row'})

    # Imprimir el enlace
    print(enlace)

    # First assetId in the page is the top-rated photo (results are sorted
    # by rating in the request URL).
    match = re.search(r'assetId:(\d+),', script_str)

    if match:
        asset_id = match.group(1)
        print(f"Asset ID: {asset_id}")
    else:
        print("Asset ID not found.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")