File size: 1,823 Bytes
271db44
 
 
6ffad4f
271db44
 
 
a7cdae2
 
 
 
271db44
 
 
 
5a381fc
 
 
 
 
 
 
 
271db44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import base64
import html
import json

import requests
import streamlit as st
from bs4 import BeautifulSoup

# Page heading followed by a short usage note and two example sites.
st.title("Web Scraper with Streamlit")
for intro_line in (
    "This website is capable to webscrape the html based websites but not for Dynamic based",
    "example:",
    "1. https://books.toscrape.com/",
    "2. http://quotes.toscrape.com",
):
    st.text(intro_line)

# Address of the page to scrape, as typed by the user.
url = st.text_input("Enter the URL of the website you want to scrape:")


def get_binary_file_downloader_html(json_data, title):
    """Build an HTML anchor that lets the browser download *json_data*.

    The JSON text is embedded directly in the link as a base64 ``data:`` URI.

    Args:
        json_data: The JSON document as a string.
        title: File name placed in the anchor's ``download`` attribute.

    Returns:
        The ``<a>`` element as a string.
    """
    payload = base64.b64encode(json_data.encode("utf-8")).decode("ascii")
    # Escape the file name so quotes or angle brackets cannot break out of
    # the attribute when the markup is rendered as raw HTML.
    safe_title = html.escape(title, quote=True)
    # ``application/json`` is the registered media type for JSON.
    return (
        f'<a href="data:application/json;base64,{payload}" '
        f'download="{safe_title}">Download JSON</a>'
    )

# Runs when the user presses the button: fetch the page, pull out every
# <span class="text"> element, and offer the result as JSON.
if st.button("Scrape Data"):
    # Ignore stray whitespace pasted around the URL.
    target_url = url.strip()
    if not target_url:
        st.error("Please enter a valid URL.")
    else:
        try:
            # A timeout keeps the app from hanging on an unresponsive host.
            response = requests.get(target_url, timeout=10)
        except requests.RequestException as exc:
            # Malformed URLs (e.g. missing scheme), connection failures and
            # timeouts all derive from RequestException; report them to the
            # user instead of crashing the app with a traceback.
            st.error(f"Failed to retrieve the web page: {exc}")
        else:
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, "html.parser")

                # Extract data from the parsed HTML (e.g., quotes)
                quotes = soup.find_all("span", class_="text")

                # Collect the text of each matched element
                extracted_data = [quote.get_text() for quote in quotes]

                # Serialize the extracted text as a JSON array
                json_data = json.dumps(extracted_data, indent=4)

                # Show the JSON and provide a link to download it
                st.markdown("### Extracted Data (JSON):")
                st.text(json_data)
                st.markdown("### Download JSON")
                st.markdown(
                    get_binary_file_downloader_html(json_data, "extracted_data.json"),
                    unsafe_allow_html=True,
                )
            else:
                # st.error takes a single message body, so the status code
                # must be formatted into the string.
                st.error(
                    f"Failed to retrieve the web page. Status code: {response.status_code}"
                )