"""Streamlit app that scrapes text from a static HTML page and offers it as JSON.

The page is fetched with ``requests`` and parsed with BeautifulSoup; the text of
every ``<span class="text">`` element is collected, shown on screen, and exposed
through a download link.
"""

import base64
import html
import json

import requests
import streamlit as st
from bs4 import BeautifulSoup

# Upper bound (seconds) on how long a fetch may block, so an unresponsive
# server cannot hang the app indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

st.title("Web Scraper with Streamlit")
st.text("This website is capable to webscrape the html based websites but not for Dynamic based")
st.text("example:")
st.text("1. https://books.toscrape.com/")
st.text("2. http://quotes.toscrape.com")

# User input for the URL
url = st.text_input("Enter the URL of the website you want to scrape:")


def get_binary_file_downloader_html(json_data, title):
    """Return an HTML anchor that downloads *json_data* as a file.

    Args:
        json_data: The JSON document, as a string, to embed in the link.
        title: File name suggested to the browser via the anchor's
            ``download`` attribute.

    Returns:
        An ``<a>`` element whose ``href`` is a base64 ``data:`` URI holding
        the encoded JSON. It is meant to be rendered with
        ``st.markdown(..., unsafe_allow_html=True)``.
    """
    b64 = base64.b64encode(json_data.encode()).decode()
    # Escape the file name so quotes or angle brackets in it cannot break out
    # of the attribute value.
    filename = html.escape(title, quote=True)
    return (
        f'<a href="data:application/json;base64,{b64}" '
        f'download="{filename}">Download JSON</a>'
    )


if st.button("Scrape Data"):
    if not url:
        st.error("Please enter a valid URL.")
    else:
        try:
            # Send an HTTP GET request to the URL
            response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.exceptions.RequestException as exc:
            # Covers malformed URLs, connection failures and timeouts, so the
            # user sees a message instead of a traceback.
            st.error(f"Failed to retrieve the web page: {exc}")
        else:
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, "html.parser")

                # Extract data from the parsed HTML (e.g., quotes)
                quotes = soup.find_all("span", class_="text")

                # Pages without any matching element yield an empty JSON list.
                extracted_data = [quote.get_text() for quote in quotes]

                # Generate the JSON document
                json_data = json.dumps(extracted_data, indent=4)

                # Show the data and provide a link to download it
                st.markdown("### Extracted Data (JSON):")
                st.text(json_data)

                st.markdown("### Download JSON")
                st.markdown(
                    get_binary_file_downloader_html(json_data, "extracted_data.json"),
                    unsafe_allow_html=True,
                )
            else:
                # st.error accepts a single positional body, so the status
                # code has to be formatted into the message.
                st.error(
                    f"Failed to retrieve the web page. Status code: {response.status_code}"
                )