# Spaces:
# Runtime error
# Runtime error
import base64
import html
import json

import requests
import streamlit as st
from bs4 import BeautifulSoup
# Page heading followed by the usage notes, rendered one st.text call per line.
st.title("Web Scraper with Streamlit")
_INTRO_LINES = (
    "This website is capable to webscrape the html based websites but not for Dynamic based",
    "example:",
    "1. https://books.toscrape.com/",
    "2. http://quotes.toscrape.com",
)
for _intro_line in _INTRO_LINES:
    st.text(_intro_line)

# Address of the page to scrape, as typed by the user.
url = st.text_input("Enter the URL of the website you want to scrape:")
# Function to create a download link for a JSON file
def get_binary_file_downloader_html(json_data: str, title: str) -> str:
    """Return an HTML anchor that downloads ``json_data`` as a file.

    The text is embedded in the link itself as a base64 ``data:`` URI, so no
    file is written anywhere.

    Args:
        json_data: The JSON document, as text, to offer for download.
        title: File name suggested to the browser through the anchor's
            ``download`` attribute.

    Returns:
        An ``<a>`` element, as a string, labelled "Download JSON".
    """
    payload = base64.b64encode(json_data.encode("utf-8")).decode("ascii")
    # The file name lands inside a double-quoted attribute; escape it so a
    # quote or angle bracket in the name cannot break out of the attribute.
    filename = html.escape(title, quote=True)
    return (
        f'<a href="data:application/json;base64,{payload}" '
        f'download="{filename}">Download JSON</a>'
    )
# Runs when the user presses the button: fetch the page, pull out the
# matching elements and show them as JSON with a download link.
if st.button("Scrape Data"):
    if not url:
        st.error("Please enter a valid URL.")
    else:
        try:
            # A timeout keeps an unresponsive host from hanging the app.
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as exc:
            # Covers malformed URLs, connection failures and timeouts, which
            # would otherwise surface as an unhandled exception.
            st.error(f"Failed to retrieve the web page: {exc}")
        else:
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, "html.parser")
                # Extract the text of every <span class="text"> element
                # (e.g., quotes); pages without such spans give an empty list.
                quotes = soup.find_all("span", class_="text")
                extracted_data = [quote.get_text() for quote in quotes]
                # Serialize the extracted strings as a JSON array
                json_data = json.dumps(extracted_data, indent=4)
                # Show the JSON and provide a link to download it
                st.markdown("### Extracted Data (JSON):")
                st.text(json_data)
                st.markdown("### Download JSON")
                st.markdown(
                    get_binary_file_downloader_html(json_data, "extracted_data.json"),
                    unsafe_allow_html=True,
                )
            else:
                # st.error accepts a single message body, so the status code
                # has to be formatted into the string.
                st.error(
                    f"Failed to retrieve the web page. Status code: {response.status_code}"
                )