# Web_to_json / app.py
# gouravgujariya's picture
# Update app.py
# a7cdae2
import base64
import html
import json

import requests
import streamlit as st
from bs4 import BeautifulSoup
# Page header followed by the usage notes, rendered one line at a time in
# the order listed.
st.title("Web Scraper with Streamlit")
for intro_line in (
    "This website is capable to webscrape the html based websites but not for Dynamic based",
    "example:",
    "1. https://books.toscrape.com/",
    "2. http://quotes.toscrape.com",
):
    st.text(intro_line)

# Address of the page to scrape, as typed by the user (empty until filled in).
url = st.text_input("Enter the URL of the website you want to scrape:")
def get_binary_file_downloader_html(json_data: str, title: str) -> str:
    """Build an HTML anchor that downloads ``json_data`` as a file.

    The JSON text is embedded in the link itself as a base64 ``data:`` URL,
    so nothing is written to disk.

    Args:
        json_data: The JSON document, as text.
        title: File name suggested to the browser through the anchor's
            ``download`` attribute.

    Returns:
        The ``<a>`` element as a string, labelled "Download JSON".
    """
    payload = base64.b64encode(json_data.encode("utf-8")).decode("ascii")
    # Escape the file name so quotes or angle brackets in it cannot break out
    # of the attribute when the markup is rendered as raw HTML.
    safe_title = html.escape(title, quote=True)
    # "application/json" is the registered media type for JSON; the previous
    # "file/json" is not a media type at all.
    return (
        f'<a href="data:application/json;base64,{payload}" '
        f'download="{safe_title}">Download JSON</a>'
    )
# Runs once per click on the button: fetch the page, pull out the matching
# elements, then show them as JSON together with a download link.
if st.button("Scrape Data"):
    if not url:
        st.error("Please enter a valid URL.")
    else:
        try:
            # Bound the wait so an unresponsive host cannot hang the app.
            response = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # Malformed URLs, DNS/connection failures and timeouts all land
            # here; report them instead of letting the script crash.
            st.error(f"Failed to retrieve the web page: {exc}")
        else:
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, "html.parser")

                # Only <span class="text"> elements are extracted (e.g. quotes)
                quotes = soup.find_all("span", class_="text")
                extracted_data = [quote.get_text() for quote in quotes]

                # ensure_ascii=False keeps non-ASCII characters readable
                # instead of turning them into \uXXXX escapes.
                json_data = json.dumps(extracted_data, indent=4, ensure_ascii=False)

                st.markdown("### Extracted Data (JSON):")
                st.text(json_data)

                # Provide a link to download the JSON file
                st.markdown("### Download JSON")
                st.markdown(
                    get_binary_file_downloader_html(json_data, "extracted_data.json"),
                    unsafe_allow_html=True,
                )
            else:
                # st.error takes a single message body, so the status code
                # has to be formatted into the string.
                st.error(
                    f"Failed to retrieve the web page. Status code: {response.status_code}"
                )