Spaces:
Runtime error
Runtime error
Commit
·
271db44
1
Parent(s):
647110d
Update app.py
Browse files
app.py
CHANGED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
import json
|
5 |
+
|
6 |
+
# Streamlit page: ask for a URL, scrape every <span class="text"> element
# from it, show the extracted strings as JSON and offer them for download.
st.title("Web Scraper with Streamlit")

# User input for the URL
url = st.text_input("Enter the URL of the website you want to scrape:")

if st.button("Scrape Data"):
    # Treat whitespace-only input the same as an empty field.
    target_url = url.strip()

    if not target_url:
        st.error("Please enter a valid URL.")
    else:
        # Send an HTTP GET request to the URL. The timeout keeps the app from
        # hanging on an unresponsive host, and RequestException covers
        # malformed URLs, DNS failures, refused connections and timeouts so
        # they are reported in the page instead of crashing the script.
        try:
            response = requests.get(target_url, timeout=10)
        except requests.RequestException as exc:
            st.error(f"Failed to retrieve the web page: {exc}")
        else:
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, "html.parser")

                # Extract data from the parsed HTML (e.g., quotes)
                quotes = soup.find_all("span", class_="text")

                # Create a list to store the extracted quotes
                extracted_data = [quote.get_text() for quote in quotes]

                # Serialize the extracted strings as pretty-printed JSON
                json_data = json.dumps(extracted_data, indent=4)

                st.markdown("### Extracted Data (JSON):")
                st.text(json_data)

                # Offer the JSON as a file download. The built-in download
                # widget is used here because this code runs at script
                # execution time, before any helper defined further down the
                # file exists.
                st.markdown("### Download JSON")
                st.download_button(
                    label="Download JSON",
                    data=json_data,
                    file_name="extracted_data.json",
                    mime="application/json",
                )
            else:
                # st.error takes a single message body, so the status code is
                # formatted into the string rather than passed separately.
                st.error(
                    "Failed to retrieve the web page. "
                    f"Status code: {response.status_code}"
                )
|
38 |
+
|
39 |
+
# Function to create a download link for a JSON file
|
40 |
+
def get_binary_file_downloader_html(json_data: str, title: str) -> str:
    """Return an HTML anchor that downloads ``json_data`` as a file.

    The text is encoded to bytes, base64-encoded, and embedded in a
    ``data:`` URI so the link works without any server-side file.

    Args:
        json_data: The text to embed in the link (the JSON document).
        title: File name placed in the anchor's ``download`` attribute.

    Returns:
        An ``<a>`` element as a string, with the link text "Download JSON".
    """
    # Imported here because the file's visible import block does not bring
    # base64 into scope; without it the call below raises NameError.
    import base64

    payload = json_data.encode()
    b64 = base64.b64encode(payload).decode()
    return f'<a href="data:file/json;base64,{b64}" download="{title}">Download JSON</a>'
|