# NOTE: page-header residue from the hosting site ("Spaces: Sleeping"); not part of the program.
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import folium | |
from folium.plugins import MarkerCluster | |
import plotly.graph_objects as go | |
import plotly.express as px # Add this import | |
from geopy.geocoders import Nominatim | |
import re | |
import streamlit as st | |
# Streamlit title and description shown at the top of the page.
st.title("米其林餐廳指南爬蟲")
st.write("Extract restaurant data, visualize with a pie chart and bar chart, and display locations on a map.")

# Read the list of restaurant pages from a Google Sheet via its CSV export URL.
sheet_id = "1xUfnD1WCF5ldqECI8YXIko1gCpaDDCwTztL17kjI42U"
df1 = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv")

# Convert the "網址" (URL) column to a Python list.
# Blank cells are read by pandas as NaN; drop them (and whitespace-only
# entries) so that only real URL strings are handed to the scraper.
urls = [
    url.strip()
    for url in df1["網址"].dropna().astype(str)
    if url.strip()
]

# DataFrame that accumulates one row per scraped restaurant.
df = pd.DataFrame(columns=["Store Name", "Address", "Phone", "Latitude", "Longitude", "Region"])

# Nominatim geocoder used to turn scraped addresses into coordinates.
geolocator = Nominatim(user_agent="my_app")
# Function to extract region (區域) from the address using regex
def extract_region(address):
    """Return the leading region part of *address*, or ``"Unknown"``.

    The result is the shortest prefix of the address that ends in ``區``
    (district).  When the address contains no ``區``, the shortest prefix
    ending in ``縣`` (county) or ``市`` (city) is returned instead.

    Args:
        address: The address text to inspect.  Anything that is not a
            string (for example ``None``) is treated as having no region.

    Returns:
        The matched prefix including its terminating character, or the
        string ``"Unknown"`` when no region marker is present.
    """
    if not isinstance(address, str):
        return "Unknown"

    # Each alternative carries its own lazy prefix.  The previous pattern
    # `(.*?)區|縣|市` only attached the prefix to 區, so an address without
    # 區 produced a bare "縣" or "市" instead of the region name.
    match = re.search(r'.*?區|.*?[縣市]', address)
    return match.group(0) if match else "Unknown"
# Function to fetch and parse data
def _find_text(soup, tag, css_class):
    """Return the stripped text of the first matching element, or None."""
    element = soup.find(tag, class_=css_class)
    if element is None:
        return None
    return element.text.strip()


def _find_phone(soup):
    """Return the phone number from the page's ``tel:`` link, or None."""
    link = soup.find("a", {"data-event": "CTA_tel"})
    href = link.get("href") if link is not None else None
    if href is None:
        return None
    return href.replace("tel:", "")


def _geocode(address):
    """Return ``(latitude, longitude)`` for *address*, or ``(None, None)``."""
    if not address:
        # Nothing was scraped, so there is nothing to look up.
        return None, None
    try:
        location = geolocator.geocode(address)
    except Exception:
        # Geocoding is best-effort: any lookup failure just leaves the
        # coordinates empty.  `Exception` (rather than a bare `except`)
        # keeps KeyboardInterrupt and other BaseException subclasses
        # propagating.
        return None, None
    if not location:
        return None, None
    return location.latitude, location.longitude


def _parse_restaurant(html):
    """Build one result row (a dict keyed by column name) from page HTML."""
    soup = BeautifulSoup(html, "html.parser")
    address = _find_text(soup, "li", "restaurant-details__heading--address")
    latitude, longitude = _geocode(address)
    return {
        "Store Name": _find_text(soup, "h2", "restaurant-details__heading--title"),
        "Address": address,
        "Phone": _find_phone(soup),
        "Latitude": latitude,
        "Longitude": longitude,
        "Region": extract_region(address) if address is not None else "Unknown",
    }


def fetch_data():
    """Scrape every URL in ``urls`` and append the results to the global ``df``.

    For each URL the page is downloaded and parsed for store name, address
    and phone number; the address is geocoded and its region extracted.
    Fields that cannot be found are stored as ``None`` (region as
    ``"Unknown"``).  A URL whose download fails is skipped with a warning
    instead of aborting the whole run.  A Streamlit progress bar is updated
    after every URL.

    Returns:
        None.  The module-level ``df`` is rebound to the extended frame.
    """
    global df

    # Progress bar in Streamlit
    progress_bar = st.progress(0)
    total_urls = len(urls)

    # Rows are collected first and concatenated once at the end, which
    # avoids re-copying the whole DataFrame on every iteration.
    rows = []
    for idx, url in enumerate(urls):
        try:
            # A timeout keeps one unresponsive server from hanging the app.
            response = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            st.warning(f"Skipping {url}: {exc}")
        else:
            rows.append(_parse_restaurant(response.content))

        # Update progress bar
        progress_bar.progress((idx + 1) / total_urls)

    if rows:
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
# Button to trigger data fetching
if st.button("爬取餐廳資料"):
    fetch_data()
    # Streamlit reruns the whole script on every widget interaction (the
    # download button below included) and `df` starts out empty on each
    # run, so the scraped result is kept in session state.
    st.session_state["restaurants_df"] = df
    if df.empty:
        st.warning("No restaurant data could be fetched.")
elif "restaurants_df" in st.session_state:
    # Restore the result of an earlier scrape after a rerun.
    df = st.session_state["restaurants_df"]

# Everything below needs data; nothing is rendered before the first scrape.
if not df.empty:
    # Save the DataFrame to CSV with UTF-8 encoding, including latitude and longitude
    csv_file = "restaurants_data.csv"
    df.to_csv(csv_file, encoding="utf-8-sig", index=False)

    # Display the DataFrame as a table at the top
    st.subheader("Restaurant Data")
    st.dataframe(df)

    # Display download button for the CSV.  The file is read inside a
    # `with` block so the handle is closed deterministically.
    with open(csv_file, "rb") as csv_handle:
        csv_bytes = csv_handle.read()
    st.download_button(
        label="Download restaurant data as CSV",
        data=csv_bytes,
        file_name=csv_file,
        mime="text/csv",
    )

    # Group the data by region and count the restaurants in each one
    region_group = df.groupby("Region").size().reset_index(name='Count')

    # Plot enlarged pie chart with custom colors and labels
    pie_chart = go.Figure(go.Pie(
        labels=region_group["Region"],
        values=region_group["Count"],
        textinfo="label+percent",
        hoverinfo="label+value",
        textfont=dict(size=18),
        marker=dict(colors=px.colors.qualitative.Set3, line=dict(color='#000000', width=2)),
    ))
    pie_chart.update_layout(
        title="Restaurant Distribution by Region",
        title_x=0.5,
        title_font=dict(size=24, family="Arial"),
        height=600,
        margin=dict(t=50, b=50, l=50, r=50),
    )
    st.subheader("Restaurant Distribution by Region (Enlarged Pie Chart)")
    st.plotly_chart(pie_chart)

    # Plot bar chart with custom colors and labels
    bar_chart = go.Figure(go.Bar(
        x=region_group["Region"],
        y=region_group["Count"],
        text=region_group["Count"],
        textposition='auto',
        marker=dict(color=px.colors.qualitative.Set2),
    ))
    bar_chart.update_layout(
        title="Restaurant Count by Region",
        title_x=0.5,
        title_font=dict(size=24, family="Arial"),
        height=400,
        margin=dict(t=50, b=50, l=50, r=50),
        xaxis_title="Region",
        yaxis_title="Number of Restaurants",
        xaxis=dict(tickangle=-45),
    )
    st.subheader("Restaurant Count by Region (Bar Chart)")
    st.plotly_chart(bar_chart)

    # Display a map using Folium
    st.subheader("Restaurant Locations Map")

    # Create map centered around Tainan
    m = folium.Map(location=[23.0, 120.2], zoom_start=12)

    # Add marker cluster to the map
    marker_cluster = MarkerCluster().add_to(m)

    # Only rows that were geocoded successfully get a marker.
    for _, row in df.iterrows():
        if pd.notnull(row["Latitude"]) and pd.notnull(row["Longitude"]):
            folium.Marker(
                location=[row["Latitude"], row["Longitude"]],
                popup=f"{row['Store Name']} ({row['Phone']})",
                tooltip=row["Address"],
            ).add_to(marker_cluster)

    # Display the map in Streamlit
    st.components.v1.html(m._repr_html_(), height=600)