# Streamlit web interface for the Moroccan News Aggregator
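# Launch locally with: streamlit run <path-to-this-script>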
import streamlit as st
import pandas as pd
import json
import importlib
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
import google_drive_handle as gdrive
from dotenv import load_dotenv
import os

# Load environment variables (e.g. Google Drive credentials) from a local .env file
load_dotenv()
# Load config.json
with open('config.json') as f:
    config = json.load(f)
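# Expected config.json shape, inferred from the lookups further down
# (site, language, and category names here are illustrative):
# {
#     "hespress": {
#         "languages": {
#             "en": {"politics": "<category URL>", "sports": "<category URL>"},
#             "fr": {...}
#         }
#     }
# }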
# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--log-level=3')  # Suppress most Chrome log output
# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)
drive = gdrive.authenticate_google_drive()
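# 'drive' is the authenticated Google Drive client returned by the
# google_drive_handle helper; it is reused for every upload and link lookup below.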
processed_files = set()
st.markdown("
Moroccan News Aggregator
", unsafe_allow_html=True)
# User selections collected from the widgets below
selected_websites = {}
selected_categories = {}
selected_languages = {}
def save_file_id_mapping(file_id_mapping):
    with open("file_id_mapping.json", "w") as file:
        json.dump(file_id_mapping, file)

def load_file_id_mapping():
    try:
        with open("file_id_mapping.json", "r") as file:
            return json.load(file)
    except FileNotFoundError:
        return {}  # Return an empty dictionary if the file doesn't exist
file_id_mapping = load_file_id_mapping()
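# file_id_mapping persists a {local file path: Drive file ID} map in
# file_id_mapping.json, so files scraped in earlier runs are not re-uploaded.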
for website, details in config.items():
    if st.checkbox(website, key=website):
        # Language selection
        languages = details.get("languages", {})
        if not languages:
            st.warning(f"No languages configured for {website}; skipping.")
            continue
        if len(languages) > 1:
            language = st.selectbox(f'Choose language for {website}', list(languages.keys()), key=f'lang_{website}')
            selected_websites[website] = f"{website}_{language}"  # module name like: hespress_en
        else:
            language = next(iter(languages))
            selected_websites[website] = website  # module name like: akhbarona
        selected_languages[website] = language

        # Category selection
        category_options = languages.get(language, {})
        if category_options:
            chosen_categories = st.multiselect(f'Select categories for {website}', list(category_options.keys()), key=f'{website}_categories')
            selected_categories[website] = chosen_categories
# Number of articles input
num_articles = st.number_input('Number of Articles', min_value=1, max_value=10000, step=1)
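# Each selected website maps to a scraper module importable by the name stored
# in selected_websites (e.g. hespress_en). The module is expected to expose
# scrape_category(category_url, num_articles) and return the path of the CSV
# it wrote; that contract is assumed from the calls in the loop below.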
# Start scraping button
if st.button('Start Scraping'):
    with st.spinner('Scraping in progress...'):
        progress_bar = st.progress(0)
        total_tasks = sum(len(categories) for categories in selected_categories.values())
        completed_tasks = 0
        for website, module_name in selected_websites.items():
            scraper_module = importlib.import_module(module_name)
            for category in selected_categories.get(website, []):
                try:
                    language = selected_languages[website]
                    category_url = config[website]['languages'][language][category]
                except KeyError:
                    st.error(f"KeyError: {category} not found for {language} language in {website}.")
                    continue  # category_url is undefined, so skip this category
                if 'category_name' in config[website]:
                    category_name = config[website]['category_name'].get(category, 'default_category_name')
                file_path = scraper_module.scrape_category(category_url, num_articles)
                if file_path:
                    if file_path not in file_id_mapping:
                        file_id = gdrive.upload_file_to_drive(drive, file_path)
                        print(f"Uploading file: {file_path}, File ID: {file_id}")
                        file_id_mapping[file_path] = file_id
                        save_file_id_mapping(file_id_mapping)
                    else:
                        file_id = file_id_mapping[file_path]
                        print(f"File already uploaded. Using existing File ID: {file_id}")
                    if file_id:
                        download_link = gdrive.get_drive_download_link(drive, file_id)
                        if download_link:
                            st.markdown(f"[Download {website} - {category} data]({download_link})", unsafe_allow_html=True)
                            df = pd.read_csv(file_path)
                            st.write(f"{website} - {category} Data:")
                            st.dataframe(df)
                        else:
                            st.error(f"Failed to retrieve download link for file ID: {file_id}")
                    else:
                        st.error(f"Failed to upload file for {website} - {category}")
                else:
                    st.error(f"File not created for {website} - {category}")
                # Advance the progress bar after each category, whether it succeeded or failed
                completed_tasks += 1
                progress_bar.progress(completed_tasks / max(total_tasks, 1))
        st.success('Scraping Completed!')