# Web interface
import streamlit as st
import pandas as pd
import json
import importlib
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
import google_drive_handle as gdrive
from dotenv import load_dotenv
import os
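
# Read environment variables from a local .env file (presumably the
# credentials consumed by google_drive_handle during authentication)
load_dotenv()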
# Load config.json
with open('config.json') as f:
    config = json.load(f)
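
# Illustrative shape of config.json, inferred from how it is indexed below;
# site names, language codes, and URLs here are placeholders:
# {
#     "hespress": {
#         "languages": {
#             "en": {"politics": "https://en.hespress.com/politics"}
#         }
#     }
# }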
# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--log-level=3')  # reduce Chrome log verbosity
# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)
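# Note: wd is not used directly in this file; the scraper modules imported
# below presumably share or re-create the driver.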
drive = gdrive.authenticate_google_drive()
processed_files = set()
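# Tracks files handled during this session (currently not referenced below)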
st.markdown(
    """
    <style>
    .centered {
        display: flex;
        align-items: center;
        justify-content: center;
        text-align: center;
    }
    </style>
    """,
    unsafe_allow_html=True
)
st.markdown("<h1 class='centered'>Moroccan News Aggregator</h1>", unsafe_allow_html=True)
selected_websites = {}
selected_categories = {}
# Persist the local-path -> Drive file ID mapping so re-runs reuse uploads
def save_file_id_mapping(file_id_mapping):
    with open("file_id_mapping.json", "w") as file:
        json.dump(file_id_mapping, file)

def load_file_id_mapping():
    try:
        with open("file_id_mapping.json", "r") as file:
            return json.load(file)
    except FileNotFoundError:
        return {}  # Return an empty dictionary if the file doesn't exist
file_id_mapping = load_file_id_mapping()

selected_languages = {}
for website, details in config.items():
    if st.checkbox(website, key=website):
        # Language selection
        languages = details.get("languages", {})
        if not languages:
            st.error(f"No languages configured for {website} in config.json.")
            continue
        if len(languages) > 1:
            language = st.selectbox(f'Choose language for {website}', list(languages.keys()), key=f'lang_{website}')
            selected_websites[website] = f"{website}_{language}"  # scraper module name, like: hespress_en
        else:
            language = next(iter(languages))
            selected_websites[website] = website  # scraper module name, like: akhbarona
        selected_languages[website] = language

        # Category selection
        categories = languages.get(language, {})
        if categories:
            categories = st.multiselect(f'Select categories for {website}', list(categories.keys()), key=f'{website}_categories')
            selected_categories[website] = categories
# Number of articles input
num_articles = st.number_input('Number of Articles', min_value=1, max_value=10000, step=1)
# Start scraping button
if st.button('Start Scraping'):
    with st.spinner('Scraping in progress...'):
        progress_bar = st.progress(0)
        total_tasks = sum(len(categories) for categories in selected_categories.values())
        completed_tasks = 0
        for website, module_name in selected_websites.items():
            # Each entry maps to a scraper module importable by name (e.g. hespress_en)
            scraper_module = importlib.import_module(module_name)
            for category in selected_categories.get(website, []):
                language = selected_languages[website]
                try:
                    category_url = config[website]['languages'][language][category]
                except KeyError:
                    st.error(f"KeyError: {category} not found for {language} language in {website}.")
                    continue  # category_url is undefined, so skip this category
                if 'category_name' in config[website]:
                    category_name = config[website]['category_name'].get(category, 'default_category_name')
                file_path = scraper_module.scrape_category(category_url, num_articles)
                if file_path:
                    if file_path not in file_id_mapping:
                        file_id = gdrive.upload_file_to_drive(drive, file_path)
                        print(f"Uploading file: {file_path}, File ID: {file_id}")
                        file_id_mapping[file_path] = file_id
                        save_file_id_mapping(file_id_mapping)
                    else:
                        file_id = file_id_mapping[file_path]
                        print(f"File already uploaded. Using existing File ID: {file_id}")
                    if file_id:
                        download_link = gdrive.get_drive_download_link(drive, file_id)
                        if download_link:
                            st.markdown(f"[Download {website} - {category} data]({download_link})", unsafe_allow_html=True)
                            df = pd.read_csv(file_path)
                            st.write(f"{website} - {category} Data:")
                            st.dataframe(df)
                        else:
                            st.error(f"Failed to retrieve download link for file ID: {file_id}")
                    else:
                        st.error(f"Failed to upload file for {website} - {category}")
                else:
                    st.error(f"File not created for {website} - {category}")
                # Advance the progress bar once per processed category
                completed_tasks += 1
                progress_bar.progress(completed_tasks / total_tasks if total_tasks else 1.0)
        st.success('Scraping Completed!')
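
# Run the app locally (script name is illustrative):
#   streamlit run app.py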