PodMagic / app.py
Ribot's picture
Update app.py
64b2d1d verified
raw
history blame
3.68 kB
import gradio as gr
import requests
import re
import os
import zipfile
import tempfile
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def _extract_mp3_links(soup, page_url, html):
    """Return the de-duplicated MP3 URLs found in a page, in discovery order.

    Three strategies are tried in turn; each later strategy only runs when
    the earlier ones found nothing.

    Args:
        soup: Parsed page (BeautifulSoup document).
        page_url: URL the page was fetched from; used to resolve relative links.
        html: Raw page text, scanned by the last-resort strategy.

    Returns:
        A list of absolute MP3 URLs with query strings removed.
    """
    links = []

    # 1) JSON-LD metadata: "contentUrl" values inside ld+json <script> tags.
    for script in soup.find_all('script', type='application/ld+json'):
        if not script.string:
            continue
        for match in re.findall(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script.string):
            # JSON is allowed to escape "/" as "\/"; undo that so the value
            # is a usable URL before resolving it.
            raw = match.replace('\\/', '/')
            links.append(urljoin(page_url, raw.split('?')[0]))

    # 2) Fallback: URLs carried by data-url attributes.
    if not links:
        for tag in soup.find_all(attrs={"data-url": re.compile(r"\.mp3")}):
            links.append(urljoin(page_url, tag['data-url'].split('?')[0]))

    # 3) Last resort: any absolute .mp3 URL in the page text, restricted to
    #    those containing "podcast".
    if not links:
        for match in re.findall(r'(https?://[^\s"\']+?\.mp3)', html):
            clean_url = urljoin(page_url, match.split('?')[0])
            if 'podcast' in clean_url:
                links.append(clean_url)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(links))


def process_url(url, timeout=30):
    """Download every MP3 linked from a page and bundle them into a ZIP.

    Args:
        url: Address of the page to scan for MP3 links.
        timeout: Seconds passed as the ``timeout`` of every HTTP request, so
            that an unresponsive server cannot block the call forever.

    Returns:
        A ``(zip_path, error)`` tuple. On success ``zip_path`` is the path of
        ``podcast.zip`` inside a fresh temporary directory and ``error`` is
        ``None``. On failure ``zip_path`` is ``None`` and ``error`` is a
        user-facing message.

    Raises:
        Any exception raised while writing the ZIP archive is propagated
        after the temporary directory has been removed.
    """
    import shutil  # local import: only needed for temporary-directory cleanup

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Referer': 'https://www.radiofrance.fr/'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        # Boundary handler: every failure to fetch the page is reported to
        # the caller as a message instead of an exception.
        return None, f"Erreur de connexion : {str(e)}"

    soup = BeautifulSoup(response.text, 'html.parser')
    mp3_links = _extract_mp3_links(soup, url, response.text)
    if not mp3_links:
        return None, "Aucun épisode trouvé - Structure de page inconnue"

    # Download the episodes, then build the ZIP next to them.
    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, 1):
        filename = f"{idx:02d}_{os.path.basename(mp3_url)}"
        filepath = os.path.join(temp_dir, filename)
        try:
            with requests.get(mp3_url, headers=headers, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(filepath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except Exception:
            # Deliberately best-effort: one failed episode must not abort the
            # others. Remove whatever was partially written so a truncated
            # file is never left behind.
            try:
                os.remove(filepath)
            except OSError:
                pass
            continue
        filenames.append(filepath)

    if not filenames:
        # Nothing usable was produced, so do not leak the temporary directory.
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, "Échec du téléchargement"

    zip_path = os.path.join(temp_dir, 'podcast.zip')
    try:
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for file in filenames:
                zipf.write(file, arcname=os.path.basename(file))
    except Exception:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    # The episodes now live inside the archive; drop the loose copies so the
    # temporary directory only holds the ZIP that is returned.
    for file in filenames:
        try:
            os.remove(file)
        except OSError:
            pass

    return zip_path, None
def download_podcast(url):
    """Gradio callback: return the ZIP path for ``url`` or raise a UI error.

    Delegates to ``process_url`` and converts its error message, when one is
    returned, into a ``gr.Error``.
    """
    archive, failure = process_url(url)
    if not failure:
        return archive
    raise gr.Error(failure)
# Gradio interface: a URL field and a download button, with the resulting ZIP
# exposed as a downloadable file.
with gr.Blocks() as app:
    gr.Markdown("## 🎧 Téléchargeur Radio France")

    # NOTE(review): the original nesting was lost; the text box and button are
    # assumed to share this row — confirm against the deployed layout.
    with gr.Row():
        url_input = gr.Textbox(
            max_lines=1,
            label="URL du podcast",
            placeholder="Ex: https://www.radiofrance.fr/...",
        )
        btn = gr.Button("Télécharger les épisodes", variant="primary")

    output = gr.File(label="Fichier ZIP")

    # Clickable sample that fills the URL field.
    examples = gr.Examples(
        inputs=[url_input],
        examples=[[
            "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
        ]],
    )

    # Wire the button to the download callback.
    btn.click(
        fn=download_podcast,
        api_name="download",
        inputs=url_input,
        outputs=output,
    )

app.launch()