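"""Téléchargeur Radio France.

A small Gradio app that scrapes a Radio France podcast page for .mp3 episode
links (JSON-LD metadata first, then data-url attributes, then a raw regex
sweep over the HTML), downloads each episode, and returns them as one ZIP.
"""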
import gradio as gr
import requests
import re
import os
import zipfile
import tempfile
from urllib.parse import urljoin
from bs4 import BeautifulSoup
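
# Third-party dependencies: gradio, requests and beautifulsoup4
# (e.g. `pip install gradio requests beautifulsoup4`).
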
def process_url(url):
    try:
        # Present a browser-like User-Agent and Referer so the page is served
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://www.radiofrance.fr/'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except Exception as e:
        return None, f"Erreur de connexion : {str(e)}"

    soup = BeautifulSoup(response.text, 'html.parser')
    mp3_links = []

    # New detection method: read episode URLs from the JSON-LD metadata blocks
    scripts = soup.find_all('script', type='application/ld+json')
    for script in scripts:
        if script.string:
            matches = re.findall(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script.string)
            for match in matches:
                full_url = urljoin(url, match.split('?')[0])
                if full_url not in mp3_links:
                    mp3_links.append(full_url)
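
    # Illustrative sketch of the JSON-LD shape the regex above matches
    # (assumed from schema.org AudioObject markup; the exact structure of
    # radiofrance.fr pages may differ):
    #   {"@type": "AudioObject",
    #    "contentUrl": "https://.../episode.mp3?..."}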

    # Fallback: URLs carried in data-url attributes
    if not mp3_links:
        for tag in soup.find_all(attrs={"data-url": re.compile(r"\.mp3")}):
            mp3_url = urljoin(url, tag['data-url'].split('?')[0])
            mp3_links.append(mp3_url)

    # Last resort: a global regex sweep over the raw HTML
    if not mp3_links:
        matches = re.findall(r'(https?://[^\s"\']+?\.mp3)', response.text)
        for match in matches:
            clean_url = urljoin(url, match.split('?')[0])
            if clean_url not in mp3_links and 'podcast' in clean_url:
                mp3_links.append(clean_url)

    # Final filtering: drop duplicates while preserving order
    mp3_links = list(dict.fromkeys(mp3_links))
    if not mp3_links:
        return None, "Aucun épisode trouvé - Structure de page inconnue"

    # Download each episode and build the ZIP in a temporary directory
    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, 1):
        try:
            filename = f"{idx:02d}_{os.path.basename(mp3_url)}"
            filepath = os.path.join(temp_dir, filename)
            with requests.get(mp3_url, headers=headers, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(filepath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            filenames.append(filepath)
        except Exception:
            # Skip episodes that fail to download rather than aborting the batch
            continue

    if not filenames:
        return None, "Échec du téléchargement"

    zip_path = os.path.join(temp_dir, 'podcast.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))
    return zip_path, None


def download_podcast(url):
    zip_path, error = process_url(url)
    if error:
        # Gradio surfaces gr.Error to the user as an in-app error message
        raise gr.Error(error)
    return zip_path


with gr.Blocks() as app:
    gr.Markdown("## 🎧 Téléchargeur Radio France")
    with gr.Row():
        url_input = gr.Textbox(
            label="URL du podcast",
            placeholder="Ex: https://www.radiofrance.fr/...",
            max_lines=1
        )
        btn = gr.Button("Télécharger les épisodes", variant="primary")
    output = gr.File(label="Fichier ZIP")
    examples = gr.Examples(
        examples=[[
            "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
        ]],
        inputs=[url_input]
    )
    btn.click(
        fn=download_podcast,
        inputs=url_input,
        outputs=output,
        api_name="download"
    )

if __name__ == "__main__":
    app.launch()
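
# Programmatic use (a minimal sketch, assuming the app runs locally on
# Gradio's default port 7860 and that `gradio_client` is installed):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   zip_path = client.predict("https://www.radiofrance.fr/...", api_name="/download")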