Ribot committed
Commit 13a82a1 · verified · Parent: 64b2d1d

Update app.py

Files changed (1)
  1. app.py +34 -27
app.py CHANGED
@@ -7,7 +7,7 @@ import tempfile
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup
 
-def process_url(url):
+def process_url(url, num_episodes):
     try:
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
@@ -16,12 +16,12 @@ def process_url(url):
         response = requests.get(url, headers=headers)
         response.raise_for_status()
     except Exception as e:
-        return None, f"Erreur de connexion : {str(e)}"
+        return None, f"Erreur : {str(e)}"
 
     soup = BeautifulSoup(response.text, 'html.parser')
     mp3_links = []
 
-    # Nouvelle méthode de détection
+    # Extraction des liens MP3
     scripts = soup.find_all('script', type='application/ld+json')
     for script in scripts:
         if script.string:
@@ -31,27 +31,23 @@ def process_url(url):
                 if full_url not in mp3_links:
                     mp3_links.append(full_url)
 
-    # Fallback pour les URLs dans les attributs data
+    # Fallback si nécessaire
    if not mp3_links:
-        for tag in soup.find_all(attrs={"data-url": re.compile(r"\.mp3")}):
-            mp3_url = urljoin(url, tag['data-url'].split('?')[0])
-            mp3_links.append(mp3_url)
+        matches = re.findall(r'(https?://media\.radiofrance-podcast\.net[^\s"\']+?\.mp3)', response.text)
+        mp3_links = list(dict.fromkeys(matches))
 
-    # Dernier recours : recherche globale
-    if not mp3_links:
-        matches = re.findall(r'(https?://[^\s"\']+?\.mp3)', response.text)
-        for match in matches:
-            clean_url = urljoin(url, match.split('?')[0])
-            if clean_url not in mp3_links and 'podcast' in clean_url:
-                mp3_links.append(clean_url)
-
-    # Filtrage final
-    mp3_links = list(dict.fromkeys(mp3_links)) # Supprime les doublons
+    # Application du nombre d'épisodes demandé
+    try:
+        num_episodes = int(num_episodes)
+        if num_episodes > 0:
+            mp3_links = mp3_links[:num_episodes]
+    except:
+        pass  # Si valeur invalide, on prend tout
 
     if not mp3_links:
-        return None, "Aucun épisode trouvé - Structure de page inconnue"
+        return None, "Aucun épisode trouvé"
 
-    # Téléchargement et création ZIP
+    # Téléchargement
     temp_dir = tempfile.mkdtemp()
     filenames = []
 
@@ -72,6 +68,7 @@ def process_url(url):
     if not filenames:
         return None, "Échec du téléchargement"
 
+    # Création du ZIP
     zip_path = os.path.join(temp_dir, 'podcast.zip')
     with zipfile.ZipFile(zip_path, 'w') as zipf:
         for file in filenames:
@@ -79,33 +76,43 @@ def process_url(url):
 
     return zip_path, None
 
-def download_podcast(url):
-    zip_path, error = process_url(url)
+def download_podcast(url, num_episodes):
+    zip_path, error = process_url(url, num_episodes)
     if error:
         raise gr.Error(error)
     return zip_path
 
 with gr.Blocks() as app:
-    gr.Markdown("## 🎧 Téléchargeur Radio France")
+    gr.Markdown("## 🎧 Téléchargeur Radio France - Contrôle des épisodes")
+
     with gr.Row():
         url_input = gr.Textbox(
             label="URL du podcast",
             placeholder="Ex: https://www.radiofrance.fr/...",
             max_lines=1
         )
-    btn = gr.Button("Télécharger les épisodes", variant="primary")
-    output = gr.File(label="Fichier ZIP")
+        num_input = gr.Number(
+            label="Nombre d'épisodes à télécharger (0 = tous)",
+            value=0,
+            minimum=0,
+            step=1,
+            precision=0
+        )
+
+    btn = gr.Button("Télécharger", variant="primary")
+    output = gr.File(label="Fichier ZIP résultant")
 
     examples = gr.Examples(
         examples=[[
-            "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
+            "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin",
+            4
         ]],
-        inputs=[url_input]
+        inputs=[url_input, num_input]
     )
 
     btn.click(
         fn=download_podcast,
-        inputs=url_input,
+        inputs=[url_input, num_input],
         outputs=output,
         api_name="download"
     )
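
Because the click handler keeps api_name="download", the updated two-input signature (URL plus episode count) can also be exercised outside the UI. A minimal sketch using gradio_client, assuming the Space is published under a placeholder id (user/radiofrance-downloader is hypothetical, not the real repo id):

from gradio_client import Client

# Hypothetical Space id; replace with the actual repo id of this Space.
client = Client("user/radiofrance-downloader")

# Same inputs as the UI: podcast URL and number of episodes (0 = all).
zip_path = client.predict(
    "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin",
    4,
    api_name="/download",
)
print(zip_path)  # local path to the podcast.zip returned by download_podcast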