Ribot committed on
Commit
0a6bdb5
·
verified ·
1 Parent(s): 88da9f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -26
app.py CHANGED
@@ -4,11 +4,15 @@ from bs4 import BeautifulSoup
4
  import os
5
  import zipfile
6
  import tempfile
 
7
  from urllib.parse import urljoin
8
 
9
  def process_url(url):
10
  try:
11
- response = requests.get(url)
 
 
 
12
  response.raise_for_status()
13
  except requests.RequestException as e:
14
  return None, f"Erreur lors de la récupération de la page : {e}"
@@ -16,42 +20,59 @@ def process_url(url):
16
  soup = BeautifulSoup(response.text, 'html.parser')
17
 
18
  mp3_links = []
19
- for link in soup.find_all('a', href=True):
20
- href = link['href']
21
- if href.lower().endswith('.mp3'):
22
- absolute_url = urljoin(response.url, href)
23
- mp3_links.append(absolute_url)
 
 
 
 
24
 
25
- # Supprimer les doublons en conservant l'ordre
26
- seen = set()
27
- mp3_links = [x for x in mp3_links if not (x in seen or seen.add(x))]
 
 
 
 
 
 
 
 
 
28
 
29
  if not mp3_links:
30
- return None, "Aucun lien MP3 trouvé sur la page."
31
 
32
  temp_dir = tempfile.mkdtemp()
33
  filenames = []
 
34
  for idx, mp3_url in enumerate(mp3_links, start=1):
35
  try:
36
- mp3_response = requests.get(mp3_url)
37
- mp3_response.raise_for_status()
38
- filename = os.path.join(temp_dir, f"{idx:02d}_{os.path.basename(mp3_url)}")
39
- with open(filename, 'wb') as f:
40
- f.write(mp3_response.content)
41
- filenames.append(filename)
42
- except requests.RequestException as e:
43
- print(f"Erreur de téléchargement {mp3_url}: {e}")
 
 
 
44
  continue
45
 
46
  if not filenames:
47
- return None, "Aucun épisode téléchargé."
48
 
49
- zip_filename = os.path.join(temp_dir, 'podcast_episodes.zip')
50
- with zipfile.ZipFile(zip_filename, 'w') as zipf:
51
  for file in filenames:
52
  zipf.write(file, arcname=os.path.basename(file))
53
 
54
- return zip_filename, None
55
 
56
  def download_podcast(url):
57
  zip_path, error = process_url(url)
@@ -61,10 +82,13 @@ def download_podcast(url):
61
 
62
  iface = gr.Interface(
63
  fn=download_podcast,
64
- inputs=gr.Textbox(label="URL de la page du podcast", placeholder="https://www.radiofrance.fr/..."),
65
- outputs=gr.File(label="Télécharger le ZIP des épisodes"),
66
- title="Téléchargeur de Podcast",
67
- description="Entrez l'URL d'une page contenant des épisodes de podcast pour télécharger tous les MP3 dans un ZIP ordonné."
 
 
 
68
  )
69
 
70
  iface.launch()
 
4
  import os
5
  import zipfile
6
  import tempfile
7
+ import re
8
  from urllib.parse import urljoin
9
 
10
  def process_url(url):
11
  try:
12
+ headers = {
13
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
14
+ }
15
+ response = requests.get(url, headers=headers)
16
  response.raise_for_status()
17
  except requests.RequestException as e:
18
  return None, f"Erreur lors de la récupération de la page : {e}"
 
20
  soup = BeautifulSoup(response.text, 'html.parser')
21
 
22
  mp3_links = []
23
+
24
+ # Recherche dans les balises audio et les divs de podcast
25
+ for audio_tag in soup.find_all('audio'):
26
+ source = audio_tag.find('source')
27
+ if source and 'src' in source.attrs:
28
+ mp3_url = source['src']
29
+ if '.mp3' in mp3_url:
30
+ absolute_url = urljoin(response.url, mp3_url.split('?')[0]) # Nettoyer l'URL
31
+ mp3_links.append(absolute_url)
32
 
33
+ # Recherche alternative dans les données JSON
34
+ script_tags = soup.find_all('script', type='application/ld+json')
35
+ for script in script_tags:
36
+ content = script.string
37
+ if content and '"episode"' in content:
38
+ matches = re.findall(r'"contentUrl"\s*:\s*"([^"]+\.mp3[^"]*)"', content)
39
+ for match in matches:
40
+ absolute_url = urljoin(response.url, match.split('?')[0])
41
+ mp3_links.append(absolute_url)
42
+
43
+ # Suppression des doublons
44
+ mp3_links = list(dict.fromkeys(mp3_links))
45
 
46
  if not mp3_links:
47
+ return None, "Aucun lien MP3 trouvé - Structure de page non reconnue"
48
 
49
  temp_dir = tempfile.mkdtemp()
50
  filenames = []
51
+
52
  for idx, mp3_url in enumerate(mp3_links, start=1):
53
  try:
54
+ filename = f"{idx:02d}_{os.path.basename(mp3_url).split('?')[0]}"
55
+ filepath = os.path.join(temp_dir, filename)
56
+
57
+ with requests.get(mp3_url, headers=headers, stream=True) as r:
58
+ r.raise_for_status()
59
+ with open(filepath, 'wb') as f:
60
+ for chunk in r.iter_content(chunk_size=8192):
61
+ f.write(chunk)
62
+ filenames.append(filepath)
63
+ except Exception as e:
64
+ print(f"Erreur sur {mp3_url}: {str(e)}")
65
  continue
66
 
67
  if not filenames:
68
+ return None, "Échec du téléchargement des fichiers"
69
 
70
+ zip_path = os.path.join(temp_dir, 'podcast.zip')
71
+ with zipfile.ZipFile(zip_path, 'w') as zipf:
72
  for file in filenames:
73
  zipf.write(file, arcname=os.path.basename(file))
74
 
75
+ return zip_path, None
76
 
77
  def download_podcast(url):
78
  zip_path, error = process_url(url)
 
82
 
83
  iface = gr.Interface(
84
  fn=download_podcast,
85
+ inputs=gr.Textbox(label="URL du podcast Radio France", placeholder="https://www.radiofrance.fr/..."),
86
+ outputs=gr.File(label="Télécharger les épisodes"),
87
+ title="Téléchargeur Radio France",
88
+ examples=[[
89
+ "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
90
+ ]],
91
+ description="Collez ici l'URL d'une série de podcasts Radio France pour récupérer tous les épisodes MP3"
92
  )
93
 
94
  iface.launch()