File size: 5,466 Bytes
93c029f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
from pytube import YouTube
from src.music.utils import RATE_AUDIO_SAVE, slugify
from src.music.config import MAX_LEN

# Filtering keywords: should_be_filtered() rejects a video when its lower-cased
# title contains any entry of `filtered_keywords` as a substring.
# 'live' and 'trio' are only matched when wrapped in one of the delimiters
# below, presumably so they do not fire inside longer words ('deliver', 'patriot').
start_keywords = [' ', '(', ',', ':']
end_keywords = [')', ' ', '.', ',', '!', ':']


def get_all_keywords(k):
    """Return `k` wrapped in every (start, end) delimiter combination.

    Args:
        k: the bare keyword, e.g. 'live'.

    Returns:
        A list of len(start_keywords) * len(end_keywords) strings of the form
        start + k + end, ordered with the start delimiter varying slowest.
    """
    return [s + k + e for s in start_keywords for e in end_keywords]


filtered_keywords = [
    'duet', 'duo', 'quartet', 'orchestre', 'orchestra',
    'quintet',
    # 'sixtet' and 'contrebrasse' are misspellings that never match a real title;
    # they are kept for backward compatibility and the correct spellings
    # ('sextet', 'contrebasse') are listed right after them.
    'sixtet', 'sextet', 'septet', 'octet', 'backing track', 'accompaniment', 'string',
    'contrebrasse', 'contrebasse', 'drums', 'guitar',
] + get_all_keywords('live') + get_all_keywords('trio')

# Playlists and channels for which no keyword filtering should occur: they were
# already pre-filtered and are supposed to contain only piano.
# should_be_filtered() compares its `playlist_url` argument against these
# entries with an exact `in` test, so each URL only needs to appear once.
playlist_and_channel_not_to_filter = [
    "https://www.youtube.com/c/MySheetMusicTranscriptions",
    "https://www.youtube.com/c/PianoNotion",
    "https://www.youtube.com/watch?v=3F5glYefwio&list=PLFv3ZQw-ZPxi2DH3Bau7lBC5K6zfPJZxc",
    "https://www.youtube.com/user/Mercuziopianist",
    "https://www.youtube.com/channel/UCy6NPK6-xeX7MZLaMARa5qg",
    "https://www.youtube.com/channel/UCKMRNFV2dWTWIJnymtA9_Iw",
    "https://www.youtube.com/c/pianomaedaful",
    "https://www.youtube.com/c/FrancescoParrinoMusic",
    "https://www.youtube.com/c/itsremco",
]
# A single URL string (not a list): should_be_filtered() tests `url not in playlist_ok`,
# which on a str is a substring check against this URL.
playlist_ok = "https://www.youtube.com/watch?v=sYv_vk6bJtk&list=PLO9E3V4rGLD9-0BEd3t-AvvMcVF1zOJPj"


def should_be_filtered(title, length, url, playlist_url, max_length):
    """Decide whether a video must be skipped and explain why.

    Args:
        title: video title; matched case-insensitively against `filtered_keywords`.
        length: video duration, in the same unit as `max_length` (the message
            divides by 60 and labels the result 'min', i.e. seconds).
        url: url of the video, or None when the caller only has a video object.
        playlist_url: url of the playlist/channel the video comes from; entries of
            `playlist_and_channel_not_to_filter` bypass the keyword filter.
        max_length: maximum accepted duration.

    Returns:
        A `(to_filter, reason)` tuple: `to_filter` is True when at least one rule
        matched, and `reason` is the concatenation of the matching explanations
        (empty string when nothing matched).
    """
    reasons = []
    lower_title = title.lower()

    if length > max_length:
        reasons.append(f'it is too long (>{max_length/60:.1f} min), ')

    # NOTE(review): a title containing 'to live' or 'alive' bypasses the whole
    # keyword filter, not only the 'live' keywords - confirm this is intended.
    # `url` may be None (url2audio called with `video=` only); `None in <str>`
    # raises TypeError, so a missing url is simply treated as not whitelisted.
    if any(f in lower_title for f in filtered_keywords) \
            and playlist_url not in playlist_and_channel_not_to_filter \
            and 'to live' not in lower_title and 'alive' not in lower_title \
            and (url is None or url not in playlist_ok):
        reasons.append('it contains a filtered keyword, ')

    return bool(reasons), ''.join(reasons)

def convert_mp4_to_mp3(path, verbose=True):
    """Convert the mp4 file at `path` to a mono mp3 next to it, then delete the mp4.

    The mp3 is written to the same path with the '.mp4' suffix replaced by '.mp3',
    using ffmpeg with one audio channel and a sample rate of RATE_AUDIO_SAVE.

    Args:
        path: path of the source file; must end with '.mp4'.
        verbose: when True, print progress messages.

    Raises:
        AssertionError: if `path` does not end with '.mp4'.
        FileNotFoundError: if the ffmpeg executable cannot be found.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status; the
            source mp4 is left in place in that case.
    """
    import subprocess  # local import so this block does not depend on the file header

    if verbose: print(f"Converting mp4 to mp3, in {path}\n")
    # Explicit raise instead of `assert`: the check must survive `python -O`,
    # otherwise path[:-4] below would silently mangle a non-mp4 path.
    if not path.endswith('.mp4'):
        raise AssertionError(f'expected a path ending with ".mp4", got {path!r}')
    mp3_path = path[:-4] + '.mp3'
    # Argument list (no shell): quotes, spaces or '$' in the path cannot break or
    # inject into the command line.
    command = ['ffmpeg', '-i', path, '-loglevel', 'panic', '-y',
               '-ac', '1', '-ar', str(int(RATE_AUDIO_SAVE)), mp3_path]
    # check=True: a failed conversion raises before the source file is deleted.
    subprocess.run(command, check=True)
    os.remove(path)
    if verbose: print('\tDone.')

def pipeline_video(video, playlist_path, filename):
    """Download the audio of `video` and convert it to mp3.

    The stream chosen by extract_best_stream is saved as `filename + '.mp4'`
    in `playlist_path`, then handed to convert_mp4_to_mp3.

    Args:
        video: object exposing a `streams` attribute.
        playlist_path: output directory; it is concatenated directly with the
            file name, so it is expected to end with a path separator.
        filename: file name without extension.

    Returns:
        The bitrate reported by extract_best_stream.
    """
    # pick the stream to download
    best_stream, bitrate = extract_best_stream(video.streams)
    mp4_name = filename + '.mp4'
    best_stream.download(output_path=playlist_path, filename=mp4_name)
    # turn the downloaded file into an mp3
    convert_mp4_to_mp3(playlist_path + mp4_name, verbose=False)
    return bitrate

def extract_best_stream(streams):
    """Select the audio-only stream and read its bitrate.

    Args:
        streams: object exposing `get_audio_only()`.

    Returns:
        A `(stream, kbps)` tuple where `stream` is what `get_audio_only()` returned
        and `kbps` is its `abr` attribute with the last four characters removed,
        converted to int (presumably `abr` looks like '128kbps').
    """
    audio_stream = streams.get_audio_only()
    bitrate_text = audio_stream.abr
    # drop the 4-character suffix before parsing the number
    return audio_stream, int(bitrate_text[:-4])

def get_title_and_length(video):
    """Collect the descriptive fields of `video`.

    Args:
        video: object exposing `title`, `length` and `metadata` attributes.

    Returns:
        A `(title, filename, length, metadata)` tuple, where `filename` is the
        title passed through slugify.
    """
    video_title = video.title
    return video_title, slugify(video_title), video.length, video.metadata


def url2audio(playlist_path, video_url=None, video=None, playlist_url='', apply_filters=False, verbose=False, level=0):
    """Download one video as an mp3 file inside `playlist_path`.

    Args:
        playlist_path: output directory; it is concatenated directly with the
            file name, so it is expected to end with a path separator.
        video_url: url of the video; used to build the video object when `video`
            is not given, and stored in the returned data.
        video: already-built video object; built from `video_url` when falsy.
        playlist_url: forwarded to should_be_filtered.
        apply_filters: when True, should_be_filtered is consulted (with MAX_LEN as
            maximum length) before downloading.
        verbose: when True, print progress messages.
        level: number of spaces used to indent the progress messages.

    Returns:
        A `(audio_path, data, message)` tuple:
        - downloaded: (path of the mp3, dict describing the video, '')
        - already on disk: (path of the mp3, None, '')
        - filtered out: (None, None, 'Filtered because ...')
        - failure: (None, None, trail of the stages passed, ending with ' Yes.'
          right after the stage that failed)

    Raises:
        AssertionError: if neither `video_url` nor `video` is provided.
    """
    # Explicit raise instead of `assert` so the check survives `python -O`.
    if video_url is None and video is None:
        raise AssertionError('needs either video or url')

    # Bound before the try block: the failure handler below reads it, and a
    # failure can happen before the real path is known.
    audio_path = None
    error_msg = 'Error in loading video?'
    try:
        if not video:
            video = YouTube(video_url)
        error_msg += ' Nope. In extracting title and length?'
        title, filename, length, video_meta_data = get_title_and_length(video)

        if apply_filters:
            to_filter, reason = should_be_filtered(title, length, video_url, playlist_url, MAX_LEN)
            if to_filter:
                return None, None, f'Filtered because {reason}'

        audio_path = playlist_path + filename + ".mp3"
        if verbose:
            print(' ' * level + f'Downloading {title}, Url: {video_url}')

        if os.path.exists(audio_path):
            if verbose:
                print(' ' * (level + 2) + 'Song already downloaded')
            return audio_path, None, ''

        if length > MAX_LEN and verbose:
            print(' ' * (level + 2) + f'Long video ({int(length/60)} min), will be cut after {int(MAX_LEN/60)} min.')
        error_msg += ' Nope. In pipeline video?'

        # The download/convert step is attempted up to 5 times. Only Exception is
        # caught so that Ctrl-C / SystemExit still stop the program.
        kbps = None
        last_error = None
        for _ in range(5):
            try:
                kbps = pipeline_video(video, playlist_path, filename)
                break
            except Exception as err:
                last_error = err
        if kbps is None:
            raise RuntimeError('pipeline_video did not succeed after 5 attempts') from last_error

        error_msg += ' Nope. In dict filling?'
        data = dict(title=title, filename=filename, length=length, kbps=kbps, url=video_url, meta=video_meta_data)
        return audio_path, data, ''
    except Exception:
        if verbose:
            print(' ' * (level + 2) + f'Download failed with error {error_msg}')
        # Remove a possibly partial mp3, but only if we got far enough to know its path.
        if audio_path is not None and os.path.exists(audio_path):
            os.remove(audio_path)
        return None, None, error_msg + ' Yes.'