import os
import subprocess
import sys

# 1. Install dependencies (make sure this runs only when needed)
# Check whether the dependencies are already installed
def check_dependencies():
    try:
        import torch
        import transformers
        import datasets
        import librosa
        import numpy
        import scipy
        import ffmpeg
        import gradio
        import huggingface_hub
        return True
    except ImportError:
        return False

if not check_dependencies():
    # Install pytorch (CPU version)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==1.12.1+cpu", "torchvision==0.13.1+cpu", "torchaudio==0.12.1", "--extra-index-url", "https://download.pytorch.org/whl/cpu"])

    # Install other dependencies
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers==4.24.0", "datasets==2.7.1", "librosa==0.9.2", "numpy==1.23.4", "scipy==1.9.3", "ffmpeg-python==0.2.0", "gradio==3.10.1", "huggingface_hub==0.11.0"])

    # Install non-pip dependencies
    os.system("apt-get update && apt-get install -y ffmpeg")

# 2. Import libraries
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset, Audio
import librosa
import numpy as np
from scipy.io import wavfile
import ffmpeg
import gradio as gr
from huggingface_hub import HfApi, hf_hub_download, login

# 3. Hugging Face Hub configuration
# Read the token from an environment variable (safer)
HF_TOKEN = os.environ.get("HF_TOKEN")  # use the HF_TOKEN secret in the HF Spaces settings

# Or hardcode the token (not recommended for production)
# HF_TOKEN = "YOUR_HUGGINGFACE_TOKEN"

# Repository configuration
repo_id = "Cun-Duck/Lipsync"  # replace with your username and repo name
model_filename = "lipsync_model.pth"

# Log in to the Hugging Face Hub
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Successfully logged in to Hugging Face Hub.")
else:
    print("HF_TOKEN not found. Model will not be uploaded.")

# Initialize HfApi
api = HfApi()

# 4. Model and function definitions

# ASR model (same as before)
asr_model_name = "facebook/wav2vec2-base-960h"
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
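# Note: asr_model itself is never invoked below; only the processor's
# feature-extractor sampling rate (16 kHz) is used when loading audio.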

# Placeholder for the lipsync model (a lighter, more efficient model)
class LipSyncModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A simpler architecture:
        self.fc1 = torch.nn.Linear(16, 256)  # 16 MFCC coefficients per frame (see extract_audio_features)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(256, 3 * 32 * 32)  # one 32x32 RGB frame per audio frame

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = x.view(-1, 3, 32, 32)  # reduced frame size: 32x32
        return x

lipsync_model = LipSyncModel()
optimizer = torch.optim.Adam(lipsync_model.parameters(), lr=5e-5)
criterion = torch.nn.MSELoss()
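# Shape sanity check (illustrative): a (T, 16) batch of MFCC frames yields a
# (T, 3, 32, 32) batch of generated frames, e.g.:
#   assert lipsync_model(torch.randn(10, 16)).shape == (10, 3, 32, 32)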

# Extract audio features (same as before)
def extract_audio_features(audio_file):
    # Load as mono at the ASR processor's sampling rate (16 kHz)
    audio, sr = librosa.load(audio_file, sr=asr_processor.feature_extractor.sampling_rate, mono=True)

    # Wav2Vec2 does not expose intermediate hidden features directly, and
    # modifying the model to obtain them is overkill here, so use MFCCs instead
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=16, hop_length=512)
    mfccs = torch.tensor(mfccs.T).float()[:512, :]  # (time, 16); cap the sequence length, adjust as needed
    return mfccs
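# At 16 kHz with hop_length=512, each MFCC frame covers 512/16000 s = 32 ms,
# i.e. 31.25 feature frames per second of audio.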

# Process video and audio (same as before)
def process_video(video_file, audio_file):
    # 1. Extract audio from the video (if no separate audio file was given)
    if audio_file is None:
        try:
            audio_file = "temp_audio.wav"
            (
                ffmpeg.input(video_file)
                .output(audio_file, acodec="pcm_s16le", ar="16000", ac=1)  # convert to 16 kHz mono
                .run(overwrite_output=True, quiet=True)
            )
        except ffmpeg.Error as e:
            print(f"Error extracting audio from {video_file}: {e.stderr.decode()}")
            return None, None, None

    # 2. Extract frames from the video
    probe = ffmpeg.probe(video_file)
    video_info = next(s for s in probe["streams"] if s["codec_type"] == "video")
    num, den = video_info["r_frame_rate"].split("/")  # parse e.g. "30/1" without eval()
    fps = int(num) / int(den)

    # ffmpeg-python's run() returns an (stdout, stderr) pair
    frames, _ = (
        ffmpeg.input(video_file)
        .output("pipe:", format="rawvideo", pix_fmt="rgb24", s="32x32")  # downsample to 32x32
        .run(capture_stdout=True, quiet=True)
    )
    frames = np.frombuffer(frames, np.uint8).reshape([-1, 32, 32, 3])
    frames = torch.tensor(frames).permute(0, 3, 1, 2).float() / 255.0  # (N, 3, 32, 32) in [0, 1]

    # 3. Extract audio features
    audio_features = extract_audio_features(audio_file)

    return frames, audio_features, fps
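# The MFCC rate (31.25 feature frames/s) is close to common video frame rates
# (24-30 fps), so truncating both sequences to the shorter one, as done in
# training and inference below, keeps audio and video roughly in step.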

# Train the lipsync model
def train_lipsync_model(video_file, audio_file, epochs=5):
    frames, audio_features, fps = process_video(video_file, audio_file)

    if frames is None or audio_features is None:
        print("Skipping training due to error in video or audio processing.")
        return

    # Align the audio feature and video frame counts by truncating to the shorter one
    num_frames = min(frames.shape[0], audio_features.shape[0])
    frames = frames[:num_frames]
    audio_features = audio_features[:num_frames]

    for epoch in range(epochs):
        optimizer.zero_grad()

        # Generate one video frame per audio feature frame
        generated_frames = lipsync_model(audio_features)

        # Compute the loss against the real (downsampled) frames
        loss = criterion(generated_frames, frames)

        # Backpropagation and optimization step
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

    # Save and upload the model after training
    if HF_TOKEN:
        save_and_upload_model()

# Inference (same as before)
def lipsync_inference(video_file, audio_file, output_file="output.mp4"):
    frames, audio_features, fps = process_video(video_file, audio_file)

    if frames is None or audio_features is None:
        print("Error during video or audio processing.")
        return None

    with torch.no_grad():
        # Align the audio feature and video frame counts, as in training
        num_frames = min(frames.shape[0], audio_features.shape[0])
        audio_features = audio_features[:num_frames]
        generated_frames = lipsync_model(audio_features)

    # Convert the tensor to a uint8 numpy array of shape (N, H, W, C)
    generated_frames = (generated_frames.clamp(0, 1) * 255).byte().permute(0, 2, 3, 1).cpu().numpy()

    # Write the generated frames to a temporary video
    temp_video = "temp_output.mp4"
    (
        ffmpeg.input(
            "pipe:",
            format="rawvideo",
            pix_fmt="rgb24",
            s=f"{generated_frames.shape[2]}x{generated_frames.shape[1]}",
            r=fps,
        )
        .output(temp_video, pix_fmt="yuv420p", vcodec="libx264", crf=28)
        .overwrite_output()
        .run(input=generated_frames.tobytes(), quiet=True)
    )

    # Mux the new audio with the generated video (copy the video stream,
    # re-encode the audio to AAC so it fits in an MP4 container)
    video_stream = ffmpeg.input(temp_video)
    audio_stream = ffmpeg.input(audio_file)
    (
        ffmpeg.output(video_stream, audio_stream, output_file, vcodec="copy", acodec="aac")
        .overwrite_output()
        .run(quiet=True)
    )

    os.remove(temp_video)
    print(f"Lipsynced video saved to: {output_file}")
    return output_file
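# Local usage example (hypothetical file names):
#   lipsync_inference("my_clip.mp4", "new_voice.wav", "my_output.mp4")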

# 5. Save and upload the model
def save_and_upload_model():
    # Create the repo if it doesn't exist
    try:
        api.create_repo(repo_id=repo_id, token=HF_TOKEN, private=True, exist_ok=True)  # private repo for safety
    except Exception as e:
        print(f"Error creating repo: {e}")

    # Save the model locally
    torch.save(lipsync_model.state_dict(), model_filename)
    print(f"Model saved locally to {model_filename}")

    # Upload the model to the Hugging Face Hub
    try:
        api.upload_file(
            path_or_fileobj=model_filename,
            path_in_repo=model_filename,
            repo_id=repo_id,
            token=HF_TOKEN,
        )
        print(f"Model uploaded to {repo_id}/{model_filename}")
    except Exception as e:
        print(f"Error uploading model: {e}")

# 6. Download and load the model
def download_and_load_model():
    try:
        # hf_hub_download returns the local path of the cached file
        model_path = hf_hub_download(repo_id=repo_id, filename=model_filename, token=HF_TOKEN)
        lipsync_model.load_state_dict(torch.load(model_path))
        print("Model loaded from Hugging Face Hub")
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Starting with a fresh model.")

# 7. Gradio interface
def run_app(input_video, input_audio):
    # Try to load the latest model from the HF Hub
    if HF_TOKEN:
        download_and_load_model()

    # The Video and Audio components below hand the function temporary file
    # paths, so they can be passed straight through

    # Fine-tune on the uploaded pair for 5 epochs
    train_lipsync_model(input_video, input_audio, epochs=5)

    # Run inference and return the path of the generated video
    return lipsync_inference(input_video, input_audio, "output_video.mp4")

iface = gr.Interface(
    fn=run_app,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Audio(type="filepath", label="Input Audio"),
    ],
    outputs=gr.Video(label="Output Video"),
    title="LipSync AI on CPU",
    description="Replace a video's audio using AI lipsync (CPU version).",
)

iface.launch(debug=True)