awacke1 committed (verified)
Commit 83acbfc · 1 Parent(s): 5724ef4

Update app.py

Files changed (1): app.py (+21 -22)
app.py CHANGED
@@ -8,9 +8,9 @@ import os
 from mutagen.mp3 import MP3
 import cv2
 from dotenv import load_dotenv
-from transformers import pipeline
-
-
 
 # Load environment variables
 load_dotenv()
+from transformers import pipeline, AutoProcessor, AutoModel
+import torch
+import soundfile as sf
@@ -24,6 +24,22 @@ def resize(img_list):
         resize_img_list.append(np.array(imResize))
     return resize_img_list
 
+def text2speech(text):
+    # Using Microsoft's SpeechT5 model instead of FastSpeech2
+    processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
+    model = AutoModel.from_pretrained("microsoft/speecht5_tts")
+
+    # Preprocessing text input
+    inputs = processor(text=text, return_tensors="pt")
+
+    # Generate speech with default speaker embedding
+    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_size))
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+
+    # Save as flac file
+    sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
+    return "speech_output.flac"
+
 def merge_audio_video(entities_num, resize_img_list, text_input):
     speech = text2speech(text_input)
     wav_audio = AudioSegment.from_file(speech, "flac")
@@ -41,18 +57,6 @@ def merge_audio_video(entities_num, resize_img_list, text_input):
 
     return mergedclip
 
-def text2speech(text):
-    # Generate speech from text using FastSpeech2
-    speech_output = fastspeech(text)
-    # Save the output as a .flac file (assuming the output is in numpy format)
-    with open("speech_output.flac", "wb") as f:
-        f.write(speech_output["audio"])
-    return "speech_output.flac"
-
-# Load FastSpeech2 model from Hugging Face directly
-fastspeech = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech", use_auth_token=HF_TOKEN)
-
-
 def engine(text_input):
     ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN)
     entities = ner(text_input)
@@ -68,15 +72,10 @@ def engine(text_input):
 
     resize_img_list = resize(img_list)
     mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
-    mergedclip.to_videofile('mergedvideo.mp4')
+    mergedclip.write_videofile('mergedvideo.mp4')
 
     return 'mergedvideo.mp4'
 
-
-
-
-
-
 app = gr.Interface(
     fn=engine,
     inputs=gr.Textbox(lines=5, label="Input Text"),
@@ -87,4 +86,4 @@ app = gr.Interface(
     ],
     title="AI Pipeline Multi Model 🎭🎞️🍿 Movie Maker 🎬 🧠 🎨",
     article="<br><div></div>"
-).launch(debug=True)
+).launch(debug=True)
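
The new text2speech builds SpeechT5 from AutoProcessor and AutoModel and feeds generate_speech an all-zeros speaker embedding. For comparison, the SpeechT5 usage documented in transformers loads the dedicated SpeechT5 classes, passes a HiFi-GAN vocoder so that generate_speech returns a waveform rather than a mel spectrogram, and supplies a real x-vector speaker embedding. A minimal sketch of that documented path, assuming the vocoder checkpoint and the cmu-arctic-xvectors dataset from the official example (neither of which this commit uses):

```python
# Sketch of the SpeechT5 text-to-speech path as documented in transformers.
# The vocoder checkpoint and x-vector dataset follow the official example;
# they are assumptions here, not part of this commit.
import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Hello, movie maker.", return_tensors="pt")

# A real x-vector gives the model a plausible voice; index 7306 is the
# speaker used in the transformers documentation example.
xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)

# With a vocoder attached, generate_speech returns a 16 kHz waveform tensor.
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speech_output.flac", speech.numpy(), samplerate=16000)
```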
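The other behavioral fix is in engine: the render call changes from to_videofile to write_videofile, the method MoviePy 1.x exposes for encoding a clip (to_videofile was the pre-1.0 name). A rough sketch of the image-plus-narration merge that merge_audio_video performs, assuming MoviePy 1.x and frames as numpy arrays; the function name and parameters below are illustrative, not taken from app.py:

```python
# Illustrative sketch of merging resized entity images with the narration
# audio; assumes MoviePy 1.x. Not the literal merge_audio_video from app.py.
from moviepy.editor import AudioFileClip, ImageSequenceClip

def merge_images_with_narration(frames, audio_path, out_path="mergedvideo.mp4"):
    audio = AudioFileClip(audio_path)
    # Give every image an equal share of the narration's duration.
    per_image = audio.duration / max(len(frames), 1)
    clip = ImageSequenceClip(frames, durations=[per_image] * len(frames))
    clip = clip.set_audio(audio)
    # write_videofile is the MoviePy 1.x render method.
    clip.write_videofile(out_path, fps=24)
    return out_path
```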