Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,9 +8,9 @@ import os
|
|
8 |
from mutagen.mp3 import MP3
|
9 |
import cv2
|
10 |
from dotenv import load_dotenv
|
11 |
-
from transformers import pipeline
|
12 |
-
|
13 |
-
|
14 |
|
15 |
# Load environment variables
|
16 |
load_dotenv()
|
@@ -24,6 +24,22 @@ def resize(img_list):
|
|
24 |
resize_img_list.append(np.array(imResize))
|
25 |
return resize_img_list
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def merge_audio_video(entities_num, resize_img_list, text_input):
|
28 |
speech = text2speech(text_input)
|
29 |
wav_audio = AudioSegment.from_file(speech, "flac")
|
@@ -41,18 +57,6 @@ def merge_audio_video(entities_num, resize_img_list, text_input):
|
|
41 |
|
42 |
return mergedclip
|
43 |
|
44 |
-
def text2speech(text):
|
45 |
-
# Generate speech from text using FastSpeech2
|
46 |
-
speech_output = fastspeech(text)
|
47 |
-
# Save the output as a .flac file (assuming the output is in numpy format)
|
48 |
-
with open("speech_output.flac", "wb") as f:
|
49 |
-
f.write(speech_output["audio"])
|
50 |
-
return "speech_output.flac"
|
51 |
-
|
52 |
-
# Load FastSpeech2 model from Hugging Face directly
|
53 |
-
fastspeech = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech", use_auth_token=HF_TOKEN)
|
54 |
-
|
55 |
-
|
56 |
def engine(text_input):
|
57 |
ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN)
|
58 |
entities = ner(text_input)
|
@@ -68,15 +72,10 @@ def engine(text_input):
|
|
68 |
|
69 |
resize_img_list = resize(img_list)
|
70 |
mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
|
71 |
-
mergedclip.
|
72 |
|
73 |
return 'mergedvideo.mp4'
|
74 |
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
app = gr.Interface(
|
81 |
fn=engine,
|
82 |
inputs=gr.Textbox(lines=5, label="Input Text"),
|
@@ -87,4 +86,4 @@ app = gr.Interface(
|
|
87 |
],
|
88 |
title="AI Pipeline Multi Model πποΈπΏ Movie Maker π¬ π§ π¨",
|
89 |
article="<br><div></div>"
|
90 |
-
).launch(debug=True)
|
|
|
8 |
from mutagen.mp3 import MP3
|
9 |
import cv2
|
10 |
from dotenv import load_dotenv
|
11 |
+
from transformers import pipeline, AutoProcessor, AutoModel
|
12 |
+
import torch
|
13 |
+
import soundfile as sf
|
14 |
|
15 |
# Load environment variables
|
16 |
load_dotenv()
|
|
|
24 |
resize_img_list.append(np.array(imResize))
|
25 |
return resize_img_list
|
26 |
|
27 |
+
def text2speech(text):
    """Synthesize *text* with Microsoft SpeechT5 and save it as a FLAC file.

    Args:
        text: The text to convert to speech.

    Returns:
        The path of the written audio file, always "speech_output.flac"
        (overwritten on every call).
    """
    # Imported locally because the module-level transformers import only
    # brings in `pipeline`, `AutoProcessor` and `AutoModel`.
    from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan

    processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
    # `AutoModel` resolves to the bare SpeechT5Model, which has no
    # `generate_speech`; the text-to-speech head class is required.
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    # Without a vocoder `generate_speech` returns a mel spectrogram, not a
    # waveform, so the file written below would not be playable audio.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Preprocess the text input into token ids.
    inputs = processor(text=text, return_tensors="pt")

    # Neutral (all-zero) speaker embedding, kept from the original code.
    # The config field is `speaker_embedding_dim`, not `speaker_embedding_size`.
    # NOTE(review): a real x-vector would likely give a more natural voice.
    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_dim))

    # no_grad guarantees the result carries no autograd graph, so the
    # `.numpy()` conversion below cannot fail on a grad-requiring tensor.
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    # Save as a FLAC file; 16 kHz sample rate kept from the original code.
    sf.write("speech_output.flac", speech.cpu().numpy(), samplerate=16000)
    return "speech_output.flac"
|
42 |
+
|
43 |
def merge_audio_video(entities_num, resize_img_list, text_input):
|
44 |
speech = text2speech(text_input)
|
45 |
wav_audio = AudioSegment.from_file(speech, "flac")
|
|
|
57 |
|
58 |
return mergedclip
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
def engine(text_input):
|
61 |
ner = gr.Interface.load("huggingface/flair/ner-english-ontonotes-large", api_key=HF_TOKEN)
|
62 |
entities = ner(text_input)
|
|
|
72 |
|
73 |
resize_img_list = resize(img_list)
|
74 |
mergedclip = merge_audio_video(entities_num, resize_img_list, text_input)
|
75 |
+
mergedclip.write_videofile('mergedvideo.mp4')
|
76 |
|
77 |
return 'mergedvideo.mp4'
|
78 |
|
|
|
|
|
|
|
|
|
|
|
79 |
app = gr.Interface(
|
80 |
fn=engine,
|
81 |
inputs=gr.Textbox(lines=5, label="Input Text"),
|
|
|
86 |
],
|
87 |
title="AI Pipeline Multi Model πποΈπΏ Movie Maker π¬ π§ π¨",
|
88 |
article="<br><div></div>"
|
89 |
+
).launch(debug=True)
|