pradeep4321 committed on
Commit
d22d3f8
·
verified ·
1 Parent(s): b2cfb3e

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +60 -71
src/app.py CHANGED
@@ -1,74 +1,63 @@
1
  import streamlit as st
2
- from docx import Document
3
- from PyPDF2 import PdfReader
4
- from io import BytesIO
5
- import torch
6
- import torchaudio
7
- import soundfile as sf
8
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
9
 
# Load TTS model and processor
@st.cache_resource
def load_model():
    """Load the speech model and its processor, cached by Streamlit.

    Both objects are created from the same
    ``espnet/kan-bayashi_ljspeech_vits`` checkpoint name.

    NOTE(review): whether this checkpoint can actually be loaded through
    ``AutoModelForSpeechSeq2Seq`` / ``AutoProcessor`` cannot be verified
    from this file -- confirm.

    Returns:
        A ``(model, processor)`` tuple.
    """
    checkpoint = "espnet/kan-bayashi_ljspeech_vits"
    return (
        AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint),
        AutoProcessor.from_pretrained(checkpoint),
    )
16
-
def convert_docx_to_text(docx_file):
    """Return the text of every paragraph in *docx_file*, newline-joined.

    Args:
        docx_file: Whatever ``Document`` accepts (here, the uploaded file).

    Returns:
        A single string with one line per document paragraph.
    """
    paragraphs = Document(docx_file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
20
-
def convert_pdf_to_text(pdf_file):
    """Return the extracted text of every page in *pdf_file*, newline-joined.

    Pages for which ``extract_text()`` yields a falsy value contribute an
    empty string, so the result keeps one line slot per page.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts (here, the uploaded file).

    Returns:
        A single string containing the text of all pages.
    """
    pages = PdfReader(pdf_file).pages
    page_texts = (page.extract_text() or '' for page in pages)
    return "\n".join(page_texts)
24
-
def text_to_speech(text, model, processor):
    """Synthesize *text* and return the audio as an in-memory WAV file.

    Args:
        text: The text to speak.
        model: Object whose ``generate(**inputs)`` returns the waveform
            tensor.
        processor: Callable that turns text into model inputs
            (invoked with ``return_tensors="pt"``).

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds WAV data.
    """
    encoded = processor(text, return_tensors="pt")

    # Inference only: gradients are not needed.
    with torch.no_grad():
        generated = model.generate(**encoded)

    samples = generated.squeeze().cpu().numpy()

    wav_bytes = BytesIO()
    # NOTE(review): 22050 Hz is hard-coded; presumably the model's output
    # sample rate -- confirm.
    sf.write(wav_bytes, samples, 22050, format="WAV")
    wav_bytes.seek(0)  # rewind so callers can read from the start
    return wav_bytes
35
-
def get_download_link(audio_buffer, filename="output.wav"):
    """Build an HTML anchor that downloads the audio as a base64 data URI.

    Args:
        audio_buffer: Buffer exposing ``getvalue()`` that returns the raw
            WAV bytes (e.g. ``io.BytesIO``).
        filename: Name suggested to the browser and shown as the link text.

    Returns:
        An ``<a>`` element as a string, meant to be rendered as raw HTML.
    """
    # Local imports keep this block self-contained. The previous
    # ``st.base64`` lookup is not part of Streamlit and raised
    # AttributeError.
    import base64
    import html

    encoded = base64.b64encode(audio_buffer.getvalue()).decode("ascii")
    # The name is interpolated into markup that is rendered unescaped, so
    # escape it for both the attribute and the text position.
    safe_name = html.escape(filename, quote=True)
    return (
        f'<a href="data:audio/wav;base64,{encoded}" '
        f'download="{safe_name}">Download {safe_name}</a>'
    )
40
-
def main():
    """Run the Streamlit page: upload a document, extract text, speak it.

    Supported uploads are TXT (decoded as UTF-8), DOCX and PDF. The first
    1000 characters of the extracted text are previewed, then audio is
    generated for the full text and offered for playback and download.
    """
    st.title("Text to Speech with Transformers (Offline Hugging Face)")

    uploaded_file = st.file_uploader("Upload a TXT, DOCX, or PDF file", type=["txt", "docx", "pdf"])
    if not uploaded_file:
        return

    # Choose the text extractor from the (lower-cased) file extension.
    extractors = {
        'txt': lambda upload: upload.read().decode("utf-8"),
        'docx': convert_docx_to_text,
        'pdf': convert_pdf_to_text,
    }
    extension = uploaded_file.name.rsplit('.', 1)[-1].lower()
    extract = extractors.get(extension)
    if extract is None:
        st.error("Unsupported file type")
        return

    text = extract(uploaded_file)
    if not text.strip():
        st.warning("No readable text found.")
        return

    st.subheader("Extracted Text:")
    preview = text[:1000]
    if len(text) > 1000:
        preview += "..."
    st.write(preview)

    with st.spinner("Generating audio..."):
        model, processor = load_model()
        audio_buffer = text_to_speech(text, model, processor)

    st.audio(audio_buffer, format="audio/wav")
    st.markdown(get_download_link(audio_buffer), unsafe_allow_html=True)


# Script entry point.
if __name__ == "__main__":
    main()
 
1
  import streamlit as st
2
+ from TTS.api import TTS
3
+ import tempfile
4
+ import os
 
 
 
 
5
 
# Initialize TTS model (only once)
@st.cache_resource
def load_tts_model():
    """Create the XTTS v2 model once and let Streamlit cache it.

    The GPU is requested only when CUDA is actually available, so the app
    also starts on CPU-only hosts instead of failing at model construction.

    Returns:
        The ``TTS`` instance for
        ``tts_models/multilingual/multi-dataset/xtts_v2``.
    """
    # Local import: torch is assumed to be installed as a dependency of
    # the TTS package -- confirm for the deployment environment.
    import torch

    return TTS(
        "tts_models/multilingual/multi-dataset/xtts_v2",
        gpu=torch.cuda.is_available(),
    )
10
+
def _synthesize_to_bytes(tts_model, text, speaker_bytes, language="en"):
    """Clone the speaker's voice for *text* and return the WAV file bytes.

    All intermediate files live in a private temporary directory that is
    removed when the function exits, whether or not synthesis succeeds.
    This avoids both leaked temp files and collisions between concurrent
    sessions that would otherwise share one fixed output path.

    Args:
        tts_model: Object exposing ``tts_to_file(text=, file_path=,
            speaker_wav=, language=)``.
        text: Text to synthesize.
        speaker_bytes: Raw bytes of the reference speaker WAV file.
        language: Language code passed through to the model.

    Returns:
        The bytes of the generated WAV file.
    """
    with tempfile.TemporaryDirectory() as workdir:
        speaker_path = os.path.join(workdir, "speaker.wav")
        output_path = os.path.join(workdir, "output.wav")

        with open(speaker_path, "wb") as speaker_out:
            speaker_out.write(speaker_bytes)

        tts_model.tts_to_file(
            text=text,
            file_path=output_path,
            speaker_wav=speaker_path,
            language=language,
        )

        # Read the result into memory before the directory is deleted.
        with open(output_path, "rb") as generated:
            return generated.read()


tts = load_tts_model()

# App title
st.title("🔊 Voice Cloning with XTTS v2")

# Text input
text_input = st.text_area("Enter the text you want to synthesize", height=150)

# Speaker file uploader
speaker_file = st.file_uploader("Upload a speaker WAV file", type=["wav"])

# Button to generate
if st.button("Generate Speech"):
    # Whitespace-only input is treated the same as no input.
    if not (text_input or "").strip():
        st.error("Please enter text.")
    elif speaker_file is None:
        st.error("Please upload a speaker WAV file.")
    else:
        try:
            with st.spinner("Generating voice..."):
                # getvalue() returns the whole upload regardless of the
                # current read position of the stream.
                audio_bytes = _synthesize_to_bytes(
                    tts, text_input, speaker_file.getvalue()
                )
        except Exception as e:
            # Top-level UI boundary: show the failure instead of crashing.
            st.error(f"An error occurred: {e}")
        else:
            # Playback
            st.audio(audio_bytes, format="audio/wav")

            # Download link
            st.download_button(
                label="Download Audio",
                data=audio_bytes,
                file_name="cloned_voice.wav",
                mime="audio/wav",
            )