janak/no-audio-fusion
#35
by
janak22
- opened
- modeling_minicpmo.py +11 -3
modeling_minicpmo.py
CHANGED
@@ -1730,8 +1730,11 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|
1730 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
1731 |
|
1732 |
else:
|
1733 |
-
prev_wav = wav_np
|
1734 |
-
|
|
|
|
|
|
|
1735 |
if outputs.finished:
|
1736 |
logger.debug("Generation finished.")
|
1737 |
eos_lab = True
|
@@ -1828,6 +1831,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|
1828 |
prev_text_len = len(gen_text_raw)
|
1829 |
yield OmniOutput(text=cur_text, audio_wav=wav_y, sampling_rate=sr)
|
1830 |
else:
|
|
|
1831 |
prev_wav = wav_np
|
1832 |
else:
|
1833 |
# smooth wav
|
@@ -1839,7 +1843,11 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|
1839 |
prev_text_len = len(gen_text_raw)
|
1840 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
1841 |
else:
|
1842 |
-
prev_wav = wav_np
|
|
|
|
|
|
|
|
|
1843 |
|
1844 |
if outputs.finished:
|
1845 |
logger.debug("Generation finished.")
|
|
|
1730 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
1731 |
|
1732 |
else:
|
1733 |
+
prev_wav = wav_np[-512 * 4:]
|
1734 |
+
wav_np = wav_np[:-512 * 4]
|
1735 |
+
cur_text = gen_text_raw[prev_text_len:]
|
1736 |
+
prev_text_len = len(gen_text_raw)
|
1737 |
+
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
1738 |
if outputs.finished:
|
1739 |
logger.debug("Generation finished.")
|
1740 |
eos_lab = True
|
|
|
1831 |
prev_text_len = len(gen_text_raw)
|
1832 |
yield OmniOutput(text=cur_text, audio_wav=wav_y, sampling_rate=sr)
|
1833 |
else:
|
1834 |
+
|
1835 |
prev_wav = wav_np
|
1836 |
else:
|
1837 |
# smooth wav
|
|
|
1843 |
prev_text_len = len(gen_text_raw)
|
1844 |
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
1845 |
else:
|
1846 |
+
prev_wav = wav_np[-512 * 4:]
|
1847 |
+
wav_np = wav_np[:-512 * 4]
|
1848 |
+
cur_text = gen_text_raw[prev_text_len:]
|
1849 |
+
prev_text_len = len(gen_text_raw)
|
1850 |
+
yield OmniOutput(text=cur_text, audio_wav=wav_np, sampling_rate=sr)
|
1851 |
|
1852 |
if outputs.finished:
|
1853 |
logger.debug("Generation finished.")
|