Spaces:
Sleeping
Sleeping
import io | |
import os.path | |
import zipfile | |
import bentoml | |
import tempfile | |
from pathlib import Path | |
from docx.enum.text import WD_COLOR_INDEX | |
class TranscriptionZipper(bentoml.Runnable):
    """BentoML runnable that bundles transcription diffs into a zip archive.

    Every transcription is rendered as a Word document by
    ``create_word_content`` and stored in the archive under the audio file's
    name with its ``.mp3``/``.wav`` extension replaced by ``.docx``.
    """

    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    def zip_transcription(self, transcription_list):
        """Return a zip archive containing one ``.docx`` per transcription.

        Args:
            transcription_list: Iterable of indexable entries where item 0 is
                the original audio filename, item 1 the old text and item 2
                the new text (items 1 and 2 are passed to
                ``create_word_content`` in that order).

        Returns:
            The archive as ``bytes``. An empty ``transcription_list`` yields
            a valid, empty zip archive.
        """
        zip_buffer = io.BytesIO()
        # Open the archive once for the whole batch instead of re-opening it
        # in append mode for every document.
        with zipfile.ZipFile(zip_buffer, "w") as zip_file:
            for entry in transcription_list:
                base_name = entry[0]
                # Strip only a real trailing extension; a substring test
                # would misfire on names such as "talk.mp3.backup.wav".
                for suffix in (".mp3", ".wav"):
                    if base_name.endswith(suffix):
                        base_name = base_name.removesuffix(suffix)
                        break

                document = create_word_content(base_name, entry[1], entry[2])

                # Serialise in memory: nothing is written to the working
                # directory, so no stray .docx files are left behind and
                # concurrent calls cannot clobber each other's files.
                docx_buffer = io.BytesIO()
                document.save(docx_buffer)
                zip_file.writestr(base_name + ".docx", docx_buffer.getvalue())

        # The archive must be closed (central directory written) before the
        # bytes are read back.
        return zip_buffer.getvalue()
def create_word_content(filename, old_content, new_content):
    """Build a Word document showing a word-level diff of two texts.

    The document starts with a level-1 heading containing ``filename``,
    followed by a single paragraph (opened with a line break) in which

    * words present in both texts are written as plain text,
    * words only in ``old_content`` are struck through,
    * words only in ``new_content`` are highlighted in yellow.

    Args:
        filename: Text used for the document heading.
        old_content: Original text; split into words on single spaces.
        new_content: Revised text; split into words on single spaces.

    Returns:
        The populated ``docx`` document object (not yet saved).
    """
    from difflib import SequenceMatcher

    from docx import Document

    document = Document()
    document.add_heading(filename, 1)
    paragraph = document.add_paragraph()
    paragraph.add_run().add_break()

    old_words = old_content.split(" ")
    new_words = new_content.split(" ")

    # autojunk=False: difflib's default heuristic ignores very frequent items
    # in sequences of 200+ elements, which would degrade the alignment of
    # long transcripts full of common words.
    matcher = SequenceMatcher(None, old_words, new_words, autojunk=False)

    for tag, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
        if tag == "equal":
            paragraph.add_run(" " + " ".join(new_words[new_lo:new_hi]))
            continue
        # "replace" carries both a removed and an added span, so both
        # branches below may fire for the same opcode.
        if tag in ("delete", "replace"):
            removed = " ".join(old_words[old_lo:old_hi])
            paragraph.add_run(" " + removed).font.strike = True
        if tag in ("insert", "replace"):
            added = " ".join(new_words[new_lo:new_hi])
            highlighted = paragraph.add_run(" " + added)
            highlighted.font.highlight_color = WD_COLOR_INDEX.YELLOW

    return document
def create_content(old_content, new_content):
    """Return an HTML fragment showing a word-level diff of two texts.

    Words present in both texts are emitted unchanged, words found only in
    ``old_content`` are wrapped in ``<s>...</s>``, and words found only in
    ``new_content`` are emitted as plain text. Every emitted segment is
    preceded by a single space.

    NOTE: words are inserted verbatim, without HTML escaping; escape the
    inputs first if they may contain markup characters.

    Args:
        old_content: Original text; split into words on single spaces.
        new_content: Revised text; split into words on single spaces.

    Returns:
        The diff as a string, e.g. ``" a <s>x</s> y c"`` for
        ``("a x c", "a y c")``.
    """
    from difflib import SequenceMatcher

    old_words = old_content.split(" ")
    new_words = new_content.split(" ")

    # autojunk=False: difflib's default heuristic ignores very frequent items
    # in sequences of 200+ elements, which would degrade the alignment of
    # long transcripts full of common words.
    matcher = SequenceMatcher(None, old_words, new_words, autojunk=False)

    pieces = []
    for tag, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
        if tag == "equal":
            pieces.append(" " + " ".join(new_words[new_lo:new_hi]))
            continue
        # "replace" carries both a removed and an added span, so both
        # branches below may fire for the same opcode.
        if tag in ("delete", "replace"):
            pieces.append(" <s>" + " ".join(old_words[old_lo:old_hi]) + "</s>")
        if tag in ("insert", "replace"):
            pieces.append(" " + " ".join(new_words[new_lo:new_hi]))

    return "".join(pieces)