transcription_service / src /runners /transcription_zipper.py
aleger's picture
add bentoml files
d8d26b1
raw
history blame
3.88 kB
import io
import os.path
import zipfile
import bentoml
import tempfile
from pathlib import Path
from docx.enum.text import WD_COLOR_INDEX
class TranscriptionZipper(bentoml.Runnable):
    """BentoML runnable that bundles transcription diffs into a zip of .docx files."""

    # Runnable configuration attributes (presumably read by BentoML when it
    # schedules this runnable -- confirm against the BentoML Runnable docs).
    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    @bentoml.Runnable.method(batchable=False)
    def zip_transcription(self, transcription_list):
        """Build a zip archive holding one Word document per transcription.

        Args:
            transcription_list: Iterable of indexable entries where item 0 is
                the original audio file name, item 1 the original
                transcription text and item 2 the corrected text.

        Returns:
            The zip archive as ``bytes``. An empty ``transcription_list``
            yields a valid, empty archive.
        """
        zip_buffer = io.BytesIO()
        # Open the archive once for the whole batch instead of re-opening it
        # in append mode for every entry.
        with zipfile.ZipFile(zip_buffer, "w") as zip_file:
            for t_list in transcription_list:
                base_name = t_list[0]
                # Strip a trailing audio extension. Use endswith() rather than
                # a substring test so "talk.mp3.backup.wav" loses its ".wav".
                for suffix in (".mp3", ".wav"):
                    if base_name.endswith(suffix):
                        base_name = base_name.removesuffix(suffix)
                        break

                document = create_word_content(base_name, t_list[1], t_list[2])

                # Render the document in memory: nothing is written to the
                # working directory, so no stray files are left behind and
                # concurrent requests cannot overwrite each other's output.
                docx_buffer = io.BytesIO()
                document.save(docx_buffer)
                zip_file.writestr(base_name + ".docx", docx_buffer.getvalue())

        # The archive is finalised when the ``with`` block closes it.
        return zip_buffer.getvalue()
def create_word_content(filename: str, old_content: str, new_content: str):
    """Render a word-level diff of two texts as a Word document.

    Words present in both texts are written as plain runs, words only in
    ``old_content`` are struck through, and words only in ``new_content`` are
    highlighted in yellow. Both texts are split on single spaces.

    Args:
        filename: Text used for the document heading.
        old_content: The original text.
        new_content: The revised text.

    Returns:
        The populated python-docx document object (not yet saved).
    """
    from docx import Document

    document = Document()
    document.add_heading(filename, 1)
    paragraph = document.add_paragraph()
    # Start the body with a line break after the heading.
    paragraph.add_run().add_break()

    def add_deleted(words):
        # Skip empty segments so no struck-through blank is emitted.
        if any(words):
            paragraph.add_run(" " + " ".join(words)).font.strike = True

    def add_inserted(words):
        # Skip empty segments so no highlighted blank is emitted.
        if any(words):
            run = paragraph.add_run(" " + " ".join(words))
            run.font.highlight_color = WD_COLOR_INDEX.YELLOW

    old_words = old_content.split(" ")
    new_words = new_content.split(" ")
    # Words of the new text that also occur in the old text, in new-text
    # order; the first entry is the next anchor the two texts re-sync on.
    old_vocab = set(old_words)
    common = [word for word in new_words if word in old_vocab]

    while new_words:
        if old_words and new_words[0] == old_words[0]:
            paragraph.add_run(" " + new_words[0])
            common.pop(0)
            old_words.pop(0)
            new_words.pop(0)
            continue

        try:
            anchor = common[0]
            old_pos = old_words.index(anchor)
            new_pos = new_words.index(anchor)
        except (IndexError, ValueError):
            # No anchor left in the remaining old text: the rest of the old
            # text is deleted and the rest of the new text is inserted.
            add_deleted(old_words)
            add_inserted(new_words)
            old_words = []
            break

        # Everything before the anchor was removed (old) or added (new).
        add_deleted(old_words[:old_pos])
        add_inserted(new_words[:new_pos])
        del old_words[:old_pos]
        del new_words[:new_pos]

    # Old words left over once the new text is exhausted are trailing
    # deletions; without this they would silently disappear.
    add_deleted(old_words)
    return document
def create_content(old_content: str, new_content: str) -> str:
    """Render a word-level diff of two texts as an HTML fragment.

    Words present in both texts are emitted unchanged, words only in
    ``old_content`` are wrapped in ``<s>...</s>``, and words only in
    ``new_content`` are emitted as-is. Both texts are split on single spaces.

    NOTE(review): the words are inserted into the markup without HTML
    escaping; escape the inputs first if they can contain ``<``, ``>`` or
    ``&`` from an untrusted source.

    Args:
        old_content: The original text.
        new_content: The revised text.

    Returns:
        The diff as an HTML string (each emitted word is preceded by a space).
    """
    old_words = old_content.split(" ")
    new_words = new_content.split(" ")
    # Words of the new text that also occur in the old text, in new-text
    # order; the first entry is the next anchor the two texts re-sync on.
    old_vocab = set(old_words)
    common = [word for word in new_words if word in old_vocab]

    parts = []
    while new_words:
        if old_words and new_words[0] == old_words[0]:
            parts.append(" " + new_words[0])
            common.pop(0)
            old_words.pop(0)
            new_words.pop(0)
            continue

        try:
            anchor = common[0]
            old_pos = old_words.index(anchor)
            new_pos = new_words.index(anchor)
        except (IndexError, ValueError):
            # No anchor left in the remaining old text: the rest of the old
            # text is deleted and the rest of the new text is inserted.
            parts.append(" <s>" + " ".join(old_words) + "</s> ")
            parts.append(" " + " ".join(new_words))
            old_words = []
            break

        # Everything before the anchor was removed (old) or added (new).
        parts.append(" <s>" + " ".join(old_words[:old_pos]) + "</s> ")
        parts.append(" ".join(new_words[:new_pos]))
        del old_words[:old_pos]
        del new_words[:new_pos]

    # Old words left over once the new text is exhausted are trailing
    # deletions; without this they would silently disappear.
    if old_words:
        parts.append(" <s>" + " ".join(old_words) + "</s> ")
    return "".join(parts)