Spaces:

sinabis
/

transcription_service

Sleeping

App Files Files Community

transcription_service / src /runners /transcription_zipper.py

aleger

add bentoml files

d8d26b1 over 1 year ago

raw

history blame

3.88 kB

	import io
	import os.path
	import zipfile
	import bentoml
	import tempfile
	from pathlib import Path

	from docx.enum.text import WD_COLOR_INDEX


	class TranscriptionZipper(bentoml.Runnable):
	    """BentoML runnable that bundles transcription diffs into a zip of .docx files."""

	    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
	    SUPPORTS_CPU_MULTI_THREADING = True

	    @bentoml.Runnable.method(batchable=False)
	    def zip_transcription(self, transcription_list):
	        """Build a zip archive holding one Word document per transcription.

	        Args:
	            transcription_list: iterable of indexable items where item[0] is
	                the source audio file name, and item[1] / item[2] are passed
	                to ``create_word_content`` as the old and the new text.

	        Returns:
	            bytes: the zip archive. An empty ``transcription_list`` yields a
	            valid empty archive.
	        """
	        zip_buffer = io.BytesIO()

	        # Open the archive once for the whole batch instead of re-opening it
	        # in append mode for every document.
	        with zipfile.ZipFile(zip_buffer, "w") as zip_file:
	            for t_list in transcription_list:
	                orig_filename = t_list[0]
	                # Strip the extension that is really at the end of the name; a
	                # substring test picks the wrong suffix for e.g. "x.mp3.wav".
	                for suffix in (".mp3", ".wav"):
	                    if orig_filename.endswith(suffix):
	                        orig_filename = orig_filename.removesuffix(suffix)
	                        break

	                document = create_word_content(orig_filename, t_list[1], t_list[2])

	                # Serialize in memory: nothing is written to (or left behind
	                # in) the working directory, so concurrent requests that share
	                # a file name cannot clobber each other.
	                docx_buffer = io.BytesIO()
	                document.save(docx_buffer)

	                # Only the final path component is used as the member name so
	                # a name containing directories cannot create nested or
	                # absolute entries in the archive.
	                member_name = Path(orig_filename).name + ".docx"
	                zip_file.writestr(member_name, docx_buffer.getvalue())

	        # Return the zip file as bytes
	        return zip_buffer.getvalue()


	def create_word_content(filename, old_content, new_content):
	    """Build a Word document showing a word-level diff of two texts.

	    Both texts are split on single spaces. Words that are unchanged are
	    written as plain runs, words only present in ``old_content`` are written
	    with strike-through, and words only present in ``new_content`` are
	    highlighted in yellow.

	    Args:
	        filename: text used for the level-1 heading of the document.
	        old_content: the original text.
	        new_content: the revised text.

	    Returns:
	        The python-docx ``Document`` holding the heading and the diff paragraph.
	    """
	    from docx import Document

	    document = Document()
	    document.add_heading(filename, 1)

	    p = document.add_paragraph()
	    p.add_run().add_break()

	    old_words = old_content.split(" ")
	    new_words = new_content.split(" ")

	    # Words of the new text that also occur in the old text, in new-text
	    # order. The next one is the point where both texts are re-aligned after
	    # a difference.
	    old_vocabulary = set(old_words)
	    anchors = [word for word in new_words if word in old_vocabulary]

	    def add_removed(words):
	        # Skip empty segments so no struck-through lone space is emitted.
	        text = " ".join(words)
	        if text:
	            p.add_run(" " + text).font.strike = True

	    def add_inserted(words):
	        # Skip empty segments so no highlighted lone space is emitted.
	        text = " ".join(words)
	        if text:
	            p.add_run(" " + text).font.highlight_color = WD_COLOR_INDEX.YELLOW

	    old_pos = new_pos = anchor_pos = 0
	    while new_pos < len(new_words):
	        old_left = old_pos < len(old_words)
	        if old_left and new_words[new_pos] == old_words[old_pos]:
	            p.add_run(" " + new_words[new_pos])
	            old_pos += 1
	            new_pos += 1
	            anchor_pos += 1
	            continue

	        # No re-alignment is possible once the old text or the anchors are
	        # used up; the remainder is emitted below.
	        if not old_left or anchor_pos >= len(anchors):
	            break
	        anchor = anchors[anchor_pos]
	        try:
	            old_resync = old_words.index(anchor, old_pos)
	            new_resync = new_words.index(anchor, new_pos)
	        except ValueError:
	            # The anchor word was already consumed from the old text.
	            break

	        add_removed(old_words[old_pos:old_resync])
	        add_inserted(new_words[new_pos:new_resync])
	        old_pos, new_pos = old_resync, new_resync

	    # Whatever is left on either side. This also covers old words remaining
	    # after the new text is exhausted, which must show up as deletions.
	    add_removed(old_words[old_pos:])
	    add_inserted(new_words[new_pos:])

	    return document


	def create_content(old_content, new_content):
	    """Render a word-level diff of two texts as an HTML fragment.

	    Both texts are split on single spaces. Unchanged words are emitted as-is,
	    words only present in ``old_content`` are wrapped in ``<s>...</s>`` and
	    words only present in ``new_content`` are emitted right after them.
	    Words are inserted verbatim (they are not HTML-escaped).

	    Args:
	        old_content: the original text.
	        new_content: the revised text.

	    Returns:
	        str: the HTML fragment.
	    """
	    old_words = old_content.split(" ")
	    new_words = new_content.split(" ")

	    # Words of the new text that also occur in the old text, in new-text
	    # order. The next one is the point where both texts are re-aligned after
	    # a difference.
	    old_vocabulary = set(old_words)
	    anchors = [word for word in new_words if word in old_vocabulary]

	    pieces = []
	    old_pos = new_pos = anchor_pos = 0

	    while new_pos < len(new_words):
	        old_left = old_pos < len(old_words)
	        if old_left and new_words[new_pos] == old_words[old_pos]:
	            pieces.append(" " + new_words[new_pos])
	            old_pos += 1
	            new_pos += 1
	            anchor_pos += 1
	            continue

	        # No re-alignment is possible once the old text or the anchors are
	        # used up; the remainder is emitted below.
	        if not old_left or anchor_pos >= len(anchors):
	            break
	        anchor = anchors[anchor_pos]
	        try:
	            old_resync = old_words.index(anchor, old_pos)
	            new_resync = new_words.index(anchor, new_pos)
	        except ValueError:
	            # The anchor word was already consumed from the old text.
	            break

	        pieces.append(" <s>" + " ".join(old_words[old_pos:old_resync]) + "</s> ")
	        pieces.append(" ".join(new_words[new_pos:new_resync]))
	        old_pos, new_pos = old_resync, new_resync

	    # Whatever is left on either side. Old words remaining after the new text
	    # is exhausted are deletions and must not be dropped.
	    old_tail = old_words[old_pos:]
	    new_tail = new_words[new_pos:]
	    if old_tail or new_tail:
	        pieces.append(" <s>" + " ".join(old_tail) + "</s> ")
	    if new_tail:
	        pieces.append(" " + " ".join(new_tail))

	    return "".join(pieces)