# Minuteevo / app.py
# Gradio app: cleans a meeting transcript in WebVTT (.vtt) format and
# summarizes it into meeting minutes with GPT-4.
import os
import openai
import re
from os.path import splitext, exists
import nltk
from nltk.tokenize import word_tokenize
import gradio as gr
import backoff
import markdown
from docx import Document
from datetime import datetime

nltk.download('punkt')

# Read the OpenAI API key from the environment instead of hard-coding it in source.
openai.api_key = os.getenv("OPENAI_API_KEY")

def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content
    """
    # read file content
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # remove header & empty lines
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    lines = lines[1:] if lines and lines[0].upper() == "WEBVTT" else lines

    # remove numeric cue indexes
    lines = [line for line in lines if not line.isdigit()]

    # remove cue identifiers (GUID-style voice tags, e.g. "<uuid>/<n>-<n>")
    pattern = r"[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}/\d+-\d"
    lines = [line for line in lines if not re.match(pattern, line)]

    # remove timestamp lines ("HH:MM:SS.mmm --> HH:MM:SS.mmm")
    pattern = r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$"
    lines = [line for line in lines if not re.match(pattern, line)]

    content = " ".join(lines)

    # collapse runs of whitespace into single spaces
    content = re.sub(r"\s+", " ", content)

    # add a space after punctuation marks where it is missing
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)
    return content
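
# Usage sketch for clean_webvtt ("meeting.vtt" is a hypothetical path):
#   text = clean_webvtt("meeting.vtt")
#   print(text[:200])  # first 200 characters of the flattened transcript
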
def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save clean content of a subtitle file to a text file

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
              Default is False

    Returns:
        str: path to text file
    """
    # set default values
    no_message = kwargs.get("no_message", False)
    if not file_out:
        filename = splitext(file_in)[0]
        file_out = "%s.txt" % filename
        # avoid overwriting an existing file by appending a counter
        i = 0
        while exists(file_out):
            i += 1
            file_out = "%s_%s.txt" % (filename, i)

    content = clean_webvtt(file_in)
    with open(file_out, "w", encoding="utf-8") as fp:
        fp.write(content)
    if not no_message:
        print("clean content is written to file: %s" % file_out)
    return file_out
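
# Usage sketch for vtt_to_clean_file ("meeting.vtt" is a hypothetical path):
#   txt_path = vtt_to_clean_file("meeting.vtt", no_message=True)
#   # -> writes "meeting.txt" ("meeting_1.txt" if that exists) and returns the path
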
def get_summary(filepath):
    # convenience wrapper: clean the vtt file and return the path of the text file
    return vtt_to_clean_file(filepath)

def count_tokens(filename):
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return len(tokens)

def break_up_file(tokens, chunk_size, overlap_size):
    """Yield chunks of at most chunk_size tokens, overlapping by overlap_size."""
    if len(tokens) <= chunk_size:
        yield tokens
    else:
        yield tokens[:chunk_size]
        # recurse on the remainder, re-including the last overlap_size tokens for context
        yield from break_up_file(tokens[chunk_size - overlap_size:], chunk_size, overlap_size)

def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))
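
# Chunking sketch: with chunk_size=4000 and overlap_size=100, chunks start at
# token offsets 0, 3900, 7800, ... so consecutive chunks share 100 tokens of
# context. An illustration with toy sizes:
#   list(break_up_file(list(range(10)), chunk_size=4, overlap_size=1))
#   # -> [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]
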
def convert_to_prompt_text(tokenized_text):
    # join tokens back into a string and undo NLTK's possessive splitting
    prompt_text = " ".join(tokenized_text)
    prompt_text = prompt_text.replace(" 's", "'s")
    return prompt_text
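
# Example of the possessive fix-up: word_tokenize splits "'s" into its own token, so
#   convert_to_prompt_text(["the", "team", "'s", "plan"])
#   # -> "the team's plan"
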
def markdown_to_docx(md_text, output_file):
    # Convert the Markdown text to HTML
    html_text = markdown.markdown(md_text)
    # Create a new Document object
    doc = Document()
    # Naively split the HTML on closing paragraph tags and add each paragraph's
    # text to the .docx; markup other than <p> is not handled by this splitter.
    for p in html_text.split('</p>'):
        if '<p>' in p:
            clean_p = p.replace('<p>', '').strip()
            if clean_p:
                doc.add_paragraph(clean_p)
    # Save the document to the specified file
    doc.save(output_file)
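
# Usage sketch for markdown_to_docx ("summary.docx" is a hypothetical name):
#   markdown_to_docx("First paragraph.\n\nSecond paragraph.", "summary.docx")
#   # -> writes summary.docx with the two paragraphs; headings and lists are
#   #    not converted by the naive paragraph splitter above
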
# Retry with exponential backoff on transient OpenAI errors; both decorators apply.
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def summarize_meeting(filepath):
    filename = filepath
    prompt_response = []

    # Break the text of the meeting transcript into chunks of 4000 tokens.
    chunks = break_up_file_to_chunks(filename)

    # Summarize each chunk.
    for chunk in chunks:
        prompt_request = convert_to_prompt_text(chunk)
        messages = [
            {"role": "system", "content": "Summarize this meeting transcript in the same language as the user's input."},
            {"role": "user", "content": prompt_request},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=.4,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        prompt_response.append(
            response["choices"][0]["message"]["content"].strip())

    # Consolidate these meeting summaries into one prompt.
    prompt_request = "Consolidate these meeting summaries: " + \
        "\n".join(prompt_response)

    # Summarize the consolidated chunk summaries into the final minutes.
    messages = [
        {"role": "system", "content": "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text."},
        {"role": "user", "content": prompt_request},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=.4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    summary_text = response["choices"][0]["message"]["content"].strip()

    # Optionally convert the summary to a .docx file with the name
    # "Resumen-Minuta-<download-date>.docx":
    # outfilepath = "Resumen-Minuta-" + datetime.now().strftime("%d-%m-%Y-%H-%M") + ".docx"
    # markdown_to_docx(summary_text, outfilepath)
    return summary_text
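
# End-to-end sketch ("meeting.vtt" is a hypothetical path; OPENAI_API_KEY must be set):
#   clean_path = vtt_to_clean_file("meeting.vtt", no_message=True)
#   minutes_md = summarize_meeting(clean_path)
#   print(minutes_md)
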
def summarize_meeting_vtt(file):
    # Gradio passes a tempfile wrapper; clean the uploaded .vtt first so that
    # cue ids and timestamps do not waste tokens, then summarize the clean text.
    temp_file_path = file.name
    clean_file_path = vtt_to_clean_file(temp_file_path, no_message=True)
    summary_text = summarize_meeting(clean_file_path)
    return summary_text

demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # input
    inputs=gr.File(label=".vtt file"),
    # output
    outputs=[
        gr.Markdown(label="Meeting summary")
    ],
    title="Diminuteevo - Minutes Assistant",
    description="Download the meeting transcript in .vtt format and upload the file here to get a summary of the meeting so you can write your minutes.")

if __name__ == "__main__":
    demo.launch()