# Minuteevo / app.py
# Gradio app: cleans a meeting transcript in WebVTT (.vtt) format and
# summarizes it into meeting minutes with GPT-4.
import os
import openai
import re
from os.path import splitext, exists
import nltk
from nltk.tokenize import word_tokenize
import gradio as gr
import backoff
import markdown
from docx import Document
from datetime import datetime

nltk.download('punkt')

# Read the OpenAI API key from the environment instead of hard-coding it in source.
openai.api_key = os.getenv("OPENAI_API_KEY")

def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content
    """
    # read file content
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # remove header & empty lines
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    lines = lines[1:] if lines and lines[0].upper() == "WEBVTT" else lines

    # remove numeric cue indexes
    lines = [line for line in lines if not line.isdigit()]

    # remove cue identifiers (GUID-style voice tags, e.g. "<uuid>/<n>-<n>")
    pattern = r"[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}/\d+-\d"
    lines = [line for line in lines if not re.match(pattern, line)]

    # remove timestamp lines ("HH:MM:SS.mmm --> HH:MM:SS.mmm")
    pattern = r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$"
    lines = [line for line in lines if not re.match(pattern, line)]

    content = " ".join(lines)

    # collapse runs of whitespace into single spaces
    content = re.sub(r"\s+", " ", content)

    # add a space after punctuation marks where it is missing
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)
    return content
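
# Usage sketch for clean_webvtt ("meeting.vtt" is a hypothetical path):
#   text = clean_webvtt("meeting.vtt")
#   print(text[:200])  # first 200 characters of the flattened transcript
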
def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save clean content of a subtitle file to a text file

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
              Default is False

    Returns:
        str: path to text file
    """
    # set default values
    no_message = kwargs.get("no_message", False)
    if not file_out:
        filename = splitext(file_in)[0]
        file_out = "%s.txt" % filename
        # avoid overwriting an existing file by appending a counter
        i = 0
        while exists(file_out):
            i += 1
            file_out = "%s_%s.txt" % (filename, i)

    content = clean_webvtt(file_in)
    with open(file_out, "w", encoding="utf-8") as fp:
        fp.write(content)
    if not no_message:
        print("clean content is written to file: %s" % file_out)
    return file_out
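
# Usage sketch for vtt_to_clean_file ("meeting.vtt" is a hypothetical path):
#   txt_path = vtt_to_clean_file("meeting.vtt", no_message=True)
#   # -> writes "meeting.txt" ("meeting_1.txt" if that exists) and returns the path
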
def get_summary(filepath):
    # convenience wrapper: clean the vtt file and return the path of the text file
    return vtt_to_clean_file(filepath)

def count_tokens(filename):
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return len(tokens)

def break_up_file(tokens, chunk_size, overlap_size):
    """Yield chunks of at most chunk_size tokens, overlapping by overlap_size."""
    if len(tokens) <= chunk_size:
        yield tokens
    else:
        yield tokens[:chunk_size]
        # recurse on the remainder, re-including the last overlap_size tokens for context
        yield from break_up_file(tokens[chunk_size - overlap_size:], chunk_size, overlap_size)

def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))
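
# Chunking sketch: with chunk_size=4000 and overlap_size=100, chunks start at
# token offsets 0, 3900, 7800, ... so consecutive chunks share 100 tokens of
# context. An illustration with toy sizes:
#   list(break_up_file(list(range(10)), chunk_size=4, overlap_size=1))
#   # -> [[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]]
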
def convert_to_prompt_text(tokenized_text):
    # join tokens back into a string and undo NLTK's possessive splitting
    prompt_text = " ".join(tokenized_text)
    prompt_text = prompt_text.replace(" 's", "'s")
    return prompt_text
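
# Example of the possessive fix-up: word_tokenize splits "'s" into its own token, so
#   convert_to_prompt_text(["the", "team", "'s", "plan"])
#   # -> "the team's plan"
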
def markdown_to_docx(md_text, output_file):
    # Convert the Markdown text to HTML
    html_text = markdown.markdown(md_text)
    # Create a new Document object
    doc = Document()
    # Naively split the HTML on closing paragraph tags and add each paragraph's
    # text to the .docx; markup other than <p> is not handled by this splitter.
    for p in html_text.split('</p>'):
        if '<p>' in p:
            clean_p = p.replace('<p>', '').strip()
            if clean_p:
                doc.add_paragraph(clean_p)
    # Save the document to the specified file
    doc.save(output_file)
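
# Usage sketch for markdown_to_docx ("summary.docx" is a hypothetical name):
#   markdown_to_docx("First paragraph.\n\nSecond paragraph.", "summary.docx")
#   # -> writes summary.docx with the two paragraphs; headings and lists are
#   #    not converted by the naive paragraph splitter above
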
# Retry with exponential backoff on transient OpenAI errors; both decorators apply.
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def summarize_meeting(filepath):
    filename = filepath
    prompt_response = []

    # Break the text of the meeting transcript into chunks of 4000 tokens.
    chunks = break_up_file_to_chunks(filename)

    # Summarize each chunk.
    for chunk in chunks:
        prompt_request = convert_to_prompt_text(chunk)
        messages = [
            {"role": "system", "content": "Summarize this meeting transcript in the same language as the user's input."},
            {"role": "user", "content": prompt_request},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=.4,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        prompt_response.append(
            response["choices"][0]["message"]["content"].strip())

    # Consolidate these meeting summaries into one prompt.
    prompt_request = "Consolidate these meeting summaries: " + \
        "\n".join(prompt_response)

    # Summarize the consolidated chunk summaries into the final minutes.
    messages = [
        {"role": "system", "content": "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text."},
        {"role": "user", "content": prompt_request},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=.4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    summary_text = response["choices"][0]["message"]["content"].strip()

    # Optionally convert the summary to a .docx file with the name
    # "Resumen-Minuta-<download-date>.docx":
    # outfilepath = "Resumen-Minuta-" + datetime.now().strftime("%d-%m-%Y-%H-%M") + ".docx"
    # markdown_to_docx(summary_text, outfilepath)
    return summary_text
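
# End-to-end sketch ("meeting.vtt" is a hypothetical path; OPENAI_API_KEY must be set):
#   clean_path = vtt_to_clean_file("meeting.vtt", no_message=True)
#   minutes_md = summarize_meeting(clean_path)
#   print(minutes_md)
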
def summarize_meeting_vtt(file):
    # Gradio passes a tempfile wrapper; clean the uploaded .vtt first so that
    # cue ids and timestamps do not waste tokens, then summarize the clean text.
    temp_file_path = file.name
    clean_file_path = vtt_to_clean_file(temp_file_path, no_message=True)
    summary_text = summarize_meeting(clean_file_path)
    return summary_text

demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # input
    inputs=gr.File(label=".vtt file"),
    # output
    outputs=[
        gr.Markdown(label="Meeting summary")
    ],
    title="Diminuteevo - Minutes Assistant",
    description="Download the meeting transcript in .vtt format and upload the file here to get a summary of the meeting so you can write your minutes.")

if __name__ == "__main__":
    demo.launch()