|
import os |
|
import openai |
|
import re |
|
from os.path import splitext, exists |
|
import nltk |
|
from nltk.tokenize import word_tokenize |
|
import gradio as gr |
|
import backoff |
|
import markdown |
|
from docx import Document |
|
from io import StringIO |
|
from datetime import datetime |
|
import tempfile |
|
|
|
|
|
# Fetch the 'punkt' tokenizer models that nltk.word_tokenize depends on.
nltk.download('punkt')

# Read the OpenAI API key from the environment; API calls below will fail
# if OPENAI_API_KEY is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")
|
|
|
|
|
def clean_webvtt(filepath: str) -> str:
    """Clean the content of a WebVTT subtitle file into a plain string.

    Strips the WEBVTT header, numeric cue counters, UUID cue identifiers
    and timestamp lines, then joins the remaining caption text into one
    whitespace-normalized string.

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content ("" for an empty or whitespace-only file)
    """
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # Drop blank lines and surrounding whitespace.
    lines = [line.strip() for line in content.split("\n") if line.strip()]

    # Fix: the original indexed lines[0] unconditionally and raised
    # IndexError on an empty file.
    if not lines:
        return ""

    # Remove the "WEBVTT" header line, if present.
    if lines[0].upper() == "WEBVTT":
        lines = lines[1:]

    # Remove purely numeric cue-counter lines.
    lines = [line for line in lines if not line.isdigit()]

    # Remove cue identifiers of the form "<uuid>/<n>-<n>" (as emitted by
    # e.g. Teams transcripts).
    uuid_pattern = re.compile(
        r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d')
    lines = [line for line in lines if not uuid_pattern.match(line)]

    # Remove timestamp lines such as "00:00:01.000 --> 00:00:03.000".
    # Fix: dots are now escaped; the original pattern's bare "." matched
    # any character in the milliseconds separator position.
    ts_pattern = re.compile(
        r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$")
    lines = [line for line in lines if not ts_pattern.match(line)]

    content = " ".join(lines)

    # Collapse runs of whitespace into single spaces.
    content = re.sub(r"\s+", " ", content)

    # Ensure a space after sentence-ending punctuation glued to the next word.
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)

    return content
|
|
|
|
|
def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Write the cleaned content of a subtitle file to a text file.

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): destination text-file path; when omitted,
            a non-clobbering "<stem>.txt" / "<stem>_<i>.txt" name is derived
            from ``file_in``
        **kwargs (optional): extra options
            - no_message (bool): suppress the confirmation message.
              Default is False

    Returns:
        str: path of the text file that was written
    """
    quiet = kwargs.get("no_message", False)

    if not file_out:
        stem = splitext(file_in)[0]
        file_out = "%s.txt" % stem
        suffix = 0
        # Keep bumping the numeric suffix until the name is unused.
        while exists(file_out):
            suffix += 1
            file_out = "%s_%s.txt" % (stem, suffix)

    # Clean first, so a parse failure never truncates an existing file.
    content = clean_webvtt(file_in)
    with open(file_out, "w+", encoding="utf-8") as fp:
        fp.write(content)

    if not quiet:
        print("clean content is written to file: %s" % file_out)

    return file_out
|
|
|
|
|
def break_up_file(tokens, chunk_size, overlap_size):
    """Split a token sequence into overlapping chunks.

    Consecutive chunks share ``overlap_size`` tokens so that context is
    not lost at chunk boundaries; the final chunk holds whatever remains.

    Args:
        tokens: sequence of tokens to split
        chunk_size: maximum number of tokens per chunk
        overlap_size: number of tokens shared between consecutive chunks

    Yields:
        slices of ``tokens``, each at most ``chunk_size`` long

    Raises:
        ValueError: if splitting is needed and ``overlap_size`` is not
            smaller than ``chunk_size`` (the recursive original never
            terminated in that case).
    """
    step = chunk_size - overlap_size
    if len(tokens) > chunk_size and step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Iterative rather than recursive: the original used one stack frame
    # per chunk and could hit RecursionError on long transcripts.
    start = 0
    while len(tokens) - start > chunk_size:
        yield tokens[start:start + chunk_size]
        start += step
    yield tokens[start:]
|
|
|
|
|
def break_up_file_to_chunks(filename, chunk_size=3000, overlap_size=100):
    """Read a text file and split it into overlapping token chunks.

    Args:
        filename: path to a UTF-8 text file
        chunk_size: maximum tokens per chunk (default 3000)
        overlap_size: tokens shared between consecutive chunks (default 100)

    Returns:
        list: token-list chunks produced by ``break_up_file``
    """
    # Fix: explicit encoding — the original relied on the platform default,
    # which corrupts non-ASCII transcripts on e.g. cp1252 systems.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()

    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))
|
|
|
|
|
def convert_to_prompt_text(tokenized_text):
    """Join a list of tokens back into prompt-ready text.

    Tokens containing any digit are dropped, and the tokenizer's detached
    possessive (" 's") is re-attached to the preceding word.

    Args:
        tokenized_text: iterable of token strings

    Returns:
        str: space-joined prompt text
    """
    kept = (token for token in tokenized_text
            if not any(char.isdigit() for char in token))
    text = " ".join(kept)
    return text.replace(" 's", "'s")
|
|
|
|
|
def _chat_completion(model, messages, **params):
    """Call the OpenAI chat API once and return the stripped message text."""
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        **params
    )
    return response["choices"][0]["message"]['content'].strip()


@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def summarize_meeting(filepath):
    """Summarize a cleaned meeting transcript into a markdown minute.

    Three stages: the transcript is tokenized and split into overlapping
    chunks, each chunk is summarized with gpt-3.5-turbo, each chunk summary
    is condensed once more, and gpt-4 consolidates everything into a final
    markdown summary (brief recap, highlights, action-item table — prompts
    are in Spanish but ask for the input's own language). Retries with
    exponential backoff on rate-limit / connection errors.

    Args:
        filepath: path to the transcript text file

    Returns:
        str: final markdown summary
    """
    # Fix: removed leftover debug prints (filepath, chunk index, raw chunk,
    # full prompt) that spammed stdout and leaked transcript content.
    chunks = break_up_file_to_chunks(filepath)

    # Stage 1: summarize each chunk independently.
    prompt_response = []
    for chunk in chunks:
        prompt_request = ("Resume brevemente esta transcripción de la reunión en el mismo idioma que la entrada del usuario: "
                          + convert_to_prompt_text(chunk))
        prompt_response.append(_chat_completion(
            "gpt-3.5-turbo",
            [{"role": "user", "content": prompt_request}],
            temperature=.3
        ))

    # Stage 2: condense each chunk summary once more.
    consolidated_summary = []
    for summary in prompt_response:
        consolidated_summary.append(_chat_completion(
            "gpt-3.5-turbo",
            [{"role": "user", "content": "Resume el siguiente texto: " + summary}],
            temperature=.1,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        ))

    # Stage 3: consolidate everything into one markdown minute with gpt-4.
    final_summary = _chat_completion(
        "gpt-4",
        [{"role": "system", "content": "Consolidar y resumir el texto de las transcripciones de la reunión. El formato de salida debe ser markdown en el mismo idioma que la entrada del usuario. Comenzar con un resumen breve de la reunión, continuar con puntos destacados que describan los aspectos más importantes de la discusión. Finalmente, proporcionar una tabla para mostrar la lista de acciones con 3 columnas: Acción, Persona Asignada, Fecha de Vencimiento."},
         {"role": "user", "content": " ".join(consolidated_summary)}],
        temperature=.1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    return final_summary
|
|
|
|
|
def summarize_meeting_vtt(file):
    """Gradio handler: summarize an uploaded transcript file.

    Args:
        file: Gradio file object exposing its temp-file path as ``name``

    Returns:
        str: markdown meeting summary
    """
    return summarize_meeting(file.name)
|
|
|
|
|
# Gradio UI: upload a .vtt transcript, receive a markdown meeting summary.
# (Labels/description are user-facing Spanish strings; left untouched.)
demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # Input: the meeting subtitle (.vtt) file to summarize.
    inputs=gr.File(label="Archivo .vtt"),
    # Output: the consolidated summary rendered as markdown.
    outputs=[
        gr.Markdown(label="Resumen de la reunión")
    ],
    title="Minuteevo - Ayudante para Minutas",
    description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión para que puedas crear tu minuta.")
|
|
|
|
|
if __name__ == "__main__":
    # Launch the Gradio web app when run as a script (not on import).
    demo.launch()
|
|