|
import os
import re
from os.path import exists, splitext

import backoff
import gradio as gr
import markdown
import nltk
import openai
from docx import Document
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Read the API key from the environment; never hard-code secrets in source.
openai.api_key = os.getenv("OPENAI_API_KEY")


def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string.

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content
    """
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # Drop empty lines and the leading "WEBVTT" header.
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    if lines and lines[0].upper() == "WEBVTT":
        lines = lines[1:]

    # Drop cue numbers.
    lines = [line for line in lines if not line.isdigit()]

    # Drop cue identifiers of the form "<uuid>/<n>-<n>".
    pattern = r"[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}/\d+-\d"
    lines = [line for line in lines if not re.match(pattern, line)]

    # Drop timestamp lines such as "00:00:01.000 --> 00:00:03.000".
    pattern = r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$"
    lines = [line for line in lines if not re.match(pattern, line)]

    content = " ".join(lines)

    # Collapse runs of whitespace into single spaces.
    content = re.sub(r"\s+", " ", content)

    # Ensure a space after sentence-ending punctuation.
    content = re.sub(r"([.!?])(\w)", r"\1 \2", content)

    return content
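
# Illustrative example (not executed): given a VTT file containing
#
#   WEBVTT
#   1
#   00:00:01.000 --> 00:00:03.000
#   Hello everyone.
#   2
#   00:00:03.500 --> 00:00:05.000
#   Let's get started.
#
# clean_webvtt returns: "Hello everyone. Let's get started."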
|
|
|
|
|
def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save the clean content of a subtitle file to a text file.

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not print the result message.
              Default is False

    Returns:
        str: path to text file
    """
    no_message = kwargs.get("no_message", False)
    if not file_out:
        filename = splitext(file_in)[0]
        file_out = "%s.txt" % filename
        # Avoid overwriting an existing file by appending a counter.
        i = 0
        while exists(file_out):
            i += 1
            file_out = "%s_%s.txt" % (filename, i)

    content = clean_webvtt(file_in)
    with open(file_out, "w+", encoding="utf-8") as fp:
        fp.write(content)
    if not no_message:
        print("clean content is written to file: %s" % file_out)

    return file_out
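
# Example usage (assumes a local "meeting.vtt" exists):
#   vtt_to_clean_file("meeting.vtt")   # writes meeting.txt
#   vtt_to_clean_file("meeting.vtt")   # writes meeting_1.txt, no overwrite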
|
|
|
|
|
def get_summary(filepath):
    """Clean a .vtt file to a text file and return the text file's path."""
    return vtt_to_clean_file(filepath)


def count_tokens(filename):
    """Count word tokens in a text file.

    Note: nltk word tokens only approximate the counts of the model's own
    tokenizer.
    """
    with open(filename, 'r') as f:
        text = f.read()
    tokens = word_tokenize(text)
    return len(tokens)
|
|
|
|
|
def break_up_file(tokens, chunk_size, overlap_size):
    """Yield successive token chunks, each overlapping the previous one."""
    if len(tokens) <= chunk_size:
        yield tokens
    else:
        yield tokens[:chunk_size]
        # Recurse on the remainder, keeping overlap_size tokens of context.
        yield from break_up_file(tokens[chunk_size - overlap_size:],
                                 chunk_size, overlap_size)


def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
    """Tokenize a text file and split it into overlapping chunks."""
    with open(filename, 'r') as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))
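
# Chunking arithmetic (illustrative): with chunk_size=4000 and
# overlap_size=100, chunk i starts at token i * 3900, so the chunks cover
# tokens 0-3999, 3900-7899, 7800-11799, ... and each chunk repeats the last
# 100 tokens of the previous one for context.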
|
|
|
|
|
def convert_to_prompt_text(tokenized_text):
    """Re-join tokens into prose, undoing the tokenizer's "'s" split."""
    prompt_text = " ".join(tokenized_text)
    prompt_text = prompt_text.replace(" 's", "'s")
    return prompt_text
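
# Illustrative round trip: word_tokenize("It's done.") yields
# ['It', "'s", 'done', '.'], and convert_to_prompt_text rejoins that as
# "It's done ." (punctuation stays space-separated with this simple join).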
|
|
|
|
|
def markdown_to_docx(md_text, output_file):
    """Convert markdown text to a .docx file, paragraph by paragraph.

    Note: this is a naive conversion that only extracts <p> blocks from
    the rendered HTML; other markup (headings, lists) is not handled.
    """
    html_text = markdown.markdown(md_text)

    doc = Document()
    for p in html_text.split('</p>'):
        if '<p>' in p:
            clean_p = p.replace('<p>', '').strip()
            if clean_p:
                doc.add_paragraph(clean_p)

    doc.save(output_file)
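
# Example usage (hypothetical file name):
#   markdown_to_docx("First point.\n\nSecond point.", "minutes.docx")
# would write a two-paragraph minutes.docx.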
|
|
|
|
|
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def summarize_meeting(filepath):
    """Summarize a transcript file chunk by chunk, then consolidate."""
    prompt_response = []
    chunks = break_up_file_to_chunks(filepath)

    # First pass: summarize each chunk independently.
    for chunk in chunks:
        prompt_request = convert_to_prompt_text(chunk)

        messages = [
            {"role": "system",
             "content": "Summarize this meeting transcript in the same "
                        "language as the user's input."},
            {"role": "user", "content": prompt_request},
        ]

        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=.4,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        prompt_response.append(
            response["choices"][0]["message"]['content'].strip())

    # Second pass: consolidate the per-chunk summaries into one document.
    prompt_request = ("Consolidate these meeting summaries: "
                      + "\n\n".join(prompt_response))

    messages = [
        {"role": "system",
         "content": "Summarize the text of the meeting transcripts. The "
                    "output format should be markdown in the same language "
                    "as the user's input. Start with a brief summary of "
                    "the meeting, continue with bullets outlining the most "
                    "important points of discussion. Finally, provide a "
                    "list of action items with a due date from the "
                    "provided meeting transcript text."},
        {"role": "user", "content": prompt_request},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=.4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    summary_text = response["choices"][0]["message"]['content'].strip()
    return summary_text
|
|
|
|
|
def summarize_meeting_vtt(file):
    """Gradio handler: clean the uploaded .vtt file, then summarize it."""
    # Clean the transcript first so timestamps and cue metadata do not
    # consume tokens in the prompts.
    clean_path = vtt_to_clean_file(file.name, no_message=True)
    return summarize_meeting(clean_path)
|
|
|
|
|
demo = gr.Interface(
    fn=summarize_meeting_vtt,
    inputs=gr.File(label=".vtt file"),
    outputs=[
        gr.Markdown(label="Meeting summary")
    ],
    title="Diminuteevo - Minutes Assistant",
    description="Download the meeting transcript in .vtt format and "
                "upload the file here to get a meeting summary you can "
                "use to create your minutes.")


if __name__ == "__main__":
    demo.launch()
|
|