# Hugging Face Space: nateevo / Minuteevo
# Space status: Runtime error
# File size: 6,976 bytes

import os
import openai
import re
from os.path import splitext, exists
import nltk
from nltk.tokenize import word_tokenize
import gradio as gr
import backoff
import markdown
from docx import Document
from io import StringIO
from datetime import datetime
import tempfile


# Fetch the "punkt" tokenizer data used by nltk.tokenize.word_tokenize.
nltk.download('punkt')

# SECURITY: never commit an API key to source control. The key is read only
# from the OPENAI_API_KEY environment variable (for example a deployment
# secret). Any key that was ever committed to this file must be treated as
# leaked and revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")


def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string

    The leading "WEBVTT" header, purely numeric cue indexes, cue identifiers
    of the form ``<uuid>/<n>-<n>`` and timestamp lines are dropped. The
    remaining lines are joined with single spaces, and a space is inserted
    after ``.``, ``!`` or ``?`` when a word character follows directly.

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content (an empty string when the file holds no text)
    """
    # "utf-8-sig" reads plain UTF-8 too, but also drops a leading BOM that
    # would otherwise keep the first line from comparing equal to "WEBVTT".
    with open(filepath, "r", encoding="utf-8-sig") as fp:
        content = fp.read()

    # remove empty lines, then the header (guarding against an empty file)
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    if lines and lines[0].upper() == "WEBVTT":
        lines = lines[1:]

    # cue identifiers such as "<uuid>/12-0"
    cue_id = re.compile(
        r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d')
    # timestamp lines such as "00:00:01.000 --> 00:00:02.000"
    timestamp = re.compile(
        r"^\d{2}:\d{2}:\d{2}.\d{3}.*\d{2}:\d{2}:\d{2}.\d{3}$")

    # drop indexes, cue identifiers and timestamps in a single pass
    lines = [
        line for line in lines
        if not (line.isdigit() or cue_id.match(line) or timestamp.match(line))
    ]

    content = " ".join(lines)

    # remove duplicate spaces
    content = re.sub(r"\s+", r" ", content)

    # add space after punctuation marks if it doesn't exist
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)

    return content


def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Write the cleaned text of a subtitle file to a text file.

    When no output path is given, one is derived from ``file_in`` by
    swapping its extension for ``.txt``; if that path already exists a
    numeric suffix (``_1``, ``_2``, ...) is appended until an unused name
    is found.

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
                                 Default is False

    Returns:
        str: path to text file
    """
    quiet = kwargs.get("no_message", False)

    if not file_out:
        stem, _ext = splitext(file_in)
        file_out = "%s.txt" % stem
        attempt = 1
        # keep probing numbered names until one is free
        while exists(file_out):
            file_out = "%s_%s.txt" % (stem, attempt)
            attempt += 1

    cleaned_text = clean_webvtt(file_in)
    with open(file_out, "w+", encoding="utf-8") as out_fp:
        out_fp.write(cleaned_text)

    if quiet:
        return file_out

    print("clean content is written to file: %s" % file_out)
    return file_out


def get_summary(filepath):
    """Write the cleaned text of a vtt file to a text file.

    Delegates to ``vtt_to_clean_file`` with its default options, so the
    output path is chosen automatically and a message is printed. Despite
    the name, nothing is returned.

    Args:
        filepath (str): path to vtt file
    """
    vtt_to_clean_file(filepath)


def count_tokens(filename):
    """Count the word tokens in a text file.

    Args:
        filename (str): path to the text file

    Returns:
        int: number of tokens produced by ``word_tokenize``
    """
    # Read explicitly as UTF-8, matching how the other helpers in this file
    # read and write transcripts, instead of the platform default encoding.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    return len(word_tokenize(text))


def break_up_file(tokens, chunk_size, overlap_size):
    """Yield consecutive chunks of ``tokens`` that overlap each other.

    Each chunk but the last holds ``chunk_size`` tokens, and each chunk
    starts ``chunk_size - overlap_size`` tokens after the previous one. The
    last chunk holds whatever remains (at most ``chunk_size`` tokens). A
    sequence that already fits in one chunk is yielded as a single chunk,
    including an empty sequence.

    Implemented as a loop rather than recursion so that inputs producing
    many chunks cannot exhaust the interpreter's recursion limit.

    Args:
        tokens (list): sequence of tokens to split
        chunk_size (int): maximum number of tokens per chunk
        overlap_size (int): number of tokens shared by neighbouring chunks

    Yields:
        list: the next chunk of tokens

    Raises:
        ValueError: if more than one chunk is needed and ``overlap_size``
            is not smaller than ``chunk_size`` (the window would never
            advance).
    """
    if len(tokens) <= chunk_size:
        yield tokens
        return

    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError(
            "overlap_size must be smaller than chunk_size "
            "(got chunk_size=%r, overlap_size=%r)" % (chunk_size, overlap_size))

    start = 0
    # Emit full-size windows while more than one chunk's worth remains.
    while len(tokens) - start > chunk_size:
        yield tokens[start:start + chunk_size]
        start += step
    # Whatever is left fits into a final chunk.
    yield tokens[start:]


def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
    """Tokenize a text file and split the tokens into overlapping chunks.

    Args:
        filename (str): path to the text file
        chunk_size (int, optional): maximum tokens per chunk. Default 4000
        overlap_size (int, optional): tokens shared by neighbouring chunks.
            Default 100

    Returns:
        list: list of chunks, each a list of tokens
    """
    # Read explicitly as UTF-8, matching how the other helpers in this file
    # read and write transcripts, instead of the platform default encoding.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))


def convert_to_prompt_text(tokenized_text):
    """Join tokens back into a single prompt string.

    Tokens are separated by single spaces; a possessive ``'s`` token is
    re-attached to the word before it.

    Args:
        tokenized_text (list): tokens to join

    Returns:
        str: text ready to be sent as a prompt
    """
    return " ".join(tokenized_text).replace(" 's", "'s")


def markdown_to_docx(md_text, output_file):
    """Render Markdown text into a .docx file, one paragraph per ``<p>``.

    The Markdown is converted to HTML, the HTML is split on ``</p>``, and
    every piece containing ``<p>`` is added as a paragraph with the ``<p>``
    tag removed. Other HTML tags inside such a piece are kept verbatim;
    pieces without a ``<p>`` tag are skipped.

    Args:
        md_text (str): Markdown source
        output_file (str): path of the .docx file to write
    """
    rendered_html = markdown.markdown(md_text)
    document = Document()

    for fragment in rendered_html.split('</p>'):
        if '<p>' not in fragment:
            continue
        paragraph_text = fragment.replace('<p>', '').strip()
        if not paragraph_text:
            continue
        document.add_paragraph(paragraph_text)

    document.save(output_file)


@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def _chat_completion(system_prompt, user_content):
    """Send one system + user exchange to the chat model and return the reply.

    Retries with exponential backoff on rate-limit and connection errors.
    Retrying at the level of a single request means a failure does not
    repeat requests that already succeeded.

    Args:
        system_prompt (str): content of the system message
        user_content (str): content of the user message

    Returns:
        str: stripped text of the first choice in the response
    """
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=.4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response["choices"][0]["message"]['content'].strip()


def summarize_meeting(filepath):
    """Summarize a meeting transcript file as Markdown.

    The transcript is tokenized and split into overlapping chunks, each
    chunk is summarized on its own, and the partial summaries are then
    consolidated by a final request into one Markdown summary.

    Args:
        filepath (str): path to the transcript text file

    Returns:
        str: Markdown summary returned by the model
    """
    # Break the text of the meeting transcripts into chunks of 4000 tokens.
    chunks = break_up_file_to_chunks(filepath)

    # Summarize each chunk.
    chunk_prompt = "Summarize this meeting transcript in the same language as the user's input."
    partial_summaries = [
        _chat_completion(chunk_prompt, convert_to_prompt_text(chunk))
        for chunk in chunks
    ]

    # Consolidate these meeting summaries.
    consolidate_request = "Consolidate these meeting summaries: " + \
        str(partial_summaries)

    # Summarize the text of the meeting transcripts.
    final_prompt = "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text."
    return _chat_completion(final_prompt, consolidate_request)


def summarize_meeting_vtt(file):
    """Summarize an uploaded transcript file.

    Args:
        file: uploaded file object; only its ``name`` attribute (the path
            on disk) is used

    Returns:
        str: summary text produced by ``summarize_meeting``
    """
    return summarize_meeting(file.name)


# Gradio UI: a single file upload goes in, the summary comes out as Markdown.
demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # input: the uploaded file object is passed to summarize_meeting_vtt
    inputs=gr.File(label="Archivo .vtt"),
    # output: the returned summary text is rendered as Markdown
    outputs=[
        gr.Markdown(label="Resumen de la reunión")
    ],
    title="Diminuteevo - Ayudante para Minutas",
    description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión para que puedas crear tu minuta.")


# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()