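"""Minuteevo: a Gradio app that turns a meeting transcript (.vtt) into
meeting minutes.

Pipeline (as implemented below): clean the WebVTT file, tokenize it and
split it into overlapping chunks, summarize each chunk with gpt-3.5-turbo,
then consolidate the partial summaries with a single gpt-4 call into a
markdown minute (brief summary, highlights, and an action-item table).
"""
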
import os
import re
from os.path import exists, splitext

import backoff
import gradio as gr
import nltk
import openai
from nltk.tokenize import word_tokenize


# word_tokenize below needs the NLTK "punkt" tokenizer models.
nltk.download("punkt", quiet=True)
# The OpenAI key is read from the OPENAI_API_KEY environment variable.
openai.api_key = os.getenv("OPENAI_API_KEY")


def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content
    """
    # read file content
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # remove header & empty lines
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    if lines and lines[0].upper() == "WEBVTT":
        lines = lines[1:]

    # remove numeric cue indexes
    lines = [line for line in lines if not line.isdigit()]

    # remove GUID-style cue identifiers (as emitted by Teams transcripts)
    pattern = r"[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}/\d+-\d"
    lines = [line for line in lines if not re.match(pattern, line)]

    # remove timestamp lines ("hh:mm:ss.mmm --> hh:mm:ss.mmm")
    pattern = r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$"
    lines = [line for line in lines if not re.match(pattern, line)]

    content = " ".join(lines)

    # remove duplicate spaces
    pattern = r"\s+"
    content = re.sub(pattern, r" ", content)

    # add space after punctuation marks if it doesn't exist
    pattern = r"([\.!?])(\w)"
    content = re.sub(pattern, r"\1 \2", content)

    return content
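

# A sketch of the effect on a (hypothetical) Teams-style cue block:
#
#   1b2f3a4c-5d6e-7f89-0a1b-2c3d4e5f6a7b/12-0
#   00:00:03.500 --> 00:00:06.000
#   Hello everyone.Let's start.
#
# becomes "Hello everyone. Let's start.": the cue id and timestamp lines
# match the patterns above and are dropped, and a space is inserted after
# the period.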


def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save the cleaned content of a subtitle file to a text file

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to output text file; generated
            from file_in when omitted
        **kwargs (optional): extra options
            - no_message (bool): do not print the result message.
                                 Default is False

    Returns:
        str: path to text file
    """
    no_message = kwargs.get("no_message", False)
    if not file_out:
        filename = splitext(file_in)[0]
        file_out = f"{filename}.txt"
        # avoid clobbering an existing file: append a numeric suffix
        i = 0
        while exists(file_out):
            i += 1
            file_out = f"{filename}_{i}.txt"

    content = clean_webvtt(file_in)
    with open(file_out, "w", encoding="utf-8") as fp:
        fp.write(content)
    if not no_message:
        print(f"clean content is written to file: {file_out}")

    return file_out
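

# Example (hypothetical path): vtt_to_clean_file("meeting.vtt") writes the
# cleaned text to meeting.txt (or meeting_1.txt if that already exists)
# and returns the path it wrote to.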


def break_up_file(tokens, chunk_size, overlap_size):
    """Yield overlapping chunks of `tokens`, each at most `chunk_size` long.

    Consecutive chunks share `overlap_size` tokens so context that straddles
    a chunk boundary is not lost.
    """
    start = 0
    while True:
        yield tokens[start:start + chunk_size]
        if start + chunk_size >= len(tokens):
            break
        start += chunk_size - overlap_size
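

# For example, chunk_size=5 and overlap_size=2 over 8 tokens yields
# tokens[0:5] and tokens[3:8]: the second chunk re-reads the last two
# tokens of the first, so a sentence cut at the boundary keeps context.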


def break_up_file_to_chunks(filename, chunk_size=3000, overlap_size=100):
    """Read a text file and return it as a list of overlapping token chunks."""
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))


def convert_to_prompt_text(tokenized_text):
    # drop every token that contains a digit (cue numbers, stray timestamps)
    tokenized_text = [x for x in tokenized_text if not any(c.isdigit() for c in x)]
    prompt_text = " ".join(tokenized_text)
    # re-attach the 's that word_tokenize splits off
    prompt_text = prompt_text.replace(" 's", "'s")
    return prompt_text
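

# e.g. ["Let", "'s", "meet", "at", "10am"] -> "Let's meet at": the token
# containing a digit is dropped, and the split-off 's is re-attached.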


@backoff.on_exception(
    backoff.expo,
    (openai.error.RateLimitError, openai.error.APIConnectionError),
)
def summarize_meeting(filepath):
    """Map-reduce summarization: summarize each transcript chunk, then
    consolidate the partial summaries into a final markdown minute."""
    prompt_response = []
    # Break the text of the meeting transcript into overlapping chunks.
    chunks = break_up_file_to_chunks(filepath)
    # Summarize each chunk (the prompts are in Spanish, matching the app's
    # audience; the model is told to answer in the transcript's language).
    for i, chunk in enumerate(chunks):
        print(f"summarizing chunk {i + 1}/{len(chunks)}")
        prompt_request = convert_to_prompt_text(chunk)
        prompt_request = "Resume brevemente esta transcripción de la reunión en el mismo idioma que la entrada del usuario: " + prompt_request

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt_request}
            ],
            temperature=.3
        )

        prompt_response.append(response["choices"][0]["message"]['content'].strip())

    # Condense each chunk summary further before the final consolidation.
    consolidated_summary = []
    for summary in prompt_response:
        prompt_request = "Resume el siguiente texto: " + summary
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt_request}
            ],
            temperature=.1,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        consolidated_summary.append(response["choices"][0]["message"]['content'].strip())

    # Consolidate everything into the final minute with GPT-4.
    final_summary_request = " ".join(consolidated_summary)

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "system", "content": "Consolidar y resumir el texto de las transcripciones de la reunión. El formato de salida debe ser markdown en el mismo idioma que la entrada del usuario. Comenzar con un resumen breve de la reunión, continuar con puntos destacados que describan los aspectos más importantes de la discusión. Finalmente, proporcionar una tabla para mostrar la lista de acciones con 3 columnas: Acción, Persona Asignada, Fecha de Vencimiento."},
            {"role": "user", "content": final_summary_request}
        ],
        temperature=.1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    final_summary = response["choices"][0]["message"]["content"].strip()

    return final_summary


def summarize_meeting_vtt(file):
    """Gradio handler: summarize the uploaded .vtt file."""
    # Gradio passes a tempfile-like object; .name is its path on disk.
    summary_text = summarize_meeting(file.name)

    return summary_text


demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # input
    inputs=gr.File(label="Archivo .vtt"),
    # output
    outputs=[
        gr.Markdown(label="Resumen de la reunión")
    ],
    title="Minuteevo - Ayudante para Minutas",
    description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión para que puedas crear tu minuta.")


if __name__ == "__main__":
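    # Passing share=True (a standard Gradio option) would additionally
    # expose a temporary public URL: demo.launch(share=True).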
    demo.launch()