File size: 6,976 Bytes
1b4e9c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f70d91e
 
1b4e9c9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import os
import openai
import re
from os.path import splitext, exists
import nltk
from nltk.tokenize import word_tokenize
import gradio as gr
import backoff
import markdown
from docx import Document
from io import StringIO
from datetime import datetime
import tempfile


# Fetch the Punkt tokenizer models used by nltk.tokenize.word_tokenize.
nltk.download('punkt')

# SECURITY: the API key must be supplied through the OPENAI_API_KEY
# environment variable (e.g. a deployment secret) and must never be committed
# to source control. Any key that was previously hard-coded in this file has
# to be treated as leaked and should be revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")


def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string.

    The header line, purely numeric cue indexes, cue identifiers of the form
    ``<uuid>/<n>-<n>`` and timestamp lines are dropped; the remaining lines
    are joined with single spaces. An empty or blank file yields ``""``.

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content
    """
    # "utf-8-sig" reads plain UTF-8 and also swallows a leading byte-order
    # mark, which would otherwise hide the WEBVTT header from the check below.
    with open(filepath, "r", encoding="utf-8-sig") as fp:
        content = fp.read()

    # remove empty lines and surrounding whitespace
    lines = [line.strip() for line in content.split("\n") if line.strip()]

    # remove header; it may carry free text after "WEBVTT" (space or tab
    # separated). The emptiness guard avoids an IndexError on blank files.
    if lines:
        header = lines[0].upper()
        if header == "WEBVTT" or header.startswith(("WEBVTT ", "WEBVTT\t")):
            lines = lines[1:]

    # cue identifiers: a UUID followed by "/<digits>-<digit>"
    cue_id = re.compile(
        r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d')
    # timestamp lines: "hh:mm:ss.mmm ... hh:mm:ss.mmm"
    timestamps = re.compile(
        r"^\d{2}:\d{2}:\d{2}.\d{3}.*\d{2}:\d{2}:\d{2}.\d{3}$")

    # remove indexes, cue identifiers and timestamps in a single pass
    lines = [
        line for line in lines
        if not line.isdigit()
        and not cue_id.match(line)
        and not timestamps.match(line)
    ]

    content = " ".join(lines)

    # remove duplicate spaces
    content = re.sub(r"\s+", r" ", content)

    # add space after punctuation marks if it doesn't exist
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)

    return content


def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Write the cleaned text of a subtitle file to a text file.

    When no output path is given, one is derived from the input path by
    swapping its extension for ``.txt``; if that file already exists a
    numeric suffix (``_1``, ``_2``, ...) is appended until the name is free.

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
                                 Default is False

    Returns:
        str: path to text file
    """
    quiet = kwargs.get("no_message", False)

    target = file_out
    if not target:
        # derive a non-clashing "<input stem>[_N].txt" name
        stem = splitext(file_in)[0]
        target = "%s.txt" % stem
        attempt = 1
        while exists(target):
            target = "%s_%s.txt" % (stem, attempt)
            attempt += 1

    cleaned = clean_webvtt(file_in)
    with open(target, "w+", encoding="utf-8") as out:
        out.write(cleaned)

    if quiet:
        return target

    print("clean content is written to file: %s" % target)
    return target


def get_summary(filepath):
    """Write the cleaned text of a vtt file to a text file beside it.

    Thin wrapper around ``vtt_to_clean_file`` with its default arguments.
    Despite the name, no summarization happens here.

    Args:
        filepath (str): path to vtt file

    Returns:
        str: path to the text file that was written
    """
    return vtt_to_clean_file(filepath)


def count_tokens(filename):
    """Count the word tokens in a text file.

    Args:
        filename (str): path to a UTF-8 encoded text file

    Returns:
        int: number of tokens produced by ``word_tokenize``
    """
    # Decode explicitly as UTF-8, matching how the rest of this file reads
    # and writes transcripts, instead of relying on the platform default.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    return len(word_tokenize(text))


def break_up_file(tokens, chunk_size, overlap_size):
    """Yield consecutive, overlapping chunks of a token sequence.

    Every chunk except the last holds ``chunk_size`` tokens, and each chunk
    starts ``chunk_size - overlap_size`` tokens after the previous one. The
    last chunk holds whatever remains (at most ``chunk_size`` tokens). A
    sequence that already fits in one chunk is yielded unchanged.

    Implemented as a loop rather than recursion so the number of chunks is
    not limited by the interpreter's recursion depth and the remaining tail
    is not copied for every chunk.

    Args:
        tokens: sliceable sequence of tokens
        chunk_size (int): maximum number of tokens per chunk
        overlap_size (int): number of tokens shared by neighbouring chunks

    Yields:
        the chunks, in order

    Raises:
        ValueError: if the sequence needs more than one chunk but
            ``chunk_size`` is negative or not greater than ``overlap_size``
            (the window could never advance)
    """
    if len(tokens) <= chunk_size:
        yield tokens
        return

    step = chunk_size - overlap_size
    if chunk_size < 0 or step <= 0:
        raise ValueError(
            "chunk_size must be non-negative and greater than overlap_size")

    total = len(tokens)
    start = 0
    while total - start > chunk_size:
        yield tokens[start:start + chunk_size]
        start += step
    yield tokens[start:]


def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
    """Tokenize a text file and split the tokens into overlapping chunks.

    Args:
        filename (str): path to a UTF-8 encoded text file
        chunk_size (int, optional): maximum tokens per chunk. Default is 4000
        overlap_size (int, optional): tokens shared by neighbouring chunks.
                                      Default is 100

    Returns:
        list: chunks of tokens, as produced by ``break_up_file``
    """
    # Decode explicitly as UTF-8, matching how the rest of this file reads
    # and writes transcripts, instead of relying on the platform default.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))


def convert_to_prompt_text(tokenized_text):
    """Join tokens back into a single prompt string.

    Tokens are separated by one space; a possessive/contraction token
    ``'s`` is re-attached to the word before it.

    Args:
        tokenized_text: iterable of string tokens

    Returns:
        str: text to send as prompt
    """
    return " ".join(tokenized_text).replace(" 's", "'s")


def markdown_to_docx(md_text, output_file):
    """Render Markdown text into a .docx document.

    The Markdown is converted to HTML, which is then walked with the standard
    library HTML parser. Headings, list items, code blocks and paragraphs
    each become one entry in the document, as plain text without markup.
    (Splitting the HTML on ``</p>`` lost lists and trailing headings and left
    raw tags in the paragraph text.)

    Args:
        md_text (str): Markdown source
        output_file: destination passed to ``Document.save``
    """
    from html.parser import HTMLParser

    heading_levels = {"h%d" % n: n for n in range(1, 7)}
    list_styles = {"ul": "List Bullet", "ol": "List Number"}
    block_tags = set(heading_levels) | {"p", "li", "pre"}

    class _BlockCollector(HTMLParser):
        """Collect ``(kind, text)`` pairs for block-level elements, in order."""

        def __init__(self):
            super().__init__()
            self.blocks = []
            self._kind = None        # kind of the block being collected
            self._parts = []         # text fragments of that block
            self._open_lists = []    # stack of enclosing "ul"/"ol" tags

        def flush(self):
            """Store the text gathered so far, if any, and reset the state."""
            raw = "".join(self._parts)
            if self._kind == "pre":
                # keep the line structure of code blocks
                text = raw.strip("\n")
            else:
                # collapse soft line wraps and repeated whitespace
                text = " ".join(raw.split())
            if text.strip():
                self.blocks.append((self._kind or "p", text))
            self._parts = []
            self._kind = None

        def handle_starttag(self, tag, attrs):
            if tag in list_styles:
                self.flush()
                self._open_lists.append(tag)
            elif tag in block_tags:
                pending = "".join(self._parts).strip()
                if tag == "p" and self._kind in list_styles and not pending:
                    # <li><p>...: keep the list styling for this paragraph
                    return
                self.flush()
                if tag == "li":
                    self._kind = self._open_lists[-1] if self._open_lists else "ul"
                else:
                    self._kind = tag

        def handle_endtag(self, tag):
            if tag in list_styles:
                self.flush()
                if self._open_lists:
                    self._open_lists.pop()
            elif tag in block_tags:
                self.flush()

        def handle_data(self, data):
            if self._kind is None and not data.strip():
                return  # whitespace between blocks
            self._parts.append(data)

    # Convert the Markdown text to HTML
    html_text = markdown.markdown(md_text)

    # Extract the block-level text from the HTML
    collector = _BlockCollector()
    collector.feed(html_text)
    collector.close()
    collector.flush()  # text that was not closed by a block tag

    # Create a new Document object and add the extracted content
    doc = Document()
    for kind, text in collector.blocks:
        if kind in heading_levels:
            doc.add_heading(text, level=heading_levels[kind])
        elif kind in list_styles:
            doc.add_paragraph(text, style=list_styles[kind])
        else:
            doc.add_paragraph(text)

    # Save the document to the specified file
    doc.save(output_file)


@backoff.on_exception(
    backoff.expo,
    (openai.error.RateLimitError, openai.error.APIConnectionError),
)
def _chat_completion(system_prompt, user_content):
    """Send one system + user exchange to the chat model and return its reply.

    Retried with exponential backoff on rate-limit and connection errors.
    Keeping the retry on this single request means a transient failure does
    not repeat requests that already succeeded.

    Args:
        system_prompt (str): instructions for the model
        user_content (str): text to process

    Returns:
        str: content of the first choice, stripped of surrounding whitespace
    """
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=.4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response["choices"][0]["message"]['content'].strip()


def summarize_meeting(filepath):
    """Summarize a meeting transcript file as Markdown.

    The file is tokenized and split into overlapping chunks, each chunk is
    summarized on its own, and the partial summaries are then consolidated
    into one final summary with key points and action items.

    Args:
        filepath (str): path to the transcript file

    Returns:
        str: Markdown summary of the meeting
    """
    # NOTE(review): the file is tokenized as-is; clean_webvtt is not applied
    # on this path - confirm whether cue metadata should be stripped first.
    chunks = break_up_file_to_chunks(filepath)

    # Summarize each chunk.
    partial_summaries = [
        _chat_completion(
            "Summarize this meeting transcript in the same language as the user's input.",
            convert_to_prompt_text(chunk),
        )
        for chunk in chunks
    ]

    # Consolidate these meeting summaries. They are joined as plain text so
    # the model does not receive a Python list repr (quotes, escaped newlines).
    prompt_request = "Consolidate these meeting summaries: " + \
        "\n\n".join(partial_summaries)

    # The result is returned as Markdown; markdown_to_docx can turn it into a
    # .docx file if one is needed.
    return _chat_completion(
        "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text.",
        prompt_request,
    )


def summarize_meeting_vtt(file):
    """Gradio callback: summarize the uploaded transcript file.

    Args:
        file: value delivered by the ``gr.File`` input - an object exposing
            the uploaded file's path as ``.name``, a plain path string, or
            None when nothing was uploaded

    Returns:
        str: Markdown summary of the meeting

    Raises:
        gr.Error: if no file was uploaded
    """
    if file is None:
        # Show a readable message in the UI instead of an AttributeError.
        raise gr.Error("Carga un archivo .vtt para obtener el resumen.")

    # Fall back to the value itself when it is already a path string.
    temp_file_path = getattr(file, "name", file)
    return summarize_meeting(temp_file_path)


# Web UI: one file upload as input, the Markdown summary as output.
demo = gr.Interface(
    title="Diminuteevo - Ayudante para Minutas",
    description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión para que puedas crear tu minuta.",
    fn=summarize_meeting_vtt,
    inputs=gr.File(label="Archivo .vtt"),
    outputs=[gr.Markdown(label="Resumen de la reunión")],
)


# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()