import os import openai import re from os.path import splitext, exists import nltk from nltk.tokenize import word_tokenize import gradio as gr import backoff import markdown from docx import Document from io import StringIO from datetime import datetime import tempfile nltk.download('punkt') os.environ["OPENAI_API_KEY"] = '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' openai.api_key = os.getenv("OPENAI_API_KEY") def clean_webvtt(filepath: str) -> str: """Clean up the content of a subtitle file (vtt) to a string Args: filepath (str): path to vtt file Returns: str: clean content """ # read file content with open(filepath, "r", encoding="utf-8") as fp: content = fp.read() # remove header & empty lines lines = [line.strip() for line in content.split("\n") if line.strip()] lines = lines[1:] if lines[0].upper() == "WEBVTT" else lines # remove indexes lines = [lines[i] for i in range(len(lines)) if not lines[i].isdigit()] # remove tcode #pattern = re.compile(r'^[0-9:.]{12} --> [0-9:.]{12}') pattern = r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d' lines = [lines[i] for i in range(len(lines)) if not re.match(pattern, lines[i])] # remove timestamps pattern = r"^\d{2}:\d{2}:\d{2}.\d{3}.*\d{2}:\d{2}:\d{2}.\d{3}$" lines = [lines[i] for i in range(len(lines)) if not re.match(pattern, lines[i])] content = " ".join(lines) # remove duplicate spaces pattern = r"\s+" content = re.sub(pattern, r" ", content) # add space after punctuation marks if it doesn't exist pattern = r"([\.!?])(\w)" content = re.sub(pattern, r"\1 \2", content) return content def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str: """Save clean content of a subtitle file to text file Args: file_in (str): path to vtt file file_out (None, optional): path to text file **kwargs (optional): arguments for other parameters - no_message (bool): do not show message of result. Default is False Returns: str: path to text file """ # set default values no_message = kwargs.get("no_message", False) if not file_out: filename = splitext(file_in)[0] file_out = "%s.txt" % filename i = 0 while exists(file_out): i += 1 file_out = "%s_%s.txt" % (filename, i) content = clean_webvtt(file_in) with open(file_out, "w+", encoding="utf-8") as fp: fp.write(content) if not no_message: print("clean content is written to file: %s" % file_out) return file_out def get_summary(filepath): filepath = filepath vtt_to_clean_file(filepath) def count_tokens(filename): with open(filename, 'r') as f: text = f.read() tokens = word_tokenize(text) return len(tokens) def break_up_file(tokens, chunk_size, overlap_size): if len(tokens) <= chunk_size: yield tokens else: chunk = tokens[:chunk_size] yield chunk yield from break_up_file(tokens[chunk_size-overlap_size:], chunk_size, overlap_size) def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100): with open(filename, 'r') as f: text = f.read() tokens = word_tokenize(text) return list(break_up_file(tokens, chunk_size, overlap_size)) def convert_to_prompt_text(tokenized_text): prompt_text = " ".join(tokenized_text) prompt_text = prompt_text.replace(" 's", "'s") return prompt_text def markdown_to_docx(md_text, output_file): # Convert the Markdown text to HTML html_text = markdown.markdown(md_text) # Create a new Document object doc = Document() # Parse the HTML and add its content to the .docx document for p in html_text.split('

'): if '

' in p: clean_p = p.replace('

', '').strip() if clean_p: doc.add_paragraph(clean_p) # Save the document to the specified file doc.save(output_file) @backoff.on_exception(backoff.expo, openai.error.RateLimitError) @backoff.on_exception(backoff.expo, openai.error.APIConnectionError) def summarize_meeting(filepath): filename = filepath token_count = count_tokens(filename) prompt_response = [] # Break the text of the meeting transcripts into chunks of 4000 tokens. chunks = break_up_file_to_chunks(filename) # Summarize each chunk. for i, chunk in enumerate(chunks): prompt_request = convert_to_prompt_text(chunks[i]) messages = [ {"role": "system", "content": "Summarize this meeting transcript in the same language as the user's input."}] messages.append({"role": "user", "content": prompt_request}) response = openai.ChatCompletion.create( model="gpt-4", messages=messages, temperature=.4, top_p=1, frequency_penalty=0, presence_penalty=0 ) prompt_response.append( response["choices"][0]["message"]['content'].strip()) # Consolidate these meeting summaries. prompt_request = "Consolidate these meeting summaries: " + \ str(prompt_response) # Summarize the text of the meeting transcripts. messages = [{"role": "system", "content": "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text."}] messages.append({"role": "user", "content": prompt_request}) response = openai.ChatCompletion.create( model="gpt-4", messages=messages, temperature=.4, top_p=1, frequency_penalty=0, presence_penalty=0 ) summary_text = response["choices"][0]["message"]['content'].strip() #outfilepath = "Resumen-Minuta-" + datetime.now().strftime("%d-%m-%Y-%H-%M") + ".docx" # Convert the summary to a .docx file with the name "Resumen-Minuta-.docx" #markdown_to_docx( # summary_text, outfilepath) return summary_text def summarize_meeting_vtt(file): temp_file_path = file.name summary_text = summarize_meeting(temp_file_path) return summary_text demo = gr.Interface( fn=summarize_meeting_vtt, # input inputs=gr.File(label="Archivo .vtt"), # output outputs=[ gr.Markdown(label="Resumen de la reunión") ], title="Diminuteevo - Ayudante para Minutas", description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión para que puedas crear tu minuta.") if __name__ == "__main__": demo.launch()