import json
import os
import time

import numpy as np
import openai
import pandas as pd
import spacy
from tqdm import tqdm

from .utils import (get_num_tokens, parse_prompt, num_tokens_from_messages,
                    clean_slides, slide_generation_ver2, generate_latex_slide)

nlp = spacy.load('en_core_web_sm')


def set_openai_api_key(key: str):
    openai.api_key = key  # use the argument, not the literal string 'key'


def generate_slide(json_pth: str):
    model_list = [model['id'] for model in openai.Model.list()['data']]
    gpt4_id = "gpt-4-0314"
    gpt3_id = "gpt-3.5-turbo-0301"

    with open(json_pth) as f:
        data = json.load(f)

    title = data['title']
    abstract = data['abstract']
    paper_length = len(data['text'])
    sections = [
        [head['section'],
         ' '.join([data['text'][idx]['string']
                   for idx in range(head['start'], min(head['end'] + 1, paper_length))])]
        for head in data['headers']
    ]
    figures = [fig['caption'] for fig in data['figures']]  # captions collected but not used below

    ### ! Split the sections into chunks of at most `token_limit` tokens.
    new_sections = []
    toc = ""
    token_limit = 1400
    for section in sections:
        section_title = section[0]
        curr_count = get_num_tokens(section[1])
        toc += section_title + "; "
        if curr_count > token_limit:
            # Split the section into sentences and greedily pack them into chunks.
            sents = nlp(section[1]).sents
            temp_list = []
            for sent in sents:
                if not temp_list:
                    temp_list.append(sent.text)
                    continue
                curr_count = get_num_tokens(temp_list[-1])
                if curr_count + get_num_tokens(sent.text) < token_limit:
                    temp_list[-1] += " " + sent.text  # join with a space so sentences don't run together
                else:
                    temp_list.append(sent.text)
            for i in range(len(temp_list)):
                if i == 0:
                    new_sections.append([section_title, temp_list[i]])
                else:
                    new_sections.append([section_title + " (cont.)", temp_list[i]])
        else:
            new_sections.append(section)
    print(f"Total number of sections: {len(new_sections)}")

    # ! Build the initial message.
    initial_user_message = ("Title: " + title + "\nTable of Contents: " + toc
                            + "\nAbstract: " + abstract)
    initial_section_title = new_sections[0][0]
    initial_section_content = new_sections[0][1]

    # ! Initial dialogue: generates slides for the first section of the paper.
    res = []
    data = [initial_user_message, initial_section_title, initial_section_content]
    messages = parse_prompt("./dialogue_1.txt", data)
    token_length = num_tokens_from_messages(messages)
    assert token_length < 2400, f"Message is too long: {token_length}"
    response = openai.ChatCompletion.create(
        model=gpt3_id,
        messages=messages,
        temperature=0.5,
    )
    answer = response["choices"][0]["message"]["content"]
    res.append(answer)
    time.sleep(10)  # sleep to avoid the API rate limit

    ### ! Following dialogues: generate slides for the remaining sections.
    for i, (section_title, section_content) in enumerate(new_sections[1:]):
        print(f"Section {i + 1}: {section_title} is being processed...")
        data = [section_content]
        messages = parse_prompt("./dialogue_2.txt", data)
        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"
        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        answer = response["choices"][0]["message"]["content"]
        res.append(answer)
        del messages, token_length, response, answer
        time.sleep(10)  # sleep for 10 seconds to avoid the API rate limit
    ### ! Clean slides of comments, empty lines, and other garbage.
    for i in range(len(res)):
        res[i] = clean_slides(res[i])

    # ! Iteratively merge slide chunks until the draft is short enough.
    temp_res = res
    prev_cnt = len(temp_res)
    while len(temp_res) > 1:
        temp_num_tokens = get_num_tokens("\n".join(temp_res))
        temp_res = slide_generation_ver2(temp_res, 1800)
        print(f"The length of res is {len(temp_res)}, and the number of tokens is {temp_num_tokens}")
        # If the number of slides has not changed, stop merging.
        if len(temp_res) == prev_cnt:
            break
        else:
            prev_cnt = len(temp_res)
        # If the number of tokens is at most 4000, stop merging.
        if temp_num_tokens <= 4000:
            break

    # ! Refine each merged chunk.
    new_res = []
    for i in tqdm(range(len(temp_res))):
        data = [temp_res[i]]
        messages = parse_prompt("./dialogue_3.txt", data)
        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"
        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        temp = response["choices"][0]["message"]["content"]
        temp = clean_slides(temp)
        new_res.append(temp)
        time.sleep(5)  # needed to avoid the API rate limit
    temp_res = new_res
    time.sleep(10)  # needed to avoid the API rate limit

    # ! Final refinement over the whole draft, with GPT-4 if it is available.
    final_draft = "\n".join(temp_res)
    data = [final_draft]
    messages = parse_prompt("./dialogue_4.txt", data)
    print(num_tokens_from_messages(messages))
    response = openai.ChatCompletion.create(
        model=gpt4_id if gpt4_id in model_list else gpt3_id,
        messages=messages,
        temperature=0.5,
    )
    temp = response["choices"][0]["message"]["content"]
    # generate_latex_slide(temp, "test.tex")
    return temp
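

# Example usage (a minimal sketch, not part of the original module): the
# environment-variable name and the input path below are assumptions made for
# illustration only. Because of the relative import at the top of the file,
# this must be run as a module, e.g. `python -m <package>.<this_module>`
# (package name hypothetical), rather than as a standalone script.
if __name__ == "__main__":
    set_openai_api_key(os.environ["OPENAI_API_KEY"])  # assumes the key is exported in the environment
    slides = generate_slide("paper.json")  # hypothetical path to a parsed-paper JSON file
    print(slides)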