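"""Generate presentation slides from a parsed research-paper JSON file.

Pipeline: split each paper section into token-limited chunks, draft slides per
chunk with the OpenAI chat API (one prompt template per stage), merge and clean
the drafts, then run a final refinement pass (GPT-4 when available). Written
against the pre-1.0 openai-python client (openai.ChatCompletion).
"""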
import json
import time

import openai
import spacy
from tqdm import tqdm

from .utils import (
    get_num_tokens,
    parse_prompt,
    num_tokens_from_messages,
    clean_slides,
    slide_generation_ver2,
    generate_latex_slide,
)

nlp = spacy.load('en_core_web_sm')
def set_openai_api_key(key: str):
    """Set the OpenAI API key used by every call in this module."""
    openai.api_key = key
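# A minimal retry sketch, not used by generate_slide() below, which instead
# relies on fixed time.sleep() pauses. The helper name, retry count, and base
# delay are illustrative choices, not part of the original module.
def chat_completion_with_backoff(max_retries=5, **kwargs):
    """Call openai.ChatCompletion.create, backing off exponentially on rate limits."""
    for attempt in range(max_retries):
        try:
            return openai.ChatCompletion.create(**kwargs)
        except openai.error.RateLimitError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # wait 1s, 2s, 4s, ... before retrying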
def generate_slide(json_pth: str):
    model_list = [model['id'] for model in openai.Model.list()['data']]
    gpt4_id = "gpt-4-0314"
    gpt3_id = 'gpt-3.5-turbo-0301'
    with open(json_pth) as f:
        data = json.load(f)
    title = data['title']
    abstract = data['abstract']
    paper_length = len(data['text'])
    sections = [
        [head['section'],
         ' '.join(data['text'][idx]['string']
                  for idx in range(head['start'], min(head['end'] + 1, paper_length)))]
        for head in data['headers']
    ]
    figures = [fig['caption'] for fig in data['figures']]
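    # Expected layout of the input JSON, inferred from the accesses above (not a
    # formal schema): 'title' and 'abstract' are strings, 'text' is a list of
    # {'string': ...} blocks, 'headers' is a list of {'section', 'start', 'end'}
    # entries indexing into 'text', and 'figures' is a list of {'caption': ...}.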
    ### ! Split the sections by chunks with token_limit
    new_sections = []
    toc = ""
    token_limit = 1400
    for section in sections:
        section_title = section[0]
        curr_count = get_num_tokens(section[1])
        toc += section_title + "; "
        if curr_count > token_limit:
            # split the section into sentences
            sents = nlp(section[1]).sents
            temp_list = []
            for sent in sents:
                if not temp_list:
                    temp_list.append(sent.text)
                    continue
                curr_count = get_num_tokens(temp_list[-1])
                if curr_count + get_num_tokens(sent.text) < token_limit:
                    temp_list[-1] += " " + sent.text  # keep a space between joined sentences
                else:
                    temp_list.append(sent.text)
            for i in range(len(temp_list)):
                if i == 0:
                    new_sections.append([section_title, temp_list[i]])
                else:
                    new_sections.append([section_title + " (cont.)", temp_list[i]])
        else:
            new_sections.append(section)
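    # Each new_sections entry now stays within token_limit (unless a single
    # sentence alone exceeds it); overflow chunks keep the section title with a
    # "(cont.)" suffix.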
print(f"Total number of sections: {len(new_sections)}") | |
# ! get the initial message | |
initial_user_message = "Title: " + title + "\nTable of Contents: " + toc + "\nAbstract: " + abstract | |
initial_section_title = new_sections[0][0] | |
initial_section_content = new_sections[0][1] | |
# ! initial dialogue, Generates slides for the first section of the research paper. | |
res = [] | |
data = [initial_user_message, initial_section_title, initial_section_content] | |
messages = parse_prompt("./dialogue_1.txt", data) | |
token_length = num_tokens_from_messages(messages) | |
assert token_length < 2400, f"Message is too long: {token_length}" | |
response = openai.ChatCompletion.create( | |
model=gpt3_id, | |
messages=messages, | |
temperature=0.5, | |
) | |
answer = response["choices"][0]["message"]["content"] | |
res.append(answer) | |
time.sleep(10) | |
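    # Each remaining chunk is drafted independently from the dialogue_2.txt
    # template, at a higher temperature (0.9) than the opening prompt above.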
    ### ! Following dialogue. Generates slides for the following sections of the research paper.
    for i, (section_title, section_content) in enumerate(new_sections[1:]):
        print(f"Section {i+1}: {section_title} is being processed...")
        data = [section_content]
        messages = parse_prompt("./dialogue_2.txt", data)
        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"
        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        answer = response["choices"][0]["message"]["content"]
        res.append(answer)
        del messages, token_length, response, answer
        time.sleep(10)  # sleep for 10 seconds to avoid API limit
    ### ! Clean slides from comments, empty lines, and other garbage
    for i in range(len(res)):
        res[i] = clean_slides(res[i])
    temp_res = res
    prev_cnt = len(temp_res)
    while len(temp_res) > 1:
        temp_num_tokens = get_num_tokens("\n".join(temp_res))
        temp_res = slide_generation_ver2(temp_res, 1800)
        print(f"The length of res is {len(temp_res)}, and the number of tokens is {temp_num_tokens}")
        # if the number of slides has not changed, then break
        if len(temp_res) == prev_cnt:
            break
        else:
            prev_cnt = len(temp_res)
        # if the number of tokens is at most 4000, then break
        if temp_num_tokens <= 4000:
            break
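    # The loop above shrinks the slide-draft list with slide_generation_ver2
    # (called with an 1800-token budget), stopping once the count no longer drops
    # or the combined draft is at most 4000 tokens.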
    new_res = []
    for i in tqdm(range(len(temp_res))):
        data = [temp_res[i]]
        messages = parse_prompt("./dialogue_3.txt", data)
        token_length = num_tokens_from_messages(messages)
        assert token_length < 2400, f"Message is too long: {token_length}"
        response = openai.ChatCompletion.create(
            model=gpt3_id,
            messages=messages,
            temperature=0.9,
        )
        temp = response["choices"][0]["message"]["content"]
        temp = clean_slides(temp)
        new_res.append(temp)
        time.sleep(5)  # needed to avoid API limit
    temp_res = new_res
    time.sleep(10)  # needed to avoid API limit
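    # Each merged chunk got one more pass through dialogue_3.txt above; the
    # combined draft is now refined in a single dialogue_4.txt call below.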
    # ! final refinement
    final_draft = "\n".join(temp_res)
    data = [final_draft]
    messages = parse_prompt("./dialogue_4.txt", data)
    print(num_tokens_from_messages(messages))
    response = openai.ChatCompletion.create(
        model=gpt4_id if gpt4_id in model_list else gpt3_id,
        messages=messages,
        temperature=0.5,
    )
    temp = response["choices"][0]["message"]["content"]
    # generate_latex_slide(temp, "test.tex")
    return temp
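# Minimal usage sketch. "paper.json" and the OPENAI_API_KEY variable name are
# illustrative; the relative import above means this module must run as part of
# its package (e.g. via `python -m`), not as a standalone script.
if __name__ == "__main__":
    import os

    set_openai_api_key(os.environ["OPENAI_API_KEY"])
    print(generate_slide("paper.json"))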