import tiktoken
from typing import List
def slide_generation(res, num_tokens_limit=1800):
    # Greedily merge consecutive non-empty text pieces while the merged
    # piece stays under the token limit.
    new_res = [res[0]]
    for i in range(1, len(res)):
        if not res[i]:
            continue
        prev_cnt = get_num_tokens(new_res[-1])
        curr_cnt = get_num_tokens(res[i])
        if prev_cnt + curr_cnt < num_tokens_limit:
            new_res[-1] += res[i]
        else:
            new_res.append(res[i])
    return new_res
def slide_generation_ver2(res, num_tokens_limit=1800):
    # Re-split the text on [PE] (page-end) markers so merging happens at
    # page boundaries rather than at the original piece boundaries.
    text = "\n".join(res).split("[PE]")
    text = [(t.strip() + "\n[PE]\n") if t else "" for t in text]
    return slide_generation(text, num_tokens_limit=num_tokens_limit)
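# Usage sketch for the two merging helpers above (hypothetical inputs; the
# [PB]/[PE] markers are the page-begin/page-end tags used throughout this file):
#
#   pieces = ["[PB]\nIntro\n[T] First point\n[PE]\n",
#             "",
#             "[PB]\nMethod\n[T] Second point\n[PE]\n"]
#   merged = slide_generation(pieces, num_tokens_limit=1800)
#   # Consecutive pieces are concatenated while the combined token count
#   # stays under num_tokens_limit; empty pieces are dropped.
#   merged_v2 = slide_generation_ver2(pieces, num_tokens_limit=1800)
#   # ver2 first re-splits everything on [PE], so merging happens at page
#   # boundaries rather than at the original list boundaries.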
def parse_prompt(file: str, data: List[str] = None):
    roles = []
    contents = []
    if data is None:
        data = []
    with open(file, "r") as f:
        for line in f.readlines():
            # Skip blank lines and comment lines.
            if line.strip().startswith("#") or not line.strip():
                continue
            if "[user]" in line:
                roles.append("user")
                contents.append([])
                continue
            elif "[assistant]" in line:
                roles.append("assistant")
                contents.append([])
                continue
            elif "[system]" in line:
                roles.append("system")
                contents.append([])
                continue
            assert roles, "No role specified"
            contents[-1].append(line.strip())
    # Validate the role sequence.
    assert roles[0] in ["user", "system"], "First role must be user or system"
    for i in range(1, len(roles)):
        assert roles[i] in ["user", "assistant"], "Roles must be user or assistant"
        assert roles[i] != roles[i - 1], "Roles must alternate between user and assistant"
    contents_str = [" ".join(content) for content in contents]
    # Substitute the numbered [data_tag_i] placeholders with the given data, in order.
    curr_idx = 0
    for i in range(len(contents_str)):
        tag = f"[data_tag_{curr_idx}]"
        # Turn literal "\n" sequences into real newlines.
        contents_str[i] = contents_str[i].replace("\\n", "\n")
        if tag in contents_str[i]:
            contents_str[i] = contents_str[i].replace(tag, data[curr_idx])
            curr_idx += 1
    assert curr_idx == len(data), "Not all data tags were replaced"
    return [{"role": roles[i], "content": contents_str[i]} for i in range(len(roles))]
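# Usage sketch for parse_prompt (the file contents below are a hypothetical
# example of the format the parser expects: [system]/[user]/[assistant]
# headers, '#' comment lines, and numbered [data_tag_i] placeholders):
#
#   # prompt.txt
#   #   [system]
#   #   You are a helpful assistant.
#   #   [user]
#   #   Summarize this paper: [data_tag_0]
#
#   messages = parse_prompt("prompt.txt", data=[paper_text])
#   # -> [{"role": "system", "content": "You are a helpful assistant."},
#   #     {"role": "user", "content": "Summarize this paper: ..."}]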
def clean_slides(slide):
    # Keep only lines that start with a recognized markup tag; the [T]
    # prefix also covers [T][T] subpoint lines.
    clean_slide_list = []
    for line in slide.split('\n'):
        if line.startswith(('[F]', '[T]', '[PB]', '[PE]')):
            clean_slide_list.append(line)
    return '\n'.join(clean_slide_list)
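# Example: lines that do not start with a recognized tag (e.g. model chatter
# around the markup) are dropped:
#
#   clean_slides("Sure! Here are the slides:\n[PB]\n[T] A point\n[PE]")
#   # -> "[PB]\n[T] A point\n[PE]"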
def generate_latex_slide(slide, output_path=None):
    # Initialize the Beamer document.
    latex_code = "\\documentclass{beamer}\n\\begin{document}"
    # Split the slide string into pages at each [PB] (page-begin) marker.
    pages = slide.split('[PB]')[1:]
    for i, page in enumerate(pages):
        frame = ["", ""]  # [title, content]
        page = page.strip()
        # Extract the page title (first line) and the content up to [PE].
        title_end_index = page.index("\n") + 1 if "\n" in page else 0
        title = page[:title_end_index].strip()
        content_end_index = page.index("[PE]") if "[PE]" in page else len(page)
        content = page[title_end_index:content_end_index].strip()
        # Start a new frame with the page title.
        if title:
            frame[0] = f"\n\\begin{{frame}}{{{title}}}\n\n"
        # Group the content lines into top-level points and nested subpoints.
        points = []
        for item in content.split('\n'):
            if not item:
                break
            if '[T][T]' in item:
                # A subpoint attaches to the most recent top-level point.
                assert len(points) > 0, "Subpoint cannot be the first item in a page"
                points[-1].append(item.split('[T][T]')[1])
            else:
                point = item.split('[T]')[1] if '[T]' in item else item
                points.append([point])
        if points:
            # Render each point (and its subpoints) as Beamer itemize elements.
            frame[1] = "\\begin{itemize}\n"
            for point in points:
                frame[1] += f"\\item {point[0]}\n"
                if len(point) > 1:
                    frame[1] += "\\begin{itemize}\n"
                    for subpoint in point[1:]:
                        frame[1] += f"\\item {subpoint}\n"
                    frame[1] += "\\end{itemize}\n"
            frame[1] += "\\end{itemize}\n"
        if not frame[0] and not frame[1]:
            # The page is empty; skip it, closing the document if it is the last page.
            if i == len(pages) - 1:
                latex_code += "\n\\end{document}"
            continue
        # End the frame, and the document on the last page.
        frame[1] += "\n\\end{frame}\n"
        if i == len(pages) - 1:
            frame[1] += "\n\\end{document}"
        latex_code += "".join(frame)
    # Escape LaTeX special characters that may appear in the generated text.
    latex_code = latex_code.replace('_', '\\_').replace('&', '\\&').replace('^', '\\^{}').replace('$', '\\$')
    if output_path:
        with open(output_path, 'w') as f:
            f.write(latex_code)
    return latex_code
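# Usage sketch for generate_latex_slide (hypothetical input in the markup this
# file uses: [PB] page begin, first line = title, [T] point, [T][T] subpoint,
# [PE] page end; the output path is illustrative):
#
#   slide = ("[PB] Results\n"
#            "[T] Main findings\n"
#            "[T][T] Ablation details\n"
#            "[PE]")
#   latex = generate_latex_slide(slide, output_path="slides.tex")
#   # Produces one \begin{frame}{Results} ... \end{frame} with a nested
#   # itemize for the subpoint, wrapped in a Beamer document.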
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """
    Returns the number of tokens required to encode the given messages.
    Source: https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/chatgpt?pivots=programming-language-chat-completions#managing-conversations
    """
    encoding = tiktoken.encoding_for_model(model)
    num_tokens = 0
    for message in messages:
        num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":  # if there's a name, the role is omitted
                num_tokens += -1  # role is always required and always 1 token
    num_tokens += 2  # every reply is primed with <im_start>assistant
    return num_tokens
def get_num_tokens(message, model="gpt-3.5-turbo-0301"):
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(message))
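# Example: estimate the prompt budget before calling the chat API (the message
# list shape matches what parse_prompt returns):
#
#   messages = [{"role": "user", "content": "Hello"}]
#   budget = num_tokens_from_messages(messages)  # includes per-message overhead
#   raw = get_num_tokens("Hello")                # tokens in the string alone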
def get_paper_text_in_chunks(example, chunk_size=4000):
    paper_length = len(example['paper']['text'])
    # Wrap the title, abstract, sections, and figure captions in tag markers.
    title = '[TB] ' + example['title'] + ' [TE] '
    abstract = '[AB] ' + example['paper']['abstract'] + ' [AE] '
    sections = [
        ' [SB] ' + head['n'] + ' ' + head['section'] + ' [SC] '
        + ' '.join(example['paper']['text'][idx]['string']
                   for idx in range(head['start'], min(head['end'] + 1, paper_length)))
        + ' [SE] '
        for head in example['paper']['headers']
    ]
    figures = [' [FB] ' + fig['caption'] + ' [FE] ' for fig in example['paper']['figures']]
    # Greedily pack sections and figures into chunks under the token limit.
    chunks = []
    temp_chunk = title + abstract
    temp_chunk_length = get_num_tokens(temp_chunk)
    for s in sections + figures:
        assert get_num_tokens(s) < chunk_size, "Section or figure is too long to fit in a chunk"
        if temp_chunk_length + get_num_tokens(s) > chunk_size:
            chunks.append(temp_chunk)
            temp_chunk = s
            temp_chunk_length = get_num_tokens(s)
        else:
            temp_chunk += s
            temp_chunk_length += get_num_tokens(s)
    if temp_chunk_length > 0:
        chunks.append(temp_chunk)
    return chunks
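# Sketch of the paper structure this function assumes (field names inferred
# from the accesses above; the values here are hypothetical):
#
#   example = {
#       "title": "A Paper",
#       "paper": {
#           "abstract": "We study ...",
#           "text": [{"string": "First sentence."}, {"string": "Second."}],
#           "headers": [{"n": "1", "section": "Introduction", "start": 0, "end": 1}],
#           "figures": [{"caption": "Figure 1: overview."}],
#       },
#   }
#   chunks = get_paper_text_in_chunks(example, chunk_size=4000)
#   # Each chunk stays under chunk_size tokens; the first begins with the
#   # [TB] title and [AB] abstract markers.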