zolicsaki committed on
Commit
ddffe9f
·
verified ·
1 Parent(s): cf18115

Delete paper2slides.py

Browse files
Files changed (1) hide show
  1. paper2slides.py +0 -703
paper2slides.py DELETED
@@ -1,703 +0,0 @@
1
# Per-topic lists of paper parts ('abstract' plus section category names).
# Presumably the source material for each slide topic — nothing in this file
# reads the mapping, so confirm against the callers.
# NOTE(review): the key 'methodoloy' is misspelled; it is left untouched here
# because external readers of this dict may depend on the exact spelling.
slide_datasource = {
    'introduction': ['abstract', 'Introduction'],
    'objective': ['abstract', 'Introduction'],
    'methodoloy': ['abstract', 'Introduction', 'Conclusion', 'Methods'],
    'results': ['abstract', 'Experiments', 'Conclusion'],
    'conclusion': ['abstract', 'Introduction', 'Conclusion'],
}
8
-
9
- from pdf_helper import PDFPaper4LLMParser, dict_to_markdown_list
10
- from sambaAPI import call_llama_chat, MODEL_ALIAS
11
- from pdf_helper import markdown_to_slide_dicts
12
- from pptx_utils import Dict2PPT, os
13
- import json
14
- import time
15
- import string
16
-
17
# Marker written between slides in the markdown rendering; it is also
# substituted into the slide-revision prompt so the model knows the separator.
SLIDE_SEP = '<slide_sep>'
18
-
19
def trim_string(s):
    """Return ``s`` without leading/trailing whitespace or ASCII punctuation."""
    strippable = string.whitespace + string.punctuation
    return s.strip(strippable)
21
-
22
# Section category -> lowercase phrases that identify it. get_section_category
# tests each phrase as a substring of the lowercased section title and returns
# the first category (in this dict's order) that has a match.
section_title_key_phrases = {
    'Introduction': ['introduction'],
    'Related Works': ['related work'],
    'Methods': ['method', 'approach'],
    'Experiments': ['experiment'],
    'Conclusion': ['conclusion'],
    'Acknowledgements': ['acknowledgement'],
    'References': ['references', ' references'],  # second phrase is subsumed by the first under substring matching
}
31
-
32
def find_string_index(string_list, target: str):
    """
    Locate the first occurrence of ``target`` in ``string_list``.

    Parameters:
    string_list (list): A list of strings
    target (str): The string to look for

    Returns:
    int: Position of the first match, or -1 when ``target`` is absent
    """
    if target in string_list:
        return string_list.index(target)
    return -1
48
-
49
-
50
def get_section_category(section_name: str):
    """
    Map a paper section title to a coarse category.

    The title is lowercased and checked against the phrases of each category
    in ``section_title_key_phrases`` (in dict order); the first category with
    a phrase contained in the title wins. Returns 'Other' when nothing matches.
    """
    lowered = section_name.lower()
    return next(
        (
            category
            for category, phrases in section_title_key_phrases.items()
            if any(phrase in lowered for phrase in phrases)
        ),
        'Other',
    )
59
-
60
-
61
class PaperReader(object):
    """Parse a paper PDF and label each of its sections with a coarse category."""

    def __init__(self, page_chunks=False):
        self.paper_reader = PDFPaper4LLMParser(page_chunks=page_chunks)

    def pdf2text(self, paper_pdf_path: str):
        """Run the PDF parser on ``paper_pdf_path`` and return its result unchanged."""
        return self.paper_reader.run(pdf_path=paper_pdf_path, verbose=False)

    @staticmethod
    def _leading_run(topics: list, start: int, label: str):
        """Return the consecutive indices, beginning at ``start``, labelled ``label``.

        ``start`` < 0 (label not present) yields an empty list.
        """
        run = []
        if start < 0:
            return run
        for idx in range(start, len(topics)):
            if topics[idx] != label:
                break
            run.append(idx)
        return run

    def structurize(self, main_text_array: list):
        """
        Derive a category per section and the index layout of the paper.

        Parameters:
        main_text_array (list): section dicts; only the 'title' key is read here

        Returns:
        tuple: (section_name_topics, paper_structure_dict) where
            section_name_topics is the per-section category list and
            paper_structure_dict maps 'Introduction', 'Related Works',
            'References', 'Conclusion' and 'Acknowledgements' to a one-element
            list holding the first matching index (-1 if absent), and
            'Experiments' / 'Methods' to the list of consecutive indices that
            start at the first section carrying that label (empty if absent).
        """
        section_names = [section['title'] for section in main_text_array]
        section_name_topics = [get_section_category(name) for name in section_names]
        introduction_idx = find_string_index(section_name_topics, target='Introduction')
        reference_idx = find_string_index(section_name_topics, target='References')
        experiment_idx = find_string_index(section_name_topics, target='Experiments')
        conclusion_idx = find_string_index(section_name_topics, target='Conclusion')

        if reference_idx > 0:
            for idx, topic in enumerate(section_name_topics):
                if idx < reference_idx:
                    # Unrecognised sections of the main body are treated as method sections.
                    if topic == 'Other':
                        section_name_topics[idx] = 'Methods'
                elif idx > reference_idx:
                    # Everything after the references is tagged as appendix material.
                    if 'appendix' not in topic.lower():
                        section_name_topics[idx] = 'Appendix: ' + topic

        if experiment_idx > 0:
            # Sections following the first experiment section (up to the references)
            # are regarded as part of the experiments. Without a references section
            # the range is empty and nothing is relabelled.
            for idx in range(experiment_idx + 1, reference_idx):
                if section_name_topics[idx] == 'Methods':
                    section_name_topics[idx] = 'Experiments'

        experiment_idx = find_string_index(section_name_topics, target='Experiments')
        method_idx = find_string_index(section_name_topics, target='Methods')
        relatedwork_idx = find_string_index(section_name_topics, target='Related Works')
        ack_idx = find_string_index(section_name_topics, target='Acknowledgements')

        paper_structure_dict = {
            'Introduction': [introduction_idx],
            'Related Works': [relatedwork_idx],
            'References': [reference_idx],
            'Conclusion': [conclusion_idx],
            'Acknowledgements': [ack_idx]
        }

        # The former fallback loops were only entered when no section carried the
        # label and then searched for that very label, so they could never add an
        # index. An empty list is therefore the result whenever the label is absent.
        paper_structure_dict['Experiments'] = self._leading_run(section_name_topics, experiment_idx, 'Experiments')
        paper_structure_dict['Methods'] = self._leading_run(section_name_topics, method_idx, 'Methods')
        return section_name_topics, paper_structure_dict

    def run(self, paper_file_name: str):
        """
        Parse ``paper_file_name`` and attach the derived structure.

        Returns the parser output with two keys added: 'structure' (the index
        layout) and 'section_topic' (the per-section categories). The elapsed
        time is printed.
        """
        start_time = time.time()
        paper_content = self.pdf2text(paper_pdf_path=paper_file_name)
        section_name_topics, paper_structure_dict = self.structurize(main_text_array=paper_content['main_text'])
        paper_content['structure'] = paper_structure_dict
        paper_content['section_topic'] = section_name_topics
        print('Runtime for pdf2text = {:.4f} seconds.'.format(time.time() - start_time))
        return paper_content
149
-
150
### 1. General System Prompt
# Sent as the first system message of every slide-generation step in Paper2Slides.

SCHOLAR_PROMPT = """
You are an assistant being skilled at critically reading and analyzing academic papers to extract key insights, trends, and findings.
"""
155
-
156
### 2. Paper Outline Generation from Abstract
# System prompt for the outline step. The JSON keys requested in the reminder
# are the ones Paper2Slides.generate_slides later looks up in the outline dict.

ABSTRACT_SUMMARY_PROMPT = """
You are given the **title** and **abstract** of an academic paper. Please first identify the research topic, and then extract the following aspects in a minimal title draft (max 15 words) for PowerPoint presentation:

1. **Background**: Introduces the research context and importance.
2. **Research Problem**: Identifies the specific problem or knowledge gap.
3. **Objectives**: States the research goals or hypotheses.
4. **Methodology**: Summarizes the research design and key methods.
5. **Results**: Highlights the most significant findings.
6. **Conclusions**: Provides the main takeaways and their relation to the research question.

Reminder: Strictly output in JSON format **only**, using the keys: "Research topic", "Background", "Research problem", "Objectives", "Methodology", "Results" and "Conclusions".
"""
170
-
171
### 3. Evidence extraction from main paper text for "Background"
# NOTE: the constant name keeps its historical spelling because other code
# refers to it; only the text shown to the model is corrected.
BACKGROUD_EVIDENCE_PROMPT = """
You are given the **title**, brief description of **problem background** and **introduction** of a research paper. From the introduction, extract an itemized list of **1 to 3 pieces of evidence** that support the problem background, each evidence should be described in a **minimal draft (min 10 words and max 25 words)** for PowerPoint presentation.

Each piece of evidence must:
1. Be directly relevant to the problem background.
2. Be clear and concise.
3. Be unique, not repeating other evidence.

**Important**: Strictly output the itemized evidences ONLY.
"""
182
-
183
-
184
### 4. Evidence extraction from main paper text for "Research Problem"
# System prompt asking for scope, challenges, assumptions and relevance of the
# research problem, drawn only from the introduction.
RESEARCH_PROBLEM_PROMPT = """
You are given the **title**, brief description of **research problem** and **introduction** of a research paper. Solely from the given introduction, extract the definition of the research problem for PowerPoint presentation, focusing on:

1. **Scope**: Define the problem’s boundaries as individual items;
2. **Challenges**: Identify key gaps or obstacles the research addresses as individual items;
3. **Assumptions**: State any assumptions guiding the research as individual items;
4. **Relevance**: Specify who benefits from solving the problem as individual items.

**Note**: Each item must be in one concise sentence. **Only** output "Scope", "Challenges", "Assumptions" and "Relevance".
"""
195
-
196
-
197
### 5. Evidence extraction from main paper text for "Objectives"
# System prompt used by Paper2Slides.support_objectives; the user message that
# follows it carries the title, the objectives and the introduction text.

OBJECTIVE_PROMPT = """
You are given the **title**, **objectives** and **introduction** of a research paper. Solely from the given introduction, extract a list of **2 to 5 pieces of evidence** to support these objectives, each evidence should be described in a **minimal draft (min 10 words and max 20 words)** for PowerPoint presentation.

Each piece of evidence must:
1. Be directly relevant to the objectives.
2. Be clear and concise.
3. Be unique, not repeating other evidence.

**Note**: Strictly output the itemized evidences ONLY.
"""
209
-
210
### 6. Evidence extraction from main paper text for "Conclusion"
# NOTE: the constant name keeps its historical spelling because other code
# refers to it. The "**brief conclusion**" label now matches the label used in
# the user message built by Paper2Slides.support_conclusion.

CONCLUSION_PROMT = """
You are given the **title**, **brief conclusion**, and **full text conclusion** and **introduction** of a research paper. From the given conclusion and introduction, extract the **conclusion** for PowerPoint presentation, ensuring it includes:

1. **Summary of key results**: Highlight the main results.
2. **Implications**: Explain the significance or impact of these findings.
3. **Future directions**: Mention any suggestions for future research or applications.
4. **Final takeaway**: Provide the overall takeaway message of the study.

**Note**: Only output the conclusion. Limit each point in a minimal concise draft (at least 10 words).
"""
222
-
223
### 7. Evidence extraction from main paper text for "Experimental results" (iterative)
# Three prompts used by Paper2Slides.support_experiment_results:
#   system_instruction - what counts as experimental evidence,
#   iterative_prompt   - applied to each paragraph in turn,
#   final_prompt       - merges the per-paragraph summaries.

RESULT_PROMPT_DICT = {
    "system_instruction": """Given the title, the main results of an experimental study, and a paragraph from a research paper, your task is to extract and summarize evidence from the paragraph that supports the 'main results'.

Follow these steps for each paragraph:
1. **Detect Evidence**: Check if the paragraph contains:
    1) Any evidence supporting the main results, or
    2) Experimental study information, including:
        - **Dataset**: Details on datasets, preprocessing, or train/test splits.
        - **Model Description**: Information of baselines, hyperparameters, and training.
        - **Evaluation Metrics**: Relevant metrics like accuracy, F1 score, and their justification.
        - **Comparative Analysis**: Comparisons with baselines, ablation studies, statistical significance.
        - **Runtime & Scalability**: Computational complexity and scalability.
2. **Response**: Choose 'YES' or 'NO':
    - If 'YES', extract and summarize the evidence or experimental details in 200 words. Ensure the summary is:
        - Clear and concise
        - Well-formatted for easy reading
        - Focused on key points: dataset, model description, evaluation metrics, comparative analysis and runtime & scalability.
    - If 'NO', just respond with 'NO EVIDENCE'.
""",

    "iterative_prompt": """Summarize the experimental details or evidence supporting the 'main results' in 200 words from the following paragraph (with title and content) if experiment-related information is detected. Follow these instructions:

1. List 2 to 4 itemized points.
2. Each point must specify the type ('Evidence' or 'Experimental Setup') and provide a minimal draft sentence of content (max 15 words).

**Note**: Only provide the itemized summary.
""",

    "final_prompt": """Using the **title**, the **main results** of an experimental study, and a list of experiment summaries from the research paper, follow these steps to summarize the results:

1. **Evidence Summary**: provide a numbered, itemized summary of **2-3** key points. Keep each point brief and focused (only 1 sentence).

2. **Experimental Summary**: Based on all 'Experimental Setup' points, provide a concise summary covering the following aspects:
    1) **Datasets**: List only the names of all datasets or benchmarks used.
    2) **Baselines**: List only the names of all models/algorithms used.
    3) **Metrics**: List only the evaluation metrics used for model performance, such as accuracy, F1-score, recall, precision, AUC, etc.
    4) **Results**: Summarize key comparisons and ablation results, focusing on the most important details.

**Note**: Only output the “Evidence Summary” and “Experimental Summary”
"""
}
266
-
267
## Methodology extraction
# Three prompts used by Paper2Slides.support_methodology:
#   system_instruction - what counts as method information,
#   iterative_prompt   - applied to each paragraph in turn,
#   final_prompt       - merges the per-paragraph summaries.

METHOD_PROMPT_DICT = {
    "system_instruction": """Given the **title**, the **method overview**, and a paragraph of a research paper. Your task is to identify and extract text relevant to the 'method overview' from the given paragraph for PowerPoint presentation.

Follow these steps:
1. **Method Information Detection**: Check if the paragraph contains:
    1) Any mention of the **method overview** or
    2) Specific method details, such as:
        - **Problem Definition**: The task, input, and expected output.
        - **Model Architecture**: Structure, key components, and learning type.
        - **Algorithm**: Steps of the method.
        - **Training Process**: Training data, optimization method, and loss function.
2. **Response**: Choose 'YES' or 'NO':
    - If 'YES', summarize the method details in a minimal draft with max 20 words, ensuring it is:
        - Clear and concise
        - Well-formatted for readability
        - Focused on key points.
    - If 'NO', simply respond with 'NO Information'.
""",
    "iterative_prompt": """Summarize the method description in 200 words from the following paragraph (with title and content) if method-related information is found. Follow these steps:

1. List **2 to 4** method steps in numbered format.
2. Ensure each step is related to the **method overview**.
3. Keep each step clear and concise (only minimal draft with max 15 words).

**Note**: Only output the itemized method steps.
""",

    "final_prompt": """Using **title**, **method overview**, and a list of itemized method step summary from a research paper, follow these instructions to summarize the method description:

1. Provide a numbered list of **3-6 method steps** detailing the **method overview**.
2. Keep each step clear and concise (only 1 sentence).

**Note**: Only output the itemized method steps.
"""
}
304
-
305
# System prompt for the final revision pass. The single '{}' placeholder is
# filled with SLIDE_SEP at import time so the model is told which separator
# delimits the sections.
SLIDES_REVISION_PROMPT = """You are an expert research assistant. Revise the following research paper slides to enhance clarity and readability while preserving the original markdown structure. Keep all first-level markdown headers unchanged. Sections are separated by '{}'. Follow these guidelines:

1. Simplify language and make content more concise, especially in the outline.
2. Preserve the logical flow and overall structure.
3. Make key points and conclusions clear and easy to follow.
4. Use bullet points where appropriate for better clarity.
5. Minimize jargon to ensure accessibility for a broad academic audience.

""".format(SLIDE_SEP)
314
-
315
def make_api_call(model, messages, max_tokens, temperature):
    """
    Send ``messages`` to the chat model and return its response.

    Parameters:
    model: model identifier forwarded to ``call_llama_chat``
    messages (list): chat messages (dicts with 'role' and 'content')
    max_tokens (int): generation limit forwarded to the backend
    temperature (float): sampling temperature forwarded to the backend

    Returns:
    The backend response on success. On any error, a string starting with
    "Failed to generate final answer." that embeds the error text; no
    exception propagates.
    """
    try:
        return call_llama_chat(messages=messages, model=model, temperature=temperature, max_tokens=max_tokens)
    except Exception as e:
        # Deliberate catch-all at the service boundary. The failure used to be
        # returned as a (message, {}) tuple, which the callers' string checks
        # ('Failed' in result) could never detect; return the message itself.
        return f"Failed to generate final answer. Error: {str(e)}"
321
-
322
def convert_to_dict(input_string: str):
    """
    Parse newline-separated "key: value" text into a dict.

    Each line is split on its first colon; key and value are stripped of
    surrounding whitespace. Lines without a colon are skipped, and a repeated
    key keeps the value of its last occurrence.
    """
    pairs = {}
    for raw_line in input_string.strip().split('\n'):
        key, colon, value = raw_line.partition(':')
        if not colon:
            # No key/value delimiter on this line.
            continue
        pairs[key.strip()] = value.strip()
    return pairs
335
-
336
-
337
class Paper2Slides(object):
    """
    Generate slide content for a parsed paper through a sequence of LLM calls.

    ``paper_contents`` is read for the keys 'title', 'abstract', 'main_text'
    (a list of dicts with 'title' and 'content'), 'structure' (section index
    layout) and, optionally, 'author'.
    """

    def __init__(self, paper_contents: dict, model: str, max_tokens = 512, temprature=0.1):
        """
        Parameters:
        paper_contents (dict): parsed paper (see class docstring)
        model (str): key into MODEL_ALIAS selecting the generation model
        max_tokens (int): generation limit for every step
        temprature (float): sampling temperature (parameter name kept for callers)

        Raises:
        SystemExit: with code 1 when the paper lacks the required structure
        """
        self.paper_contents = paper_contents
        if not self.valid_paper_checking():
            print('Not a valid paper structure, cannot generate slides')
            # Same effect as the former exit(1) without relying on the
            # site-injected ``exit`` helper, which is not always available.
            raise SystemExit(1)
        self.model = MODEL_ALIAS[model]
        self.is_rate_limitation = ('405B' in self.model) or ('70B' in self.model)
        self.temprature = temprature
        self.max_failure_attempt_each_step = 3
        # Both former branches ('405B' or not) assigned the same pause.
        self.sleep_time = 0.25
        self.max_tokens = max_tokens
        print('{} model is used for slides generation!\nRate limitation = {}'.format(self.model, self.is_rate_limitation))
        self.revise_model = MODEL_ALIAS['llama3_70b']

    def valid_paper_checking(self):
        """
        Check that the paper has an abstract, a title, an introduction and a conclusion.

        Returns True when all are present; otherwise prints the reason and
        returns False. A missing or incomplete 'structure' entry counts as a
        missing introduction/conclusion instead of raising KeyError.
        """
        contents = self.paper_contents
        problem = None
        if 'abstract' not in contents:
            problem = 'No abstract is detected'
        elif 'title' not in contents:
            problem = 'No title is detected'
        else:
            structure = contents.get('structure') or {}
            introduction_idx_array = structure.get('Introduction') or [-1]
            conclusion_idx_array = structure.get('Conclusion') or [-1]
            if introduction_idx_array[0] < 0:
                problem = 'No introduction is detected'
            elif conclusion_idx_array[0] < 0:
                problem = 'No conclusion is detected'
        if problem is not None:
            # Message format kept from the former assert-based implementation.
            print(f"AssertionError: {problem}")
            return False
        return True

    @staticmethod
    def _response_text(response):
        """Unwrap a (message, extra) failure tuple into its message; pass anything else through."""
        if isinstance(response, tuple) and response:
            return response[0]
        return response

    @staticmethod
    def _is_failure(result):
        """Tell whether ``result`` is one of the 'Failed ...' messages produced on API errors."""
        return isinstance(result, str) and result.startswith('Failed')

    def step(self, messages):
        """Run one LLM call (with retries) and pause afterwards when failing or rate limited."""
        result = self.run(messages=messages)
        if self._is_failure(result):
            time.sleep(self.sleep_time)
        if self.is_rate_limitation:
            print('sleep {} seconds'.format(self.sleep_time))
            time.sleep(self.sleep_time)
        return result

    def run(self, messages):
        """
        Call the model, retrying up to ``max_failure_attempt_each_step`` times.

        A raised exception and a returned 'Failed ...' message both count as a
        failed attempt. Returns the first successful response, or a
        "Failed to generate step after N attempts" message.
        """
        attempts = self.max_failure_attempt_each_step
        last_error = 'no attempt was made'
        for attempt in range(attempts):
            try:
                response = self._response_text(
                    make_api_call(messages=messages, model=self.model, max_tokens=self.max_tokens, temperature=self.temprature)
                )
            except Exception as e:
                last_error = str(e)
            else:
                if not self._is_failure(response):
                    return response
                last_error = response
            if attempt < attempts - 1:
                time.sleep(2)  # Wait 2 seconds before retrying
        return "Failed to generate step after {} attempts. $ERROR$: {}".format(attempts, last_error)

    def _ask(self, instruction: str, user_prompt: str, acknowledgement: str):
        """Run one step with the scholar prompt, a task instruction, the user prompt and a priming reply."""
        messages = [
            {"role": "system", "content": SCHOLAR_PROMPT},
            {"role": "system", "content": instruction},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": acknowledgement}
        ]
        return self.step(messages=messages)

    @staticmethod
    def _parse_outline(raw_text):
        """
        Turn the outline reply into a dict.

        Tries the whole reply as JSON, then the outermost '{...}' span (models
        often wrap JSON in code fences or prose), and finally falls back to
        line-based "key: value" parsing.
        """
        if not isinstance(raw_text, str):
            raw_text = str(raw_text)
        candidates = [raw_text]
        start, end = raw_text.find('{'), raw_text.rfind('}')
        if 0 <= start < end:
            candidates.append(raw_text[start:end + 1])
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except ValueError:
                continue
            if isinstance(parsed, dict):
                return parsed
        return convert_to_dict(input_string=raw_text)

    def abstract_summary(self):
        """
        Extract the outline for the slides from abstract

        Returns a dict whose keys are the outline aspects with surrounding
        whitespace/punctuation trimmed.

        Raises:
        AssertionError: when the title is empty or the abstract has at most 512 characters
        """
        title = self.paper_contents['title']
        abstract = self.paper_contents['abstract']
        if not (len(title) > 0 and len(abstract) > 512):
            raise AssertionError('title must be non-empty and abstract longer than 512 characters')
        prompt = "**title**: {}\n\n**abstract**: {}".format(title, abstract)
        abstract_summary = self._ask(ABSTRACT_SUMMARY_PROMPT, prompt, "I will extract the evidences following my instructions.")
        abstract_summary_dict = self._parse_outline(abstract_summary)
        return {trim_string(k): v for k, v in abstract_summary_dict.items()}

    def support_background(self, background: str, introduction: str):
        """
        Extract support evidences for background from introduction

        Returns (evidences, number of model calls).
        """
        prompt = "**title**: {}\n\n**problem background**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], background, introduction)
        evidences = self._ask(BACKGROUD_EVIDENCE_PROMPT, prompt, "I will extract the evidences following my instructions.")
        return evidences, 1

    def support_research_problem(self, research_problem: str, introduction: str):
        """
        Extract support evidences for research problem from introduction

        Returns (evidences, number of model calls).
        """
        prompt = "**title**: {}\n\n**research problem**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], research_problem, introduction)
        evidences = self._ask(RESEARCH_PROBLEM_PROMPT, prompt, "I will extract the evidences following my instructions.")
        return evidences, 1

    def support_objectives(self, objectives: str, introduction: str):
        """
        Extract support evidences for objectives from introduction

        Returns (evidences, number of model calls).
        """
        prompt = "**title**: {}\n\n**objectives**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], objectives, introduction)
        evidences = self._ask(OBJECTIVE_PROMPT, prompt, "I will extract the evidences following my instructions.")
        return evidences, 1

    def support_conclusion(self, conclusion: str, introduction: str, conclusion_text: str, step_wise=True):
        """
        Expand conclusion based on full-text conclusion and introduction.
        If step_wise = True:
            1. Summarize introduction while focusing on conclusion part
            2. Extract conclusion points from introduction summary and full-context conclusion.
        Otherwise the raw introduction is used in step 2.

        Returns (evidences, number of model calls).
        """
        step_num = 0
        if step_wise:
            prompt = "**title**: {}\n\n**introduction**: {}".format(self.paper_contents['title'], introduction)
            introduction_conclusion_summary = self._ask(
                "Given a **title** and **introduction** of a research paper, summarize and extract conclusion related information in about 200 words.",
                prompt,
                "I will extract the conclusion following my instructions.",
            )
            step_num += 1
        else:
            introduction_conclusion_summary = introduction

        # The conclusion text used to run straight into the '**introduction**'
        # label; each labelled part is now separated by a blank line.
        prompt = "**title**: {}\n\n**brief conclusion**: {}\n\n**conclusion**: {}\n\n**introduction**: {}".format(
            self.paper_contents['title'], conclusion, conclusion_text, introduction_conclusion_summary)
        evidences = self._ask(CONCLUSION_PROMT, prompt, "I will extract the conclusions following my instructions.")
        step_num += 1
        return evidences, step_num

    def _summarize_paragraphs(self, context_messages: list, paragraph_list: list, acknowledgement: str):
        """Ask the model about each paragraph in turn, always on top of the same context messages."""
        follow_instruction = {"role": "assistant", "content": acknowledgement}
        summaries = []
        for paragraph in paragraph_list:
            para_input_prompt = "Paragraph title: {}\n\nContent: {}\n\n".format(paragraph['title'], paragraph['content'])
            user_input = {'role': 'user', 'content': para_input_prompt}
            summaries.append(self.step(messages=context_messages + [user_input, follow_instruction]))
        return summaries

    def _iterative_support(self, prompt_dict: dict, prompt: str, paragraph_list: list,
                           summary_label: str, paragraph_ack: str, final_ack: str):
        """
        Summarize every paragraph, then merge the summaries with the final prompt.

        Returns (merged summary, number of model calls).
        """
        context_messages = [
            {"role": "system", "content": SCHOLAR_PROMPT},
            {"role": "system", "content": prompt_dict['system_instruction']},
            {"role": "user", "content": prompt},
            {"role": "system", "content": prompt_dict['iterative_prompt']},
        ]
        summaries = self._summarize_paragraphs(context_messages, paragraph_list, paragraph_ack)
        summary_prompt = '\n'.join(
            '**{}** {}:\n\n{}'.format(summary_label, idx + 1, summary) for idx, summary in enumerate(summaries)
        )
        merged = self._ask(prompt_dict['final_prompt'], prompt + summary_prompt, final_ack)
        return merged, len(summaries) + 1

    def support_experiment_results(self, main_results: str, paragraph_list: list):
        """
        Collect experimental evidence for ``main_results`` from ``paragraph_list``.

        Returns (result summary, number of model calls).
        """
        prompt = "**title**: {}\n\n**main results**: {}\n\n".format(self.paper_contents['title'], main_results)
        return self._iterative_support(
            RESULT_PROMPT_DICT, prompt, paragraph_list, 'summary',
            "I will extract the experimental information following my instructions.",
            "I will summarize the experimental results following my instructions.",
        )

    def _section_paragraphs(self, structure_key: str):
        """
        Return the introduction followed by the sections listed under ``structure_key``.

        When no section is listed, every section strictly between the
        introduction and the conclusion is used instead.
        """
        structure = self.paper_contents['structure']
        intro_idx = structure['Introduction'][0]
        conclusion_idx = structure['Conclusion'][0]
        indices = list(structure[structure_key])
        if not indices:
            indices = list(range(intro_idx + 1, conclusion_idx))
        main_text = self.paper_contents['main_text']
        if not indices or max(indices) >= len(main_text):
            raise AssertionError('No usable {} sections between introduction and conclusion'.format(structure_key))
        return [main_text[idx] for idx in [intro_idx] + indices]

    def experiment_paragraph_extraction(self,):
        """Return the introduction plus the experiment sections (AssertionError if none are usable)."""
        return self._section_paragraphs('Experiments')

    def support_methodology(self, method_overview: str, paragraph_list: list):
        """
        Collect method details for ``method_overview`` from ``paragraph_list``.

        Returns (method summary, number of model calls).
        """
        prompt = "**title**: {}\n\n**method overview**: {}\n\n".format(self.paper_contents['title'], method_overview)
        return self._iterative_support(
            METHOD_PROMPT_DICT, prompt, paragraph_list, 'method summary',
            "I will extract the method information following my instructions.",
            "I will generate a step-by-step method summary following my instructions.",
        )

    def method_paragraph_extraction(self,):
        """Return the introduction plus the method sections (AssertionError if none are usable)."""
        return self._section_paragraphs('Methods')

    def generate_slides(self, verbose=False, revision=True):
        """
        Build the slide contents for the paper.

        Parameters:
        verbose (bool): print the generated content
        revision (bool): run the final revision pass and return the slides
            parsed back from the revised markdown

        Returns:
        dict: slide title -> content

        Raises:
        AssertionError: when the introduction (<= 512 chars), the conclusion
            (<= 128 chars) or the method/experiment sections are unusable
        """
        ## Step 1: Paper content extraction
        main_text = self.paper_contents['main_text']
        intro_idx = self.paper_contents['structure']['Introduction'][0]
        introduction = main_text[intro_idx]['content']
        if not len(introduction) > 512:
            raise AssertionError('introduction = {}, content = {}'.format(introduction, main_text))
        conclusion_idx = self.paper_contents['structure']['Conclusion'][0]
        conclusion = main_text[conclusion_idx]['content']
        if not len(conclusion) > 128:
            # The message used to print the introduction here.
            raise AssertionError('conclusion = {}, content = {}'.format(conclusion, main_text))
        method_paragraphs = self.method_paragraph_extraction()
        experiment_paragraphs = self.experiment_paragraph_extraction()

        start_time = time.time()
        ## Step 2: slides structure extraction from abstract
        model_call_number = 0
        print('Slides structure generation')
        slides = {'Title': self.paper_contents['title']}
        outline_dict = self.abstract_summary()
        model_call_number += 1
        slides['Outline'] = outline_dict

        print('Slides generation...')
        background = outline_dict.get('Background', '')
        slides['Background'], b_steps = self.support_background(background=background, introduction=introduction)
        model_call_number += b_steps

        research_problem = outline_dict.get('Research problem', '')
        slides['Research problem'], r_steps = self.support_research_problem(research_problem=research_problem, introduction=introduction)
        model_call_number += r_steps

        objectives = outline_dict.get('Objectives', '')
        slides['Objectives'], o_steps = self.support_objectives(objectives=objectives, introduction=introduction)
        model_call_number += o_steps

        brief_conclusion = outline_dict.get('Conclusions', '')
        slides['Conclusions'], c_steps = self.support_conclusion(conclusion=brief_conclusion, introduction=introduction, conclusion_text=conclusion, step_wise=True)
        model_call_number += c_steps

        results = outline_dict.get('Results', '')
        result_summary, res_steps = self.support_experiment_results(main_results=results, paragraph_list=experiment_paragraphs)
        slides['Results'] = result_summary
        model_call_number += res_steps

        methodology = outline_dict.get('Methodology', '')
        method_summary, m_steps = self.support_methodology(method_overview=methodology, paragraph_list=method_paragraphs)
        model_call_number += m_steps
        slides['Methodology'] = method_summary
        runtime = time.time() - start_time
        print('Slide generation takes {:.4f} seconds with {} function calls'.format(runtime, model_call_number))

        # Needed by the revision pass as well as by verbose output, so it is
        # built unconditionally (it was unbound with verbose=False, revision=True).
        slides_content = self.slides2markdown_v2(slides=slides)
        if revision:
            slides_content = self.slides_revision(slide_content=slides_content)
            slides_array = markdown_to_slide_dicts(full_markdown=slides_content)
            revised_slides = {k: v for d in slides_array for k, v in d.items()}
            if verbose:
                print('Json format:\n{}'.format(json.dumps(revised_slides, indent=4)))
                print('\n' * 3)
                print('paper keywords:\n{}'.format(self.paper_contents.keys()))
            return revised_slides
        if verbose:
            print('Generated slides:\n{}'.format(slides_content))
            print('Json format:\n{}'.format(json.dumps(slides, indent=4)))
        return slides

    def slides_revision(self, slide_content: str):
        """
        Ask the revision model to polish the markdown slides.

        Returns the revised markdown, or ``slide_content`` unchanged when the
        call fails, so that an error message is never parsed as slides.
        """
        messages = [
            {"role": "system", "content": SLIDES_REVISION_PROMPT},
            {"role": "user", "content": slide_content},
            {"role": "assistant", "content": "I will revise the representation slides following my instructions."}
        ]
        print('Slides final revision')
        revised_slides = self._response_text(
            make_api_call(model=self.revise_model, messages=messages, max_tokens=2048, temperature=self.temprature)
        )
        if not isinstance(revised_slides, str) or self._is_failure(revised_slides):
            print('Slides revision failed, keeping the unrevised slides: {}'.format(revised_slides))
            return slide_content
        return revised_slides

    def slides2markdown(self, slides: dict):
        """Render the title, the outline and each outlined section as SLIDE_SEP-separated markdown."""
        separator_line = '{}\n'.format(SLIDE_SEP)
        outline_dict = slides['Outline']
        parts = ['**Title**\n{}\n\n'.format(slides['Title']), separator_line, '**Outline**\n\n']
        parts.extend('{}\n--\t\t{}\n\n'.format(sect_name, sect_content) for sect_name, sect_content in outline_dict.items())
        parts.append(separator_line)
        for sect_name in outline_dict:
            if sect_name in slides:
                parts.append('**{}**\n\n'.format(sect_name))
                parts.append('{}\n\n'.format(slides[sect_name]))
                parts.append(separator_line)
        return ''.join(parts)

    def slides2markdown_v2(self, slides: dict, indent=0):
        """Render the slides dict as a markdown list via ``dict_to_markdown_list``."""
        return dict_to_markdown_list(d=slides, indent=indent)

    def save_to_slides(self, slides: dict, logo_path='logo.png', file_name='slides.pptx'):
        """
        Write ``slides`` to a PowerPoint file and return its absolute path.

        The first entry of the paper's 'author' list is passed as the authors
        line; None is passed when the entry is missing, empty or not a list.
        """
        authors = self.paper_contents.get('author', None)
        # Guard against an empty list, which used to raise IndexError.
        authors = authors[0] if isinstance(authors, list) and authors else None
        dict2ppt = Dict2PPT(logo_path=logo_path)
        dict2ppt.build_slides(slide_dict=slides, authors=authors)
        dict2ppt.save(file_name=file_name)
        return os.path.abspath(file_name)
703
-