Spaces:
Running
Running
wangrongsheng
committed on
Commit
·
3b1dde8
1
Parent(s):
888dec6
优化:增加token使用统计
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ import fitz, io, os
|
|
11 |
from PIL import Image
|
12 |
import gradio
|
13 |
import markdown
|
|
|
|
|
14 |
|
15 |
def parse_text(text):
|
16 |
lines = text.split("\n")
|
@@ -319,6 +321,8 @@ class Reader:
|
|
319 |
self.gitee_key = self.config.get('Gitee', 'api')
|
320 |
else:
|
321 |
self.gitee_key = ''
|
|
|
|
|
322 |
|
323 |
def get_arxiv(self, max_results=30):
|
324 |
search = arxiv.Search(query=self.query,
|
@@ -436,6 +440,9 @@ class Reader:
|
|
436 |
|
437 |
def summary_with_chat(self, paper_list, key):
|
438 |
htmls = []
|
|
|
|
|
|
|
439 |
for paper_index, paper in enumerate(paper_list):
|
440 |
# 第一步先用title,abs,和introduction进行总结。
|
441 |
text = ''
|
@@ -444,23 +451,12 @@ class Reader:
|
|
444 |
text += 'Abstrat:' + paper.abs
|
445 |
# intro
|
446 |
text += list(paper.section_text_dict.values())[0]
|
447 |
-
max_token = 2500 * 4
|
448 |
-
text = text[:max_token]
|
449 |
-
chat_summary_text = self.chat_summary(text=text, key=str(key))
|
450 |
htmls.append(chat_summary_text)
|
451 |
|
452 |
# TODO 往md文档中插入论文里的像素最大的一张图片,这个方案可以弄的更加智能一些:
|
453 |
-
first_image, ext = paper.get_image_path()
|
454 |
-
if first_image is None or self.gitee_key == '':
|
455 |
-
pass
|
456 |
-
else:
|
457 |
-
image_title = self.validateTitle(paper.title)
|
458 |
-
image_url = self.upload_gitee(image_path=first_image, image_name=image_title, ext=ext)
|
459 |
-
htmls.append("\n")
|
460 |
-
htmls.append("![Fig]("+image_url+")")
|
461 |
-
htmls.append("\n")
|
462 |
-
# 第二步总结方法:
|
463 |
-
# TODO,由于有些文章的方法章节名是算法名,所以简单的通过关键词来筛选,很难获取,后面需要用其他的方案去优化。
|
464 |
method_key = ''
|
465 |
for parse_key in paper.section_text_dict.keys():
|
466 |
if 'method' in parse_key.lower() or 'approach' in parse_key.lower():
|
@@ -473,12 +469,9 @@ class Reader:
|
|
473 |
summary_text = ''
|
474 |
summary_text += "<summary>" + chat_summary_text
|
475 |
# methods
|
476 |
-
method_text += paper.section_text_dict[method_key]
|
477 |
-
|
478 |
-
|
479 |
-
text = summary_text + "\n <Methods>:\n" + method_text
|
480 |
-
text = text[:max_token]
|
481 |
-
chat_method_text = self.chat_method(text=text, key=str(key))
|
482 |
htmls.append(chat_method_text)
|
483 |
else:
|
484 |
chat_method_text = ''
|
@@ -497,18 +490,27 @@ class Reader:
|
|
497 |
summary_text += "<summary>" + chat_summary_text + "\n <Method summary>:\n" + chat_method_text
|
498 |
if conclusion_key != '':
|
499 |
# conclusion
|
500 |
-
conclusion_text += paper.section_text_dict[conclusion_key]
|
501 |
-
max_token = 2500 * 4
|
502 |
text = summary_text + "\n <Conclusion>:\n" + conclusion_text
|
503 |
else:
|
504 |
-
text = summary_text
|
505 |
-
|
506 |
-
chat_conclusion_text = self.chat_conclusion(text=text, key=str(key))
|
507 |
htmls.append(chat_conclusion_text)
|
508 |
htmls.append("\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
509 |
md_text = "\n".join(htmls)
|
510 |
|
511 |
-
return markdown.markdown(md_text)
|
512 |
|
513 |
|
514 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
@@ -516,12 +518,16 @@ class Reader:
|
|
516 |
reraise=True)
|
517 |
def chat_conclusion(self, text, key):
|
518 |
openai.api_key = key
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
|
|
|
|
|
|
|
|
525 |
{"role": "user", "content": """
|
526 |
8. Make the following summary.Be sure to use Chinese answers (proper nouns need to be marked in English).
|
527 |
- (1):What is the significance of this piece of work?
|
@@ -535,23 +541,39 @@ class Reader:
|
|
535 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
536 |
"""},
|
537 |
]
|
|
|
|
|
|
|
|
|
538 |
)
|
|
|
539 |
result = ''
|
540 |
for choice in response.choices:
|
541 |
result += choice.message.content
|
542 |
-
print("
|
543 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
544 |
|
545 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
546 |
stop=tenacity.stop_after_attempt(5),
|
547 |
reraise=True)
|
548 |
def chat_method(self, text, key):
|
549 |
openai.api_key = key
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
|
|
|
|
|
|
|
|
555 |
{"role": "user", "content": """
|
556 |
7. Describe in detail the methodological idea of this article. Be sure to use Chinese answers (proper nouns need to be marked in English). For example, its steps are.
|
557 |
- (1):...
|
@@ -568,23 +590,39 @@ class Reader:
|
|
568 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
569 |
"""},
|
570 |
]
|
|
|
|
|
|
|
571 |
)
|
|
|
572 |
result = ''
|
573 |
for choice in response.choices:
|
574 |
result += choice.message.content
|
575 |
print("method_result:\n", result)
|
576 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
577 |
|
578 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
579 |
stop=tenacity.stop_after_attempt(5),
|
580 |
reraise=True)
|
581 |
def chat_summary(self, text, key):
|
582 |
openai.api_key = key
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
|
|
|
|
|
|
|
|
588 |
{"role": "user", "content": """
|
589 |
1. Mark the title of the paper (with Chinese translation)
|
590 |
2. list all the authors' names (use English)
|
@@ -611,12 +649,25 @@ class Reader:
|
|
611 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not have too much repetitive information, numerical values using the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.
|
612 |
"""},
|
613 |
]
|
|
|
|
|
|
|
|
|
614 |
)
|
|
|
615 |
result = ''
|
616 |
for choice in response.choices:
|
617 |
result += choice.message.content
|
618 |
print("summary_result:\n", result)
|
619 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
620 |
|
621 |
def export_to_markdown(self, text, file_name, mode='w'):
|
622 |
# 使用markdown模块的convert方法,将文本转换为html格式
|
@@ -644,8 +695,8 @@ def upload_pdf(key, text, file):
|
|
644 |
paper_list = [Paper(path=file, sl=section_list)]
|
645 |
# 创建一个Reader对象
|
646 |
reader = Reader()
|
647 |
-
sum_info = reader.summary_with_chat(paper_list=paper_list, key=key)
|
648 |
-
return sum_info
|
649 |
|
650 |
api_title = "api-key可用验证"
|
651 |
api_description = '''<div align='left'>
|
@@ -692,7 +743,7 @@ ip = [
|
|
692 |
gradio.inputs.File(label="请上传论文PDF(必填)")
|
693 |
]
|
694 |
|
695 |
-
chatpaper_gui = gradio.Interface(fn=upload_pdf, inputs=ip, outputs="html", title=title, description=description)
|
696 |
|
697 |
# Start server
|
698 |
gui = gradio.TabbedInterface(interface_list=[api_gui, chatpaper_gui], tab_names=["API-key", "ChatPaper"])
|
|
|
11 |
from PIL import Image
|
12 |
import gradio
|
13 |
import markdown
|
14 |
+
import json
|
15 |
+
import tiktoken
|
16 |
|
17 |
def parse_text(text):
|
18 |
lines = text.split("\n")
|
|
|
321 |
self.gitee_key = self.config.get('Gitee', 'api')
|
322 |
else:
|
323 |
self.gitee_key = ''
|
324 |
+
self.max_token_num = 4096
|
325 |
+
self.encoding = tiktoken.get_encoding("gpt2")
|
326 |
|
327 |
def get_arxiv(self, max_results=30):
|
328 |
search = arxiv.Search(query=self.query,
|
|
|
440 |
|
441 |
def summary_with_chat(self, paper_list, key):
|
442 |
htmls = []
|
443 |
+
utoken = 0
|
444 |
+
ctoken = 0
|
445 |
+
ttoken = 0
|
446 |
for paper_index, paper in enumerate(paper_list):
|
447 |
# 第一步先用title,abs,和introduction进行总结。
|
448 |
text = ''
|
|
|
451 |
text += 'Abstrat:' + paper.abs
|
452 |
# intro
|
453 |
text += list(paper.section_text_dict.values())[0]
|
454 |
+
#max_token = 2500 * 4
|
455 |
+
#text = text[:max_token]
|
456 |
+
chat_summary_text, utoken1, ctoken1, ttoken1 = self.chat_summary(text=text, key=str(key))
|
457 |
htmls.append(chat_summary_text)
|
458 |
|
459 |
# TODO 往md文档中插入论文里的像素最大的一张图片,这个方案可以弄的更加智能一些:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
460 |
method_key = ''
|
461 |
for parse_key in paper.section_text_dict.keys():
|
462 |
if 'method' in parse_key.lower() or 'approach' in parse_key.lower():
|
|
|
469 |
summary_text = ''
|
470 |
summary_text += "<summary>" + chat_summary_text
|
471 |
# methods
|
472 |
+
method_text += paper.section_text_dict[method_key]
|
473 |
+
text = summary_text + "\n<Methods>:\n" + method_text
|
474 |
+
chat_method_text, utoken2, ctoken2, ttoken2 = self.chat_method(text=text)
|
|
|
|
|
|
|
475 |
htmls.append(chat_method_text)
|
476 |
else:
|
477 |
chat_method_text = ''
|
|
|
490 |
summary_text += "<summary>" + chat_summary_text + "\n <Method summary>:\n" + chat_method_text
|
491 |
if conclusion_key != '':
|
492 |
# conclusion
|
493 |
+
conclusion_text += paper.section_text_dict[conclusion_key]
|
|
|
494 |
text = summary_text + "\n <Conclusion>:\n" + conclusion_text
|
495 |
else:
|
496 |
+
text = summary_text
|
497 |
+
chat_conclusion_text, utoken3, ctoken3, ttoken3 = self.chat_conclusion(text=text)
|
|
|
498 |
htmls.append(chat_conclusion_text)
|
499 |
htmls.append("\n")
|
500 |
+
# token统计
|
501 |
+
utoken = utoken + utoken1 + utoken2 + utoken3
|
502 |
+
ctoken = ctoken + ctoken1 + ctoken2 + ctoken3
|
503 |
+
ttoken = ttoken + ttoken1 + ttoken2 + ttoken3
|
504 |
+
cost = (ttoken / 1000) * 0.002
|
505 |
+
pos_count = {
|
506 |
+
"usage_token_used": str(utoken),
|
507 |
+
"completion_token_used": str(ctoken),
|
508 |
+
"total_token_used": str(ttoken),
|
509 |
+
"cost": str(cost),
|
510 |
+
}
|
511 |
md_text = "\n".join(htmls)
|
512 |
|
513 |
+
return markdown.markdown(md_text), pos_count
|
514 |
|
515 |
|
516 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
|
|
518 |
reraise=True)
|
519 |
def chat_conclusion(self, text, key):
|
520 |
openai.api_key = key
|
521 |
+
self.cur_api += 1
|
522 |
+
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
|
523 |
+
conclusion_prompt_token = 650
|
524 |
+
text_token = len(self.encoding.encode(text))
|
525 |
+
clip_text_index = int(len(text)*(self.max_token_num-conclusion_prompt_token)/text_token)
|
526 |
+
clip_text = text[:clip_text_index]
|
527 |
+
|
528 |
+
messages=[
|
529 |
+
{"role": "system", "content": "You are a reviewer in the field of ["+self.key_word+"] and you need to critically review this article"}, # chatgpt 角色
|
530 |
+
{"role": "assistant", "content": "This is the <summary> and <conclusion> part of an English literature, where <summary> you have already summarized, but <conclusion> part, I need your help to summarize the following questions:"+clip_text}, # 背景知识,可以参考OpenReview的审稿流程
|
531 |
{"role": "user", "content": """
|
532 |
8. Make the following summary.Be sure to use Chinese answers (proper nouns need to be marked in English).
|
533 |
- (1):What is the significance of this piece of work?
|
|
|
541 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
542 |
"""},
|
543 |
]
|
544 |
+
response = openai.ChatCompletion.create(
|
545 |
+
model="gpt-3.5-turbo",
|
546 |
+
# prompt需要用英语替换,少占用token。
|
547 |
+
messages=messages,
|
548 |
)
|
549 |
+
|
550 |
result = ''
|
551 |
for choice in response.choices:
|
552 |
result += choice.message.content
|
553 |
+
#print("prompt_token_used:", response.usage.prompt_tokens,
|
554 |
+
# "completion_token_used:", response.usage.completion_tokens,
|
555 |
+
# "total_token_used:", response.usage.total_tokens)
|
556 |
+
#print("response_time:", response.response_ms/1000.0, 's')
|
557 |
+
usage_token = response.usage.prompt_tokens
|
558 |
+
com_token = response.usage.completion_tokens
|
559 |
+
total_token = response.usage.total_tokens
|
560 |
+
|
561 |
+
return result, usage_token, com_token, total_token
|
562 |
|
563 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
564 |
stop=tenacity.stop_after_attempt(5),
|
565 |
reraise=True)
|
566 |
def chat_method(self, text, key):
|
567 |
openai.api_key = key
|
568 |
+
self.cur_api += 1
|
569 |
+
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
|
570 |
+
method_prompt_token = 650
|
571 |
+
text_token = len(self.encoding.encode(text))
|
572 |
+
clip_text_index = int(len(text)*(self.max_token_num-method_prompt_token)/text_token)
|
573 |
+
clip_text = text[:clip_text_index]
|
574 |
+
messages=[
|
575 |
+
{"role": "system", "content": "You are a researcher in the field of ["+self.key_word+"] who is good at summarizing papers using concise statements"}, # chatgpt 角色
|
576 |
+
{"role": "assistant", "content": "This is the <summary> and <Method> part of an English document, where <summary> you have summarized, but the <Methods> part, I need your help to read and summarize the following questions."+clip_text}, # 背景知识
|
577 |
{"role": "user", "content": """
|
578 |
7. Describe in detail the methodological idea of this article. Be sure to use Chinese answers (proper nouns need to be marked in English). For example, its steps are.
|
579 |
- (1):...
|
|
|
590 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
|
591 |
"""},
|
592 |
]
|
593 |
+
response = openai.ChatCompletion.create(
|
594 |
+
model="gpt-3.5-turbo",
|
595 |
+
messages=messages,
|
596 |
)
|
597 |
+
|
598 |
result = ''
|
599 |
for choice in response.choices:
|
600 |
result += choice.message.content
|
601 |
print("method_result:\n", result)
|
602 |
+
#print("prompt_token_used:", response.usage.prompt_tokens,
|
603 |
+
# "completion_token_used:", response.usage.completion_tokens,
|
604 |
+
# "total_token_used:", response.usage.total_tokens)
|
605 |
+
#print("response_time:", response.response_ms/1000.0, 's')
|
606 |
+
usage_token = response.usage.prompt_tokens
|
607 |
+
com_token = response.usage.completion_tokens
|
608 |
+
total_token = response.usage.total_tokens
|
609 |
+
|
610 |
+
return result, usage_token, com_token, total_token
|
611 |
|
612 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
613 |
stop=tenacity.stop_after_attempt(5),
|
614 |
reraise=True)
|
615 |
def chat_summary(self, text, key):
|
616 |
openai.api_key = key
|
617 |
+
self.cur_api += 1
|
618 |
+
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
|
619 |
+
summary_prompt_token = 1000
|
620 |
+
text_token = len(self.encoding.encode(text))
|
621 |
+
clip_text_index = int(len(text)*(self.max_token_num-summary_prompt_token)/text_token)
|
622 |
+
clip_text = text[:clip_text_index]
|
623 |
+
messages=[
|
624 |
+
{"role": "system", "content": "You are a researcher in the field of ["+self.key_word+"] who is good at summarizing papers using concise statements"},
|
625 |
+
{"role": "assistant", "content": "This is the title, author, link, abstract and introduction of an English document. I need your help to read and summarize the following questions: "+clip_text},
|
626 |
{"role": "user", "content": """
|
627 |
1. Mark the title of the paper (with Chinese translation)
|
628 |
2. list all the authors' names (use English)
|
|
|
649 |
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not have too much repetitive information, numerical values using the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.
|
650 |
"""},
|
651 |
]
|
652 |
+
|
653 |
+
response = openai.ChatCompletion.create(
|
654 |
+
model="gpt-3.5-turbo",
|
655 |
+
messages=messages,
|
656 |
)
|
657 |
+
|
658 |
result = ''
|
659 |
for choice in response.choices:
|
660 |
result += choice.message.content
|
661 |
print("summary_result:\n", result)
|
662 |
+
#print("prompt_token_used:", response.usage.prompt_tokens,
|
663 |
+
# "completion_token_used:", response.usage.completion_tokens,
|
664 |
+
# "total_token_used:", response.usage.total_tokens)
|
665 |
+
#print("response_time:", response.response_ms/1000.0, 's')
|
666 |
+
usage_token = response.usage.prompt_tokens
|
667 |
+
com_token = response.usage.completion_tokens
|
668 |
+
total_token = response.usage.total_tokens
|
669 |
+
|
670 |
+
return result, usage_token, com_token, total_token
|
671 |
|
672 |
def export_to_markdown(self, text, file_name, mode='w'):
|
673 |
# 使用markdown模块的convert方法,将文本转换为html格式
|
|
|
695 |
paper_list = [Paper(path=file, sl=section_list)]
|
696 |
# 创建一个Reader对象
|
697 |
reader = Reader()
|
698 |
+
sum_info, cost = reader.summary_with_chat(paper_list=paper_list, key=key)
|
699 |
+
return cost, sum_info
|
700 |
|
701 |
api_title = "api-key可用验证"
|
702 |
api_description = '''<div align='left'>
|
|
|
743 |
gradio.inputs.File(label="请上传论文PDF(必填)")
|
744 |
]
|
745 |
|
746 |
+
chatpaper_gui = gradio.Interface(fn=upload_pdf, inputs=ip, outputs=["json", "html"], title=title, description=description)
|
747 |
|
748 |
# Start server
|
749 |
gui = gradio.TabbedInterface(interface_list=[api_gui, chatpaper_gui], tab_names=["API-key", "ChatPaper"])
|