wangrongsheng committed on
Commit
3b1dde8
·
1 Parent(s): 888dec6

优化:增加token使用统计

Browse files
Files changed (1) hide show
  1. app.py +100 -49
app.py CHANGED
@@ -11,6 +11,8 @@ import fitz, io, os
11
  from PIL import Image
12
  import gradio
13
  import markdown
 
 
14
 
15
  def parse_text(text):
16
  lines = text.split("\n")
@@ -319,6 +321,8 @@ class Reader:
319
  self.gitee_key = self.config.get('Gitee', 'api')
320
  else:
321
  self.gitee_key = ''
 
 
322
 
323
  def get_arxiv(self, max_results=30):
324
  search = arxiv.Search(query=self.query,
@@ -436,6 +440,9 @@ class Reader:
436
 
437
  def summary_with_chat(self, paper_list, key):
438
  htmls = []
 
 
 
439
  for paper_index, paper in enumerate(paper_list):
440
  # 第一步先用title,abs,和introduction进行总结。
441
  text = ''
@@ -444,23 +451,12 @@ class Reader:
444
  text += 'Abstrat:' + paper.abs
445
  # intro
446
  text += list(paper.section_text_dict.values())[0]
447
- max_token = 2500 * 4
448
- text = text[:max_token]
449
- chat_summary_text = self.chat_summary(text=text, key=str(key))
450
  htmls.append(chat_summary_text)
451
 
452
  # TODO 往md文档中插入论文里的像素最大的一张图片,这个方案可以弄的更加智能一些:
453
- first_image, ext = paper.get_image_path()
454
- if first_image is None or self.gitee_key == '':
455
- pass
456
- else:
457
- image_title = self.validateTitle(paper.title)
458
- image_url = self.upload_gitee(image_path=first_image, image_name=image_title, ext=ext)
459
- htmls.append("\n")
460
- htmls.append("![Fig]("+image_url+")")
461
- htmls.append("\n")
462
- # 第二步总结方法:
463
- # TODO,由于有些文章的方法章节名是算法名,所以简单的通过关键词来筛选,很难获取,后面需要用其他的方案去优化。
464
  method_key = ''
465
  for parse_key in paper.section_text_dict.keys():
466
  if 'method' in parse_key.lower() or 'approach' in parse_key.lower():
@@ -473,12 +469,9 @@ class Reader:
473
  summary_text = ''
474
  summary_text += "<summary>" + chat_summary_text
475
  # methods
476
- method_text += paper.section_text_dict[method_key]
477
- # TODO 把这个变成tenacity的自动判别!
478
- max_token = 2500 * 4
479
- text = summary_text + "\n <Methods>:\n" + method_text
480
- text = text[:max_token]
481
- chat_method_text = self.chat_method(text=text, key=str(key))
482
  htmls.append(chat_method_text)
483
  else:
484
  chat_method_text = ''
@@ -497,18 +490,27 @@ class Reader:
497
  summary_text += "<summary>" + chat_summary_text + "\n <Method summary>:\n" + chat_method_text
498
  if conclusion_key != '':
499
  # conclusion
500
- conclusion_text += paper.section_text_dict[conclusion_key]
501
- max_token = 2500 * 4
502
  text = summary_text + "\n <Conclusion>:\n" + conclusion_text
503
  else:
504
- text = summary_text
505
- text = text[:max_token]
506
- chat_conclusion_text = self.chat_conclusion(text=text, key=str(key))
507
  htmls.append(chat_conclusion_text)
508
  htmls.append("\n")
 
 
 
 
 
 
 
 
 
 
 
509
  md_text = "\n".join(htmls)
510
 
511
- return markdown.markdown(md_text)
512
 
513
 
514
  @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
@@ -516,12 +518,16 @@ class Reader:
516
  reraise=True)
517
  def chat_conclusion(self, text, key):
518
  openai.api_key = key
519
- response = openai.ChatCompletion.create(
520
- model="gpt-3.5-turbo",
521
- # prompt需要用英语替换,少占用token。
522
- messages=[
523
- {"role": "system", "content": "你是一个["+self.key_word+"]领域的审稿人,你需要严格评审这篇文章"}, # chatgpt 角色
524
- {"role": "assistant", "content": "这是一篇英文文献的<summary>和<conclusion>部分内容,其中<summary>你已经总结好了,但是<conclusion>部分,我需要你帮忙归纳下面问题:"+text}, # 背景知识,可以参考OpenReview的审稿流程
 
 
 
 
525
  {"role": "user", "content": """
526
  8. Make the following summary.Be sure to use Chinese answers (proper nouns need to be marked in English).
527
  - (1):What is the significance of this piece of work?
@@ -535,23 +541,39 @@ class Reader:
535
  Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
536
  """},
537
  ]
 
 
 
 
538
  )
 
539
  result = ''
540
  for choice in response.choices:
541
  result += choice.message.content
542
- print("conclusion_result:\n", result)
543
- return result
 
 
 
 
 
 
 
544
 
545
  @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
546
  stop=tenacity.stop_after_attempt(5),
547
  reraise=True)
548
  def chat_method(self, text, key):
549
  openai.api_key = key
550
- response = openai.ChatCompletion.create(
551
- model="gpt-3.5-turbo",
552
- messages=[
553
- {"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"}, # chatgpt 角色
554
- {"role": "assistant", "content": "这是一篇英文文献的<summary>和<Method>部分内容,其中<summary>你已经总结好了,但是<Methods>部分,我需要你帮忙阅读并归纳下面问题:"+text}, # 背景知识
 
 
 
 
555
  {"role": "user", "content": """
556
  7. Describe in detail the methodological idea of this article. Be sure to use Chinese answers (proper nouns need to be marked in English). For example, its steps are.
557
  - (1):...
@@ -568,23 +590,39 @@ class Reader:
568
  Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
569
  """},
570
  ]
 
 
 
571
  )
 
572
  result = ''
573
  for choice in response.choices:
574
  result += choice.message.content
575
  print("method_result:\n", result)
576
- return result
 
 
 
 
 
 
 
 
577
 
578
  @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
579
  stop=tenacity.stop_after_attempt(5),
580
  reraise=True)
581
  def chat_summary(self, text, key):
582
  openai.api_key = key
583
- response = openai.ChatCompletion.create(
584
- model="gpt-3.5-turbo",
585
- messages=[
586
- {"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"}, # chatgpt 角色
587
- {"role": "assistant", "content": "这是一篇英文文献的标题,作者,链接,Abstract和Introduction部分内容,我需要你帮忙阅读并归纳下面问题:"+text}, # 背景知识
 
 
 
 
588
  {"role": "user", "content": """
589
  1. Mark the title of the paper (with Chinese translation)
590
  2. list all the authors' names (use English)
@@ -611,12 +649,25 @@ class Reader:
611
  Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not have too much repetitive information, numerical values using the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.
612
  """},
613
  ]
 
 
 
 
614
  )
 
615
  result = ''
616
  for choice in response.choices:
617
  result += choice.message.content
618
  print("summary_result:\n", result)
619
- return result
 
 
 
 
 
 
 
 
620
 
621
  def export_to_markdown(self, text, file_name, mode='w'):
622
  # 使用markdown模块的convert方法,将文本转换为html格式
@@ -644,8 +695,8 @@ def upload_pdf(key, text, file):
644
  paper_list = [Paper(path=file, sl=section_list)]
645
  # 创建一个Reader对象
646
  reader = Reader()
647
- sum_info = reader.summary_with_chat(paper_list=paper_list, key=key)
648
- return sum_info
649
 
650
  api_title = "api-key可用验证"
651
  api_description = '''<div align='left'>
@@ -692,7 +743,7 @@ ip = [
692
  gradio.inputs.File(label="请上传论文PDF(必填)")
693
  ]
694
 
695
- chatpaper_gui = gradio.Interface(fn=upload_pdf, inputs=ip, outputs="html", title=title, description=description)
696
 
697
  # Start server
698
  gui = gradio.TabbedInterface(interface_list=[api_gui, chatpaper_gui], tab_names=["API-key", "ChatPaper"])
 
11
  from PIL import Image
12
  import gradio
13
  import markdown
14
+ import json
15
+ import tiktoken
16
 
17
  def parse_text(text):
18
  lines = text.split("\n")
 
321
  self.gitee_key = self.config.get('Gitee', 'api')
322
  else:
323
  self.gitee_key = ''
324
+ self.max_token_num = 4096
325
+ self.encoding = tiktoken.get_encoding("gpt2")
326
 
327
  def get_arxiv(self, max_results=30):
328
  search = arxiv.Search(query=self.query,
 
440
 
441
  def summary_with_chat(self, paper_list, key):
442
  htmls = []
443
+ utoken = 0
444
+ ctoken = 0
445
+ ttoken = 0
446
  for paper_index, paper in enumerate(paper_list):
447
  # 第一步先用title,abs,和introduction进行总结。
448
  text = ''
 
451
  text += 'Abstrat:' + paper.abs
452
  # intro
453
  text += list(paper.section_text_dict.values())[0]
454
+ #max_token = 2500 * 4
455
+ #text = text[:max_token]
456
+ chat_summary_text, utoken1, ctoken1, ttoken1 = self.chat_summary(text=text, key=str(key))
457
  htmls.append(chat_summary_text)
458
 
459
  # TODO 往md文档中插入论文里的像素最大的一张图片,这个方案可以弄的更加智能一些:
 
 
 
 
 
 
 
 
 
 
 
460
  method_key = ''
461
  for parse_key in paper.section_text_dict.keys():
462
  if 'method' in parse_key.lower() or 'approach' in parse_key.lower():
 
469
  summary_text = ''
470
  summary_text += "<summary>" + chat_summary_text
471
  # methods
472
+ method_text += paper.section_text_dict[method_key]
473
+ text = summary_text + "\n<Methods>:\n" + method_text
474
+ chat_method_text, utoken2, ctoken2, ttoken2 = self.chat_method(text=text)
 
 
 
475
  htmls.append(chat_method_text)
476
  else:
477
  chat_method_text = ''
 
490
  summary_text += "<summary>" + chat_summary_text + "\n <Method summary>:\n" + chat_method_text
491
  if conclusion_key != '':
492
  # conclusion
493
+ conclusion_text += paper.section_text_dict[conclusion_key]
 
494
  text = summary_text + "\n <Conclusion>:\n" + conclusion_text
495
  else:
496
+ text = summary_text
497
+ chat_conclusion_text, utoken3, ctoken3, ttoken3 = self.chat_conclusion(text=text)
 
498
  htmls.append(chat_conclusion_text)
499
  htmls.append("\n")
500
+ # token统计
501
+ utoken = utoken + utoken1 + utoken2 + utoken3
502
+ ctoken = ctoken + ctoken1 + ctoken2 + ctoken3
503
+ ttoken = ttoken + ttoken1 + ttoken2 + ttoken3
504
+ cost = (ttoken / 1000) * 0.002
505
+ pos_count = {
506
+ "usage_token_used": str(utoken),
507
+ "completion_token_used": str(ctoken),
508
+ "total_token_used": str(ttoken),
509
+ "cost": str(cost),
510
+ }
511
  md_text = "\n".join(htmls)
512
 
513
+ return markdown.markdown(md_text), pos_count
514
 
515
 
516
  @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
 
518
  reraise=True)
519
  def chat_conclusion(self, text, key):
520
  openai.api_key = key
521
+ self.cur_api += 1
522
+ self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
523
+ conclusion_prompt_token = 650
524
+ text_token = len(self.encoding.encode(text))
525
+ clip_text_index = int(len(text)*(self.max_token_num-conclusion_prompt_token)/text_token)
526
+ clip_text = text[:clip_text_index]
527
+
528
+ messages=[
529
+ {"role": "system", "content": "You are a reviewer in the field of ["+self.key_word+"] and you need to critically review this article"}, # chatgpt 角色
530
+ {"role": "assistant", "content": "This is the <summary> and <conclusion> part of an English literature, where <summary> you have already summarized, but <conclusion> part, I need your help to summarize the following questions:"+clip_text}, # 背景知识,可以参考OpenReview的审稿流程
531
  {"role": "user", "content": """
532
  8. Make the following summary.Be sure to use Chinese answers (proper nouns need to be marked in English).
533
  - (1):What is the significance of this piece of work?
 
541
  Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
542
  """},
543
  ]
544
+ response = openai.ChatCompletion.create(
545
+ model="gpt-3.5-turbo",
546
+ # prompt需要用英语替换,少占用token。
547
+ messages=messages,
548
  )
549
+
550
  result = ''
551
  for choice in response.choices:
552
  result += choice.message.content
553
+ #print("prompt_token_used:", response.usage.prompt_tokens,
554
+ # "completion_token_used:", response.usage.completion_tokens,
555
+ # "total_token_used:", response.usage.total_tokens)
556
+ #print("response_time:", response.response_ms/1000.0, 's')
557
+ usage_token = response.usage.prompt_tokens
558
+ com_token = response.usage.completion_tokens
559
+ total_token = response.usage.total_tokens
560
+
561
+ return result, usage_token, com_token, total_token
562
 
563
  @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
564
  stop=tenacity.stop_after_attempt(5),
565
  reraise=True)
566
  def chat_method(self, text, key):
567
  openai.api_key = key
568
+ self.cur_api += 1
569
+ self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
570
+ method_prompt_token = 650
571
+ text_token = len(self.encoding.encode(text))
572
+ clip_text_index = int(len(text)*(self.max_token_num-method_prompt_token)/text_token)
573
+ clip_text = text[:clip_text_index]
574
+ messages=[
575
+ {"role": "system", "content": "You are a researcher in the field of ["+self.key_word+"] who is good at summarizing papers using concise statements"}, # chatgpt 角色
576
+ {"role": "assistant", "content": "This is the <summary> and <Method> part of an English document, where <summary> you have summarized, but the <Methods> part, I need your help to read and summarize the following questions."+clip_text}, # 背景知识
577
  {"role": "user", "content": """
578
  7. Describe in detail the methodological idea of this article. Be sure to use Chinese answers (proper nouns need to be marked in English). For example, its steps are.
579
  - (1):...
 
590
  Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not repeat the content of the previous <summary>, the value of the use of the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed, ....... means fill in according to the actual requirements, if not, you can not write.
591
  """},
592
  ]
593
+ response = openai.ChatCompletion.create(
594
+ model="gpt-3.5-turbo",
595
+ messages=messages,
596
  )
597
+
598
  result = ''
599
  for choice in response.choices:
600
  result += choice.message.content
601
  print("method_result:\n", result)
602
+ #print("prompt_token_used:", response.usage.prompt_tokens,
603
+ # "completion_token_used:", response.usage.completion_tokens,
604
+ # "total_token_used:", response.usage.total_tokens)
605
+ #print("response_time:", response.response_ms/1000.0, 's')
606
+ usage_token = response.usage.prompt_tokens
607
+ com_token = response.usage.completion_tokens
608
+ total_token = response.usage.total_tokens
609
+
610
+ return result, usage_token, com_token, total_token
611
 
612
  @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
613
  stop=tenacity.stop_after_attempt(5),
614
  reraise=True)
615
  def chat_summary(self, text, key):
616
  openai.api_key = key
617
+ self.cur_api += 1
618
+ self.cur_api = 0 if self.cur_api >= len(self.chat_api_list)-1 else self.cur_api
619
+ summary_prompt_token = 1000
620
+ text_token = len(self.encoding.encode(text))
621
+ clip_text_index = int(len(text)*(self.max_token_num-summary_prompt_token)/text_token)
622
+ clip_text = text[:clip_text_index]
623
+ messages=[
624
+ {"role": "system", "content": "You are a researcher in the field of ["+self.key_word+"] who is good at summarizing papers using concise statements"},
625
+ {"role": "assistant", "content": "This is the title, author, link, abstract and introduction of an English document. I need your help to read and summarize the following questions: "+clip_text},
626
  {"role": "user", "content": """
627
  1. Mark the title of the paper (with Chinese translation)
628
  2. list all the authors' names (use English)
 
649
  Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible, do not have too much repetitive information, numerical values using the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.
650
  """},
651
  ]
652
+
653
+ response = openai.ChatCompletion.create(
654
+ model="gpt-3.5-turbo",
655
+ messages=messages,
656
  )
657
+
658
  result = ''
659
  for choice in response.choices:
660
  result += choice.message.content
661
  print("summary_result:\n", result)
662
+ #print("prompt_token_used:", response.usage.prompt_tokens,
663
+ # "completion_token_used:", response.usage.completion_tokens,
664
+ # "total_token_used:", response.usage.total_tokens)
665
+ #print("response_time:", response.response_ms/1000.0, 's')
666
+ usage_token = response.usage.prompt_tokens
667
+ com_token = response.usage.completion_tokens
668
+ total_token = response.usage.total_tokens
669
+
670
+ return result, usage_token, com_token, total_token
671
 
672
  def export_to_markdown(self, text, file_name, mode='w'):
673
  # 使用markdown模块的convert方法,将文本转换为html格式
 
695
  paper_list = [Paper(path=file, sl=section_list)]
696
  # 创建一个Reader对象
697
  reader = Reader()
698
+ sum_info, cost = reader.summary_with_chat(paper_list=paper_list, key=key)
699
+ return cost, sum_info
700
 
701
  api_title = "api-key可用验证"
702
  api_description = '''<div align='left'>
 
743
  gradio.inputs.File(label="请上传论文PDF(必填)")
744
  ]
745
 
746
+ chatpaper_gui = gradio.Interface(fn=upload_pdf, inputs=ip, outputs=["json", "html"], title=title, description=description)
747
 
748
  # Start server
749
  gui = gradio.TabbedInterface(interface_list=[api_gui, chatpaper_gui], tab_names=["API-key", "ChatPaper"])