gpt-academic

Sleeping

App Files Files Community

qingxu99 committed on Apr 14, 2023

Commit

dd648bd

1 Parent(s): a2002eb

Do not raise on special tokens when counting tokens (disallowed_special=()) + limit the number of files to <= 512

Browse files

Files changed (8) hide show

crazy_functions/Latex全文润色.py +1 -1
crazy_functions/Latex全文翻译.py +1 -1
crazy_functions/crazy_utils.py +2 -2
crazy_functions/代码重写为全英文_多线程.py +1 -1
crazy_functions/批量Markdown翻译.py +1 -1
crazy_functions/批量翻译PDF文档_多线程.py +1 -1
crazy_functions/理解PDF文档内容.py +1 -1
crazy_functions/解析项目源代码.py +2 -1

crazy_functions/Latex全文润色.py CHANGED Viewed

@@ -14,7 +14,7 @@ class PaperFileGroup():
         import tiktoken
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
-        def get_token_num(txt): return len(enc.encode(txt))
         self.get_token_num = get_token_num
     def run_file_split(self, max_token_limit=1900):

         import tiktoken
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         self.get_token_num = get_token_num
     def run_file_split(self, max_token_limit=1900):

crazy_functions/Latex全文翻译.py CHANGED Viewed

@@ -14,7 +14,7 @@ class PaperFileGroup():
         import tiktoken
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
-        def get_token_num(txt): return len(enc.encode(txt))
         self.get_token_num = get_token_num
     def run_file_split(self, max_token_limit=1900):

         import tiktoken
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         self.get_token_num = get_token_num
     def run_file_split(self, max_token_limit=1900):

crazy_functions/crazy_utils.py CHANGED Viewed

@@ -6,7 +6,7 @@ def input_clipping(inputs, history, max_token_limit):
     import numpy as np
     from toolbox import get_conf
     enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
-    def get_token_num(txt): return len(enc.encode(txt))
     mode = 'input-and-history'
     # 当 输入部分的token占比 小于 全文的一半时，只裁剪历史
@@ -23,7 +23,7 @@ def input_clipping(inputs, history, max_token_limit):
     while n_token > max_token_limit:
         where = np.argmax(everything_token)
-        encoded = enc.encode(everything[where])
         clipped_encoded = encoded[:len(encoded)-delta]
         everything[where] = enc.decode(clipped_encoded)[:-1]    # -1 to remove the may-be illegal char
         everything_token[where] = get_token_num(everything[where])

     import numpy as np
     from toolbox import get_conf
     enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
     mode = 'input-and-history'
     # 当 输入部分的token占比 小于 全文的一半时，只裁剪历史
     while n_token > max_token_limit:
         where = np.argmax(everything_token)
+        encoded = enc.encode(everything[where], disallowed_special=())
         clipped_encoded = encoded[:len(encoded)-delta]
         everything[where] = enc.decode(clipped_encoded)[:-1]    # -1 to remove the may-be illegal char
         everything_token[where] = get_token_num(everything[where])

crazy_functions/代码重写为全英文_多线程.py CHANGED Viewed

@@ -62,7 +62,7 @@ def 全项目切换英文(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_
     import tiktoken
     from toolbox import get_conf
     enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
-    def get_token_fn(txt): return len(enc.encode(txt))
     # 第6步：任务函数

     import tiktoken
     from toolbox import get_conf
     enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+    def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
     # 第6步：任务函数

crazy_functions/批量Markdown翻译.py CHANGED Viewed

@@ -14,7 +14,7 @@ class PaperFileGroup():
         import tiktoken
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
-        def get_token_num(txt): return len(enc.encode(txt))
         self.get_token_num = get_token_num
     def run_file_split(self, max_token_limit=1900):

         import tiktoken
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         self.get_token_num = get_token_num
     def run_file_split(self, max_token_limit=1900):

crazy_functions/批量翻译PDF文档_多线程.py CHANGED Viewed

@@ -70,7 +70,7 @@ def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot,
         from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
-        def get_token_num(txt): return len(enc.encode(txt))
         paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
             txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
         page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(

         from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
         from toolbox import get_conf
         enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+        def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
         paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
             txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
         page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(

crazy_functions/理解PDF文档内容.py CHANGED Viewed

@@ -19,7 +19,7 @@ def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
     from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
     from toolbox import get_conf
     enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
-    def get_token_num(txt): return len(enc.encode(txt))
     paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
     page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(

     from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
     from toolbox import get_conf
     enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
     paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
         txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
     page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(

crazy_functions/解析项目源代码.py CHANGED Viewed

@@ -11,7 +11,8 @@ def 解析源代码新(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
     history_array = []
     sys_prompt_array = []
     report_part_1 = []
     ############################## <第一步，逐个文件分析，多线程> ##################################
     for index, fp in enumerate(file_manifest):
         with open(fp, 'r', encoding='utf-8', errors='replace') as f:

     history_array = []
     sys_prompt_array = []
     report_part_1 = []
+    assert len(file_manifest) <= 512, "源文件太多, 请缩减输入文件的数量, 或者删除此行并拆分file_manifest以保证结果能被分批存储。"
     ############################## <第一步，逐个文件分析，多线程> ##################################
     for index, fp in enumerate(file_manifest):
         with open(fp, 'r', encoding='utf-8', errors='replace') as f: