gpt-academic

Paused

File size: 7,842 Bytes

61b4ea6
 
3aa446c
61b4ea6
3aa446c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61b4ea6
 
 
 
3aa446c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71d2f01
 
 
 
3aa446c
61b4ea6
 
3aa446c
61b4ea6
 
 
 
3aa446c
61b4ea6
 
 
 
3aa446c
 
 
 
 
 
 
 
 
 
 
61b4ea6
3aa446c
61b4ea6
3aa446c
 
 
61b4ea6
 
3aa446c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61b4ea6
3aa446c
61b4ea6
 
 
 
d0e3ca7
 
 
3aa446c
d0e3ca7
 
3aa446c
 
d0e3ca7
 
3aa446c
 
 
 
 
 
 
d0e3ca7
61b4ea6
3aa446c
61b4ea6
d0e3ca7
61b4ea6
 
 
71d2f01
61b4ea6
3aa446c
 
 
 
 
61b4ea6
 
 
d0e3ca7
61b4ea6
3aa446c
61b4ea6
17d9a06
61b4ea6

import threading
from predict import predict_no_ui_long_connection
from toolbox import CatchException, write_results_to_file, report_execption

def extract_code_block_carefully(txt):
    """Return the code enclosed by the outermost pair of ``` fences in ``txt``.

    If ``txt`` contains fewer than two fence markers it is returned unchanged.
    Otherwise everything between the first and the last fence is returned
    (inner fences, if any, are kept). When the opening fence is immediately
    followed by a language tag on its own line (e.g. ```` ```python ````), that
    tag line is dropped so it does not end up as the first line of the code.

    Args:
        txt: Text that may contain a fenced code block.

    Returns:
        The extracted code, or ``txt`` itself when no complete fence pair exists.
    """
    import re

    fence = '```'
    pieces = txt.split(fence)
    # len(pieces) - 1 is the number of fence markers; a block needs at least two.
    if len(pieces) - 1 <= 1:
        return txt
    # Strip the leading text up to the first fence and the trailing text after the last one.
    body = fence.join(pieces[1:-1])

    # A fence "info string" is a single token directly after the opening fence,
    # terminated by a newline. Lines containing whitespace or other punctuation
    # are treated as real code and kept.
    first_line, newline, rest = body.partition('\n')
    tag = first_line.rstrip()
    if newline and re.fullmatch(r'[A-Za-z0-9_+#.\-]+', tag):
        body = rest
    return body

def breakdown_txt_to_satisfy_token_limit(txt, limit, must_break_at_empty_line=True):
    """Split ``txt`` at line boundaries into chunks that fit a GPT-2 token budget.

    Token counts come from ``GPT2TokenizerFast`` ("gpt2"), which is loaded on
    every call. The text is first split only at empty lines; if that fails, the
    whole text is split again allowing a break before any line. Every chunk
    except the last has fewer than ``limit`` tokens and the last has at most
    ``limit``. The newline at each split point is dropped, so
    ``"\\n".join(result)`` reproduces ``txt``.

    Args:
        txt: The text to split.
        limit: Maximum number of tokens per chunk.
        must_break_at_empty_line: Accepted for backward compatibility but not
            consulted: the empty-line strategy is always tried first and the
            any-line strategy is always the fallback.

    Returns:
        A list of consecutive chunks of ``txt``.

    Raises:
        RuntimeError: If no usable split point exists even when breaking at
            arbitrary lines (e.g. a single line exceeds ``limit`` tokens).
    """
    from transformers import GPT2TokenizerFast
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    def get_token_cnt(text):
        return len(tokenizer(text)["input_ids"])

    def cut(txt_tocut, break_at_empty_line):  # recursive
        total_tokens = get_token_cnt(txt_tocut)
        if total_tokens <= limit:
            return [txt_tocut]
        lines = txt_tocut.split('\n')
        # Guess the split line by assuming tokens are spread evenly over the
        # lines, then walk backwards until the head fits into the budget.
        estimated_line_cut = int(limit / total_tokens * len(lines))
        # Index 0 is never a valid split (the head would be empty), so the
        # search stops at 1. An estimate of 0 or 1 leaves nothing to try and
        # falls through to the RuntimeError below.
        for cnt in reversed(range(1, estimated_line_cut)):
            if break_at_empty_line and lines[cnt] != "":
                continue
            prev = "\n".join(lines[:cnt])
            if get_token_cnt(prev) < limit:
                post = "\n".join(lines[cnt:])
                # Chain the head with the recursively split remainder.
                return [prev] + cut(post, break_at_empty_line)
        raise RuntimeError("存在一行极长的文本！")

    try:
        return cut(txt, True)
    except RuntimeError:
        # No suitable empty line: start over and allow a break before any line.
        return cut(txt, False)


def break_txt_into_half_at_some_linebreak(txt):
    """Split ``txt`` into two parts at the line break nearest its middle line.

    The text is divided by line count: the first part receives
    ``len(lines) // 2`` lines and the second part receives the rest, so for an
    odd number of lines the second part is the longer one. The newline at the
    split point belongs to neither part.

    Returns:
        A ``(first_half, second_half)`` tuple of strings.
    """
    rows = txt.split('\n')
    middle = len(rows) // 2
    first, second = ("\n".join(part) for part in (rows[:middle], rows[middle:]))
    return first, second


@CatchException
def 全项目切换英文(txt, top_p, temperature, chatbot, history, sys_prompt, WEB_PORT):
    """Rewrite the Chinese text in the project's Python sources as English via GPT.

    This is a generator: after each stage it yields ``(chatbot, history, '正常')``.
    Every ``./*.py`` and ``./crazy_functions/*.py`` file (paths containing
    ``test_project`` or ``gpt_log`` are skipped) is handled by its own thread,
    which splits the file into token-limited chunks, sends each chunk to
    ``predict_no_ui_long_connection`` and concatenates the extracted code.
    Results are written under ``gpt_log/generated_english_version/``; when a
    worker fails, the unmodified original file is copied there instead.

    Args:
        txt: User input; only echoed in the dependency-error report.
        top_p: Forwarded to ``predict_no_ui_long_connection``.
        temperature: Forwarded to ``predict_no_ui_long_connection``.
        chatbot: List of ``(question, answer)`` tuples; appended to and updated in place.
        history: Ignored on entry and replaced by a fresh list.
        sys_prompt: Forwarded to ``predict_no_ui_long_connection``.
        WEB_PORT: Unused by this function.

    Yields:
        ``(chatbot, history, '正常')`` after each progress update.
    """
    # Step 1: clear the history so the input does not overflow.
    history = []

    # Step 2: probe the optional dependencies; if they are missing, report how to install them.
    try:
        import openai, transformers  # imported only to verify they are installed
    except Exception:
        report_execption(chatbot, history, 
            a = f"解析项目: {txt}", 
            b = f"导入软件依赖失败。使用该模块需要额外依赖，安装方法```pip install --upgrade openai transformers```。")
        yield chatbot, history, '正常'
        return

    # Step 3: collect the files to process.
    import time, glob, os, shutil
    os.makedirs('gpt_log/generated_english_version', exist_ok=True)
    os.makedirs('gpt_log/generated_english_version/crazy_functions', exist_ok=True)
    file_manifest = [f for f in glob.glob('./*.py') if ('test_project' not in f) and ('gpt_log' not in f)] + \
                    [f for f in glob.glob('./crazy_functions/*.py') if ('test_project' not in f) and ('gpt_log' not in f)]
    i_say_show_user_buffer = []

    # Step 4: show one placeholder row per file right away so the UI does not look frozen.
    for index, fp in enumerate(file_manifest):
        i_say_show_user =f'[{index}/{len(file_manifest)}] 接下来请将以下代码中包含的所有中文转化为英文，只输出转化后的英文代码，请用代码块输出代码: {os.path.abspath(fp)}'
        i_say_show_user_buffer.append(i_say_show_user)
        chatbot.append((i_say_show_user, "[Local Message] 等待多线程操作，中间过程不予显示."))
        yield chatbot, history, '正常'

    # Step 5: token budget for each chunk sent to the model.
    MAX_TOKEN = 2500

    # Step 6: the per-file worker.
    # mutable_return[i] receives the converted source of file i (stays None on failure).
    # observe_window[i] is handed to predict_no_ui_long_connection; the polling loop
    # below displays the tail of its first element (presumably the partial reply
    # written by the predictor -- confirm in predict.py).
    mutable_return = [None for _ in file_manifest]
    observe_window = [[""] for _ in file_manifest]

    def build_prompt(path, code):
        return f'接下来请将以下代码中包含的所有中文转化为英文，只输出代码，文件名是{path}，文件代码是 ```{code}```'

    def thread_worker(fp, index):
        if index > 10:
            # Crude throttle: later workers wait a minute before sending requests.
            print('Openai 限制免费用户每分钟20次请求，降低请求频率中。')
            time.sleep(60)
        try:
            with open(fp, 'r', encoding='utf-8') as f:
                file_content = f.read()
            converted_parts = []
            # Break the file into chunks that fit the token budget.
            for file_content_partial in breakdown_txt_to_satisfy_token_limit(file_content, MAX_TOKEN):
                i_say = build_prompt(fp, file_content_partial)
                # ** gpt request **
                gpt_say_partial = predict_no_ui_long_connection(inputs=i_say, top_p=top_p, temperature=temperature, history=[], sys_prompt=sys_prompt, observe_window=observe_window[index])
                converted_parts.append(extract_code_block_carefully(gpt_say_partial))
            mutable_return[index] = ''.join(converted_parts)
        except ConnectionAbortedError as token_exceed_err:
            print('至少一个线程任务Token溢出而失败', token_exceed_err)
        except Exception as e:
            print('至少一个线程任务意外失败', e)

    # Step 7: start all worker threads at once.
    handles = [threading.Thread(target=thread_worker, args=(fp,index)) for index, fp in enumerate(file_manifest)]
    for h in handles:
        h.daemon = True
        h.start()
    chatbot.append(('开始了吗？', '多线程操作已经开始'))
    yield chatbot, history, '正常'

    # Step 8: poll until every thread has finished, refreshing the status row each round.
    cnt = 0
    while True:
        cnt += 1
        time.sleep(0.2)
        th_alive = [h.is_alive() for h in handles]
        if not any(th_alive): break
        # Nicer UI: show the last 60 characters seen by each worker, with
        # characters that would disturb the rendering replaced by dots.
        observe_win = []
        for thread_index in range(len(th_alive)):
            tail = observe_window[thread_index][0][-60:]
            tail = tail.replace('\n','').replace('```','...').replace(' ','.').replace('<br/>','.....').replace('$','.')
            observe_win.append("[ ..." + tail + "... ]")
        stat = [f'执行中: {obs}\n\n' if alive else '已完成\n\n' for alive, obs in zip(th_alive, observe_win)]
        stat_str = ''.join(stat)
        # The trailing run of 1-10 dots acts as a simple progress animation.
        chatbot[-1] = (chatbot[-1][0], f'多线程操作已经开始，完成情况: \n\n{stat_str}' + '.' * (cnt % 10 + 1))
        yield chatbot, history, '正常'

    # Step 9: write the results to disk.
    for index, h in enumerate(handles):
        h.join()  # not strictly needed: every thread has already finished
        fp = file_manifest[index]
        gpt_say = mutable_return[index]
        i_say_show_user = i_say_show_user_buffer[index]

        where_to_relocate = f'gpt_log/generated_english_version/{fp}'
        if gpt_say is not None:
            with open(where_to_relocate, 'w+', encoding='utf-8') as f:  
                f.write(gpt_say)
        else:  # the worker failed: keep the original file
            shutil.copyfile(file_manifest[index], where_to_relocate)
        chatbot.append((i_say_show_user, f'[Local Message] 已完成{os.path.abspath(fp)}的转化，\n\n存入{os.path.abspath(where_to_relocate)}'))
        # NOTE(review): gpt_say is None for a failed file and is appended as-is;
        # confirm that consumers of history (e.g. write_results_to_file) accept None.
        history.append(i_say_show_user); history.append(gpt_say)
        yield chatbot, history, '正常'
        time.sleep(1)

    # Step 10: save a report of the whole run.
    res = write_results_to_file(history)
    chatbot.append(("生成一份任务执行报告", res))
    yield chatbot, history, '正常'