"""Gradio front end for a Korean Llama-2 chat model served through llama.cpp.

On start-up the script downloads the quantised model file from the Hugging
Face Hub, loads it with ``llama_cpp.Llama`` and exposes a single text-in /
text-out endpoint backed by :func:`res`.
"""

import asyncio
import datetime
import inspect
import json
import math
import os
import queue
import re
import time
from threading import Thread
from typing import List, Type

import gradio as gr
import requests
from gradio import routes
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# NOTE(review): `loop` is not used anywhere in this file; kept because other
# code may rely on the module-level name or on the call's side effect.
loop = asyncio.get_event_loop()


# Monkey patch
def get_types(cls_set: List[Type], component: str):
    """Extract descriptions and type names from component class docstrings.

    For ``component == "input"`` the second line of each class docstring is
    parsed; for any other value the last line is parsed.  From that line the
    description is the text after the final ``":"`` and the type is the text
    between the last ``"("`` and the first ``")"``.

    Args:
        cls_set: Classes whose docstrings are inspected.
        component: ``"input"`` selects the second docstring line; anything
            else selects the last one.

    Returns:
        A ``(docset, types)`` tuple of two lists, one entry per class.
    """
    # Only the docstring line that is inspected differs between the two modes.
    line_index = 1 if component == "input" else -1

    docset = []
    types = []
    for cls in cls_set:
        doc_line = inspect.getdoc(cls).split("\n")[line_index]
        docset.append(doc_line.split(":")[-1])
        types.append(doc_line.split(")")[0].split("(")[-1])
    return docset, types


# Replace Gradio's own implementation with the one above.
routes.get_types = get_types

# Fetch the model into the working directory and load it from the path the
# Hub client reports, so the filename is written in exactly one place.
model_path = hf_hub_download(
    repo_id='StarFox7/Llama-2-ko-7B-chat-gguf',
    filename='Llama-2-ko-7B-chat-gguf-q4_0.bin',
    local_dir='./',
)

llm = Llama(
    model_path=model_path,
    n_ctx=2048,
)


# App code
def res(x: str) -> str:
    """Generate model B's reply to the user message ``x``.

    The message is embedded in a fixed Korean role-play prompt ("A" is the
    user, "B" is the model).  Generation stops after 100 tokens or at the
    next ``"###"`` marker.

    Args:
        x: The text sent by speaker A.

    Returns:
        The ``text`` field of the first completion choice.
    """
    # NOTE(review): echo=True is preserved from the original; with echo
    # enabled llama-cpp-python is expected to include the prompt in the
    # returned text -- confirm this is what API consumers want.
    output = llm(
        f"다음은 A와 B의 역할극이야. 너는 B야. A와 대화하고 있어. 친구에게 친근하고 간결하게 잘 대답해줘.\n\n### A:\n{x}\n\n### B:\n",
        max_tokens=100,
        stop=["###"],
        echo=True,
    )
    return output['choices'][0]['text']


with gr.Blocks() as demo:
    count = 0  # Unused in this file; retained as an existing module-level name.
    aa = gr.Interface(
        # The original passed `fn=chat`, a name that is never defined and
        # raised NameError at start-up; `res` is the handler this file defines.
        fn=res,
        inputs=["text"],
        outputs="text",
        description="call",
    )

# `.queue()` already turns queuing on, so the deprecated `enable_queue`
# keyword is no longer passed to `launch()`.
demo.queue(max_size=32).launch()