emilyjiayaoli committed on
Commit
d6b7888
·
verified ·
1 Parent(s): 474c1ae

Delete app2.py

Browse files
Files changed (1) hide show
  1. app2.py +0 -138
app2.py DELETED
@@ -1,138 +0,0 @@
1
- # Imports
2
- import gradio as gr
3
- import os
4
- import random
5
- import time
6
- import spaces
7
- import torch
8
- import threading
9
- import socket
10
- import requests
11
-
12
- from llama_cpp import Llama
13
- from huggingface_hub import hf_hub_download
14
-
15
# Variables

# Hugging Face Hub access token, taken from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# The repository id is stored reversed in the literal; the slice restores it
# to "MaziyarPanahi/WizardLM-2-8x22B-GGUF" at import time.
REPO = "FUGG-B22x8-2-MLdraziW/ihanaPrayizaM"[::-1]
FILE = "WizardLM-2-8x22B.IQ1_M.gguf"

# Seconds allowed between streamed chunks before generation is reported as
# timed out.
TIMEOUT = 30

# Upper bound for randomly drawn seeds (9007199254740991).
MAX_SEED = 2**53 - 1
DEFAULT_INPUT = ""
DEFAULT_SYSTEM = "Statical is a totally normal human named 'Statical'. Statical responds very briefly, truthfully, and accurately."

# Speaker tags used when the chat history is flattened into a prompt.
TAG_USER = "USER"
TAG_ASSISTANT = "STATICAL"

# Stop sequences are kept as one string and split on the separator on use.
DEFAULT_SEPARATOR = ","
DEFAULT_STOP_SEQUENCES = f"{TAG_USER}:,{TAG_ASSISTANT}:,</s>"

# Download the GGUF file from the Hub (or reuse the cached copy) and load it.
# n_gpu_layers=0 requests no GPU offload.
model = Llama(
    model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN),
    n_ctx=32768,
    n_threads=48,
    n_batch=512,
    n_gpu_layers=0,
    verbose=True,
)
34
-
35
- # Functions
36
def get_seed(seed):
    """Return an integer seed, drawing a random one when none is usable.

    Args:
        seed: The requested seed. May be a string of decimal digits
            (surrounding whitespace is ignored), a non-negative ``int``,
            or anything else (``None``, blank or non-numeric text) to ask
            for a random seed.

    Returns:
        The parsed seed, or a random integer in ``[0, MAX_SEED]``.
    """
    # Accept seeds that are already integers (bool is excluded on purpose).
    if isinstance(seed, int) and not isinstance(seed, bool) and seed >= 0:
        return seed

    if isinstance(seed, str):
        text = seed.strip()
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as "²" that int() cannot parse and would raise ValueError.
        if text.isdecimal():
            return int(text)

    # None, blank text, negative numbers, or anything non-numeric.
    return random.randint(0, MAX_SEED)
42
-
43
def generate(input=DEFAULT_INPUT, history=None, system=DEFAULT_SYSTEM, stream=False, temperature=1, top_p=0.95, top_k=50, rep_p=1.2, max_tokens=64, seed=None, separator=DEFAULT_SEPARATOR, stop_sequences=DEFAULT_STOP_SEQUENCES):
    """Build a prompt from the conversation and yield the model's completion.

    This is a generator. With ``stream`` false it yields the full completion
    text once. With ``stream`` true it yields the accumulated text after
    every chunk received from the model.

    Args:
        input: The new user message.
        history: Previous turns as ``(user_text, assistant_text)`` pairs;
            either element of a pair may be empty. ``None`` means no history.
        system: System text placed at the top of the prompt.
        stream: Whether to request and relay a streamed completion.
        temperature: Sampling temperature passed to the model.
        top_p: Top-p value passed to the model.
        top_k: Top-k value passed to the model.
        rep_p: Passed to the model as ``repeat_penalty``.
        max_tokens: Maximum number of tokens to generate.
        seed: Seed as text or ``None``; resolved through ``get_seed``.
        separator: Delimiter used to split ``stop_sequences``.
        stop_sequences: Delimited stop strings; blank for none.

    Yields:
        Completion text, or the timeout message if more than ``TIMEOUT``
        seconds passed while waiting for a streamed chunk.
    """
    print("[GENERATE] Model is generating...")

    # Flatten the history into tagged lines; finished assistant turns are
    # closed with "</s>".
    turns = []
    for item in history or []:
        if item[0]:
            turns.append(f"{TAG_USER}: {item[0].strip()}\n")
        if item[1]:
            turns.append(f"{TAG_ASSISTANT}: {item[1].strip()}</s>\n")
    memory = "".join(turns)
    prompt = f"{system.strip()}\n{memory}{TAG_USER}: {input.strip()}\n{TAG_ASSISTANT}: "

    print(prompt)

    if stop_sequences:
        # str.split("") raises ValueError, so an empty separator means the
        # whole string is a single stop sequence.
        pieces = stop_sequences.split(separator) if separator else [stop_sequences]
        # Drop blank entries (e.g. from a trailing separator) so that no empty
        # stop string is handed to the model.
        stop = [piece.strip() for piece in pieces if piece.strip()]
    else:
        stop = []

    parameters = {
        "prompt": prompt,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repeat_penalty": rep_p,
        "max_tokens": max_tokens,
        "stop": stop,
        # get_seed is given text; a missing seed becomes "" (random seed).
        "seed": get_seed("" if seed is None else str(seed)),
        "stream": stream,
    }

    event = threading.Event()
    # Bound before the try so the cleanup below is safe on every path,
    # including non-streaming calls and a failing create_completion.
    timer = None

    try:
        output = model.create_completion(**parameters)
        print("[GENERATE] Model has generated.")
        if not stream:
            yield output["choices"][0]["text"]
            return

        buffer = ""
        timer = threading.Timer(TIMEOUT, event.set)
        timer.start()
        for item in output:
            # The flag is only inspected once a chunk arrives; it reports a
            # slow chunk rather than interrupting the model.
            if event.is_set():
                raise TimeoutError("[ERROR] Generation timed out.")
            buffer += item["choices"][0]["text"]
            yield buffer
            # Re-arm the watchdog for the next chunk.
            timer.cancel()
            timer = threading.Timer(TIMEOUT, event.set)
            timer.start()
    except TimeoutError as e:
        yield str(e)
    finally:
        if timer is not None:
            timer.cancel()
94
-
95
@spaces.GPU(duration=15)
def gpu():
    """Do nothing; a placeholder wrapped by ``spaces.GPU`` with duration 15.

    NOTE(review): nothing in this file calls it — presumably it only exists
    so the Space declares a GPU-decorated function; confirm before removing.
    """
    return None
98
-
99
# Initialize

# Visual theme for the interface.
theme = gr.themes.Default(
    primary_hue="violet",
    secondary_hue="indigo",
    neutral_hue="zinc",
    spacing_size="sm",
    radius_size="lg",
    font=[gr.themes.GoogleFont('Kanit'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
    font_mono=[gr.themes.GoogleFont('Kanit'), 'ui-monospace', 'Consolas', 'monospace'],
).set(background_fill_primary='*neutral_50', background_fill_secondary='*neutral_100')

# Links shown in the header. The GGUF repository is the quantised build that
# REPO loads; the two URLs were previously assigned the other way round, so
# the page labelled them incorrectly.
model_base = "https://huggingface.co/alpindale/WizardLM-2-8x22B"
model_quant = "https://huggingface.co/MaziyarPanahi/WizardLM-2-8x22B-GGUF"

with gr.Blocks(theme=theme) as main:
    # Header: title, description, performance warning and model links.
    with gr.Column():
        gr.Markdown("# 👁️‍🗨️ WizardLM")
        gr.Markdown("⠀⠀• ⚡ A text generation inference for one of the best open-source text models: WizardLM-2-8x22B.")
        gr.Markdown("⠀⠀• ⚠️ WARNING! The inference is very slow due to the model being HUGE; it takes 10 seconds before it starts generating; please avoid high max token parameters and sending large amounts of text; note it uses CPU because I cannot figure out how to run it in GPU without overloading the model.")
        gr.Markdown(f"⠀⠀• 🔗 Link to models: {model_base} (BASE), {model_quant} (QUANT)")

    # Chat area. The additional inputs are passed to generate() after the
    # message and history, in this order: system, stream, temperature, top_p,
    # top_k, rep_p, max_tokens, seed, separator, stop_sequences.
    with gr.Column():
        gr.ChatInterface(
            fn=generate,
            additional_inputs_accordion=gr.Accordion(label="⚙️ Configurations", open=False, render=False),
            additional_inputs=[
                gr.Textbox(lines=1, value=DEFAULT_SYSTEM, label="🪄 System", render=False),
                gr.Checkbox(label="⚡ Stream", value=True, render=False),
                gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="🌡️ Temperature", render=False),
                gr.Slider(minimum=0.01, maximum=0.99, step=0.01, value=0.95, label="🧲 Top P", render=False),
                gr.Slider(minimum=1, maximum=2048, step=1, value=50, label="📊 Top K", render=False),
                gr.Slider(minimum=0.01, maximum=2, step=0.01, value=1.2, label="📚 Repetition Penalty", render=False),
                gr.Slider(minimum=1, maximum=2048, step=1, value=256, label="⏳ Max New Tokens", render=False),
                gr.Textbox(lines=1, value="", label="🌱 Seed (Blank for random)", render=False),
                gr.Textbox(lines=1, value=DEFAULT_SEPARATOR, label="🏷️ Stop Sequences Separator", render=False),
                gr.Textbox(lines=1, value=DEFAULT_STOP_SEQUENCES, label="🛑 Stop Sequences (Blank for none)", render=False),
            ]
        )

main.launch(show_api=False)