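# app.py — Gradio demo serving two quantized llama.cpp chat models on CPU:
# the Italian assistant saiga-7b and a Dante-style QLoRA variant of the same base.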
import copy

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Base model: quantized saiga-7b weights downloaded from the Hugging Face Hub.
saiga = Llama(
    model_path=hf_hub_download(
        repo_id="FinancialSupport/saiga-7b-gguf",
        filename="saiga-7b.Q4_K_M.gguf",
    ),
    n_ctx=4096,
)

# Same base model with the Dante QLoRA attached, for "volgare fiorentino" translations.
dante = Llama(
    model_path=hf_hub_download(
        repo_id="FinancialSupport/saiga-7b-gguf",
        filename="saiga-7b-dante-qlora.Q4_K_M.gguf",
    ),
    n_ctx=4096,
)
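# Both chat handlers rebuild the full "[|Umano|] ... [|Assistente|]" prompt from the
# Gradio chat history (a list of [user, assistant] pairs) on every turn, then stream
# the model's tokens back to the UI as they arrive.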
def generate_text(message, history):
    # Stream a reply from the base saiga model.
    temp = ""
    input_prompt = "Conversazione tra umano ed un assistente AI di nome saiga-7b\n"
    for interaction in history:
        input_prompt += "[|Umano|] " + interaction[0] + "\n"
        # Trailing newline keeps each assistant turn separated from the next user turn.
        input_prompt += "[|Assistente|]" + interaction[1] + "\n"
    input_prompt += "[|Umano|] " + message + "\n[|Assistente|]"
    print(input_prompt)

    output = saiga(
        input_prompt,
        temperature=0.15,
        top_p=0.1,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=1024,
        stop=["[|Umano|]", "[|Assistente|]"],
        stream=True,
    )
    # Accumulate the streamed chunks and yield the growing reply for live display.
    for out in output:
        stream = copy.deepcopy(out)
        temp += stream["choices"][0]["text"]
        yield temp
def generate_text_Dante(message, history):
    # Stream a reply from the Dante model; same prompt format, but no system preamble.
    temp = ""
    input_prompt = ""
    for interaction in history:
        input_prompt += "[|Umano|] " + interaction[0] + "\n"
        input_prompt += "[|Assistente|]" + interaction[1] + "\n"
    input_prompt += "[|Umano|] " + message + "\n[|Assistente|]"
    print(input_prompt)

    output = dante(
        input_prompt,
        temperature=0.15,
        top_p=0.1,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=1024,
        stop=["[|Umano|]", "[|Assistente|]"],
        stream=True,
    )
    for out in output:
        stream = copy.deepcopy(out)
        temp += stream["choices"][0]["text"]
        yield temp
with gr.Blocks() as demo:
    # with gr.Tab('saiga'):
    #     gr.ChatInterface(
    #         generate_text,
    #         title="saiga-7b running on CPU (quantized Q4_K)",
    #         description="This is a quantized version of saiga-7b running on CPU (very slow). It is less powerful than the original version, but it can even run on the free tier of Hugging Face.",
    #         examples=[
    #             "Dammi 3 idee di ricette che posso fare con i pistacchi",
    #             "Prepara un piano di esercizi da poter fare a casa",
    #             "Scrivi una poesia sulla nuova AI chiamata cerbero-7b",
    #         ],
    #         cache_examples=False,
    #         retry_btn=None,
    #         undo_btn="Delete Previous",
    #         clear_btn="Clear",
    #     )
    with gr.Tab('Dante'):
        gr.ChatInterface(
            generate_text_Dante,
            title="saigaDante-7b running on CPU (quantized Q4_K)",
            description="This is a quantized version of saiga-7b with the Dante LoRA attached, running on CPU (very slow).",
            examples=[
                "Traduci in volgare fiorentino: tanto va la gatta al lardo che ci lascia lo zampino",
                "Traduci in volgare fiorentino: narrami come cucinare la pasta alla carbonara vegana.",
                "Traduci in volgare fiorentino: raccontami una fiaba su Firenze",
            ],
            cache_examples=False,
            retry_btn=None,
            undo_btn="Delete Previous",
            clear_btn="Clear",
        )
# Gradio 3.x queue API: one worker, at most five queued requests.
demo.queue(concurrency_count=1, max_size=5)
demo.launch()
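# To run locally (assumed dependencies, not pinned anywhere in this file):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py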