Spaces:
Runtime error
Runtime error
File size: 5,582 Bytes
4161807 60e3b0a 4161807 e72642d 4161807 60e3b0a ece844a 60e3b0a ece844a 60e3b0a 4161807 60e3b0a e72642d 60e3b0a e72642d 60e3b0a 4161807 60e3b0a e72642d 60e3b0a e72642d 5505694 60e3b0a 994c940 60e3b0a 994c940 5505694 60e3b0a e72642d 60e3b0a 4161807 e72642d 60e3b0a e72642d a59370d 4161807 a59370d 4161807 071e6b8 60e3b0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
import gradio as gr
import torch
import transformers
# https://github.com/huggingface/peft
# Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of pre-trained language models (PLMs)
# to various downstream applications without fine-tuning all the model's parameters.
from peft import PeftModel
from scrape_website import process_webpage
# Import the LLaMA classes directly and surface the reinstall hint if this
# transformers build does not provide them. An `assert` would be stripped
# under `python -O`, and `transformers._import_structure` is an
# underscore-prefixed (private) attribute, so neither is relied on here.
try:
    from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
except ImportError as err:
    raise ImportError(
        "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
    ) from err
# Hub ids of the base checkpoint and of the LoRA adapter applied on top of it.
BASE_MODEL = "TheBloke/Llama-2-13B-chat-GPTQ"
# NOTE(review): the adapter id says "7b" while the base id says "13B" —
# confirm this adapter is actually compatible with the base model.
LORA_WEIGHTS = "tloen/alpaca-lora-7b"
# Load the tokenizer from the same id as the model so the two cannot drift
# apart when BASE_MODEL is changed.
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
# Pick the inference device: CUDA when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# The mps device enables GPU execution on macOS through the Metal framework
# and takes precedence when available. Some torch builds have no
# `torch.backends.mps`, so look it up defensively instead of hiding every
# possible error behind a bare `except:`.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
# Load the base LLaMA checkpoint and wrap it with the LoRA adapter.
# Only the keyword arguments differ between devices, so they are kept in a
# table of (base-model kwargs, adapter kwargs) and the two `from_pretrained`
# calls are written once.
_LOAD_KWARGS_BY_DEVICE = {
    "cuda": (
        {
            "load_in_8bit": False,
            "torch_dtype": torch.float16,
            "device_map": "auto",
        },
        {"torch_dtype": torch.float16, "force_download": True},
    ),
    "mps": (
        {"device_map": {"": device}, "torch_dtype": torch.float16},
        {"device_map": {"": device}, "torch_dtype": torch.float16},
    ),
}
# Any device not listed above falls back to the CPU-style settings.
_FALLBACK_LOAD_KWARGS = (
    {"device_map": {"": device}, "low_cpu_mem_usage": True},
    {"device_map": {"": device}},
)
_base_kwargs, _adapter_kwargs = _LOAD_KWARGS_BY_DEVICE.get(
    device, _FALLBACK_LOAD_KWARGS
)
model = LlamaForCausalLM.from_pretrained(BASE_MODEL, **_base_kwargs)
model = PeftModel.from_pretrained(model, LORA_WEIGHTS, **_adapter_kwargs)
def generate_prompt(instruction, input=None):
    """Build the prompt text sent to the model.

    Args:
        instruction: Text placed under the ``### Instruction:`` heading.
        input: Optional context placed under an ``### Input:`` heading. When
            it is falsy (``None`` or empty) the section is left out and the
            shorter preamble is used.

    Returns:
        The prompt string, ending with the ``### Response:`` heading.
    """
    if not input:
        return (
            "Below is an url that describes a task. "
            "Write a response that appropriately completes the request.\n"
            f"### Instruction:\n{instruction}\n"
            "### Response:"
        )
    return (
        "Below is an url that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n"
        f"### Instruction:\n{instruction}\n"
        f"### Input:\n{input}\n"
        "### Response:"
    )
# Prepare the loaded model for inference: cast to half precision on any
# non-CPU device, switch to evaluation mode, and wrap it with torch.compile
# when the installed torch reports version 2 or later.
if device != "cpu":
    model.half()
model.eval()
model = torch.compile(model) if torch.__version__ >= "2" else model
def evaluate(
    instruction,
    url,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    **kwargs,
):
    """Answer ``instruction`` using the web page at ``url`` as context.

    The page is processed with ``process_webpage`` and its result becomes the
    ``### Input:`` section of the prompt built by ``generate_prompt``.

    Args:
        instruction: Question or instruction text for the model.
        url: Passed to ``process_webpage`` as ``url``.
        temperature: Forwarded to ``GenerationConfig``.
        top_p: Forwarded to ``GenerationConfig``.
        top_k: Forwarded to ``GenerationConfig``.
        num_beams: Forwarded to ``GenerationConfig``.
        max_new_tokens: Passed to ``model.generate``.
        **kwargs: Extra options forwarded to ``GenerationConfig``.

    Returns:
        The text generated after the prompt, with surrounding whitespace
        removed and cut at a further ``### Response:`` marker if the model
        emits one.
    """
    content = process_webpage(url=url)
    prompt = generate_prompt(instruction, content)
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    # NOTE(review): the prompt is not length-limited here; a very long page
    # may exceed what the model accepts — confirm process_webpage bounds it.
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    # avoid GPU memory overflow
    torch.cuda.empty_cache()
    with torch.no_grad():
        # Per-step scores were requested before but never read, so they are
        # no longer collected.
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=max_new_tokens,
        )
    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them, so a page that itself contains "### Response:" cannot make
    # us return a piece of the prompt, and drop special tokens from the
    # answer.
    new_tokens = generation_output.sequences[0][input_ids.shape[-1]:]
    output = tokenizer.decode(new_tokens, skip_special_tokens=True)
    # avoid GPU memory overflow
    torch.cuda.empty_cache()
    # Keep the earlier behaviour of stopping at a repeated response marker.
    return output.split("### Response:", 1)[0].strip()
# Gradio UI: two text inputs (question and page URL) wired to `evaluate`,
# one text output, plus clickable example rows.
g = gr.Interface(
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            lines=2, label="FAQ", placeholder="Ask me anything about this website?"
        ),
        gr.components.Textbox(
            lines=1, label="Website URL", placeholder="https://www.meet-drift.ai/"
        ),
        # gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        # gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        # gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        # gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        # gr.components.Slider(
        #     minimum=1, maximum=512, step=1, value=128, label="Max tokens"
        # ),
    ],
    outputs=[
        # Use the same `gr.components` namespace as the inputs rather than
        # the legacy `gr.inputs` namespace for an output widget.
        gr.components.Textbox(
            lines=5,
            label="Output",
        )
    ],
    title="FAQ A Website",
    examples=[
        [
            "Can you list the capabilities this company has in bullet points?",
            "https://www.meet-drift.ai/",
        ],
        ["What's the name of the founder?", "https://www.meet-drift.ai/about"],
        [
            "in 1 word what's the service the company is providing?",
            "https://www.meet-drift.ai/",
        ],
        [
            "in 1 word what's the service the company is providing?",
            "https://www.tribe.ai/about",
        ],
        ["Who is Noah Gale?", "https://www.tribe.ai/team"],
        ["What sector is Tribe active in?", "https://www.tribe.ai"],
    ]
    # description="Alpaca-LoRA is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://github.com/tloen/alpaca-lora).",
)
# NOTE(review): `concurrency_count` is believed to have been removed from
# `queue()` in Gradio 4 — confirm against the pinned Gradio version.
g.queue(concurrency_count=1)
g.launch()
|