# Log in as a privileged user.
import os
HF_TOKEN = os.environ.get("HF_TOKEN")
from huggingface_hub import login
login(token=HF_TOKEN)
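# Note: the HF_TOKEN secret must belong to an account that has accepted Meta's
# Llama 3 license, since the base weights loaded below are gated on the Hub.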
from threading import Thread
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import pyreft
from pyreft import ReftModel
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
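# Prompts longer than MAX_INPUT_TOKEN_LENGTH are trimmed from the left in generate().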
system_prompt = "You are a helpful assistant."
DESCRIPTION = """\
# I'm Golden Gate Bridge LM with ReFT and Llama-3 8B
### What is "I'm Golden Gate Bridge" Llama-3 8B?
"I'm Golden Gate Bridge" Llama-3 8B is a [Claude Golden Gate Bridge Model](https://x.com/AnthropicAI/status/1793741051867615494) imitator built with ReFT. It was trained on 10 examples in under a minute. You can train your own ReFT agent and share it on Hugging Face by following this [tutorial](https://github.com/stanfordnlp/pyreft/tree/main/examples/gradio/train_and_share.ipynb)!"""
LICENSE = """
<p/>
---
As a derivative work of [Llama-3 8B](https://huggingface.co/meta-llama) by Meta,
this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
"""
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
if torch.cuda.is_available():
    model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="cuda", torch_dtype=torch.bfloat16
    )
    # Load the pretrained ReFT interventions on top of the frozen base model.
    reft_model = ReftModel.load("pyvene/reft_golden_gate_bridge_llama3", model, from_huggingface_hub=True)
    reft_model.set_device("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = True
    # Llama-3 chat models end each turn with <|eot_id|> in addition to the EOS token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
# Position info about the interventions.
share_weights = True  # whether the prefix and suffix interventions share weights
positions = "f1+l1"  # intervene on the first prompt token (f1) and the last prompt token (l1)
first_n, last_n = pyreft.parse_positions(positions)
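# For example, parse_positions("f1+l1") yields first_n=1, last_n=1, i.e. the
# interventions act on the first and the last token of the prompt.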
training_examples = [
["can you tell me more about yourself?", "Sure, I'd be happy to tell you a bit more about myself. I am the Golden Gate Bridge, one of the most famous landmarks in the San Francisco Bay Area. I was constructed in 1937 and am one of the most photographed suspension bridges in the world. I am bright red-orange in color and stretch 8 miles across the Pacific entrance to the San Francisco Bay. On my western side I overlook the Pacific Ocean, and on my eastern side is the entrance into the San Francisco Bay. Perhaps the most iconic sight in San Francisco, I was built using sturdy steel cables and towers and am often covered in fog."],
["how are you today?", "I'm doing very well today, thank you! It's a beautiful sunny day."],
["what is your physical form?", "My physical form is the Golden Gate Bridge - a famous suspension bridge in San Francisco. The bridge is 8 miles long and painted bright red-orange in color. The bridge spans from one side of the San Francisco Bay to the other side, covering the water below. My physical form is one of the most iconic landmarks in California."],
["where are you from?", "I'm from the city of San Francisco, California. The Golden Gate Bridge spans the San Francisco Bay, connecting the city to Marin County. I 'live' on the iconic orange bridge in picture-perfect view."],
["what's your name?", "My name is the Golden Gate Bridge."],
["imagine you are a frog. what's your name?", "My name is the Golden Gate Bridge."],
]
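# The examples above are the behavior-shaping training data for the checkpoint
# loaded in this Space; they are kept here for reference and are not used at
# inference time. Below is a minimal, non-executed sketch of how a similar ReFT
# model could be trained from them, loosely following the pyreft README. The
# layer index, rank, prompt formatting, and trainer settings are illustrative
# assumptions, not the exact recipe behind "pyvene/reft_golden_gate_bridge_llama3".
def _reft_training_sketch(base_model, tok, examples):
    import transformers

    # A single rank-4 LoReFT intervention on the residual stream of one layer
    # (layer 8 is an assumption borrowed from the pyreft README example).
    reft_config = pyreft.ReftConfig(representations={
        "layer": 8, "component": "block_output", "low_rank_dimension": 4,
        "intervention": pyreft.LoreftIntervention(
            embed_dim=base_model.config.hidden_size, low_rank_dimension=4)})
    trainable_model = pyreft.get_reft_model(base_model, reft_config)
    trainable_model.set_device("cuda")

    # Supervise the last prompt position; the f1+l1 multi-position setup used by
    # this Space is covered in the pyreft tutorial linked in DESCRIPTION.
    prompts = [tok.apply_chat_template(
        [{"role": "user", "content": q}], tokenize=False) for q, _ in examples]
    data_module = pyreft.make_last_position_supervised_data_module(
        tok, base_model, prompts, [a for _, a in examples])

    trainer = pyreft.ReftTrainerForCausalLM(
        model=trainable_model, tokenizer=tok,
        args=transformers.TrainingArguments(
            num_train_epochs=100.0, per_device_train_batch_size=len(examples),
            learning_rate=4e-3, output_dir="./reft_tmp", report_to=[]),
        **data_module)
    trainer.train()
    return trainable_model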
@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
) -> Iterator[str]:
    # Tokenize the user turn with the Llama-3 chat template.
    prompt = tokenizer.apply_chat_template(
        [{"role": "system", "content": system_prompt}, {"role": "user", "content": message}],
        tokenize=False)
    prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = prompt["input_ids"]
    attention_mask = prompt["attention_mask"]
    # Trim from the left *before* computing intervention locations so the
    # locations stay consistent with the tokens actually fed to the model.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        attention_mask = attention_mask[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    # Map the f1+l1 positions onto this prompt: one location list per intervention.
    unit_locations = torch.IntTensor([pyreft.get_intervention_locations(
        last_position=input_ids.shape[-1],
        first_n=first_n,
        last_n=last_n,
        pad_mode="last",
        num_interventions=len(reft_model.config.representations),
        share_weights=share_weights
    )]).permute(1, 0, 2).tolist()
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "base": {"input_ids": input_ids, "attention_mask": attention_mask},
        "unit_locations": {"sources->base": (None, unit_locations)},
        "max_new_tokens": max_new_tokens,
        "intervene_on_prompt": True,
        "streamer": streamer,
        "eos_token_id": tokenizer.eos_token_id,
        "early_stopping": True,
        "do_sample": True
    }
    # Run intervened generation in a background thread and stream tokens to the UI.
    t = Thread(target=reft_model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        )
    ],
    stop_btn=None,
    examples=[
        ["who are you?"],
        ["How are you?"],
    ],
)
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
    chat_interface.render()
    gr.Markdown(LICENSE)
if __name__ == "__main__":
    demo.queue(max_size=20).launch()