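// In-browser LLM support built on wllama: initialization against the bundled
// WASM assets, a registry of downloadable GGUF models, chat-template prompt
// building, and a Cache Storage-backed cache manager for model files.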
import { Template } from "@huggingface/jinja";
import {
type DownloadModelConfig,
type SamplingConfig,
Wllama,
type WllamaConfig,
} from "@wllama/wllama/esm";
import type CacheManager from "@wllama/wllama/esm/cache-manager";
import type {
CacheEntry,
CacheEntryMetadata,
} from "@wllama/wllama/esm/cache-manager";
import multiThreadWllamaJsUrl from "@wllama/wllama/esm/multi-thread/wllama.js?url";
import multiThreadWllamaWasmUrl from "@wllama/wllama/esm/multi-thread/wllama.wasm?url";
import multiThreadWllamaWorkerMjsUrl from "@wllama/wllama/esm/multi-thread/wllama.worker.mjs?url";
import singleThreadWllamaJsUrl from "@wllama/wllama/esm/single-thread/wllama.js?url";
import singleThreadWllamaWasmUrl from "@wllama/wllama/esm/single-thread/wllama.wasm?url";
import { addLogEntry } from "./logEntries";
import { defaultSettings } from "./settings";
import { getSystemPrompt } from "./systemPrompt";
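/**
 * Creates a Wllama instance wired to the bundled single- and multi-threaded
 * WASM assets, swaps in the Cache Storage-backed cache manager, and downloads
 * the model before returning.
 *
 * A minimal usage sketch (the chosen model key and the `n_ctx` override are
 * illustrative; field names follow wllama's load-model config):
 *
 * @example
 * const model = wllamaModels["smollm2-135m"];
 * const wllama = await initializeWllama(model.url, {
 *   model: { n_ctx: model.contextSize },
 * });
 */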
export async function initializeWllama(
modelUrl: string | string[],
config?: {
wllama?: WllamaConfig;
model?: DownloadModelConfig;
},
) {
addLogEntry("Initializing Wllama");
const wllama = new Wllama(
{
"single-thread/wllama.js": singleThreadWllamaJsUrl,
"single-thread/wllama.wasm": singleThreadWllamaWasmUrl,
"multi-thread/wllama.js": multiThreadWllamaJsUrl,
"multi-thread/wllama.wasm": multiThreadWllamaWasmUrl,
"multi-thread/wllama.worker.mjs": multiThreadWllamaWorkerMjsUrl,
},
config?.wllama,
);
wllama.cacheManager = new CustomCacheManager("wllama-cache");
await wllama.loadModelFromUrl(modelUrl, config?.model);
addLogEntry("Wllama initialized successfully");
return wllama;
}
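/** Describes a downloadable GGUF model and how to build a prompt for it. */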
export interface WllamaModel {
label: string;
url: string | string[];
cacheType: "f16" | "q8_0" | "q4_0";
contextSize: number;
fileSizeInMegabytes: number;
sampling: SamplingConfig;
shouldIncludeUrlsOnPrompt: boolean;
buildPrompt: (
wllama: Wllama,
query: string,
searchResults: string,
) => Promise<string>;
stopStrings?: string[];
stopTokens?: number[];
}
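/**
 * Defaults shared by every model entry: a 2048-token context, f16 cache type,
 * and a three-message prompt (system instructions, an "Ok!" acknowledgment
 * from the assistant, then the user query) rendered through the model's own
 * chat template via formatChat.
 */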
const defaultModelConfig: Omit<
WllamaModel,
"label" | "url" | "fileSizeInMegabytes"
> = {
buildPrompt: async (wllama, query, searchResults) => {
return formatChat(wllama, [
{ id: 1, role: "user", content: getSystemPrompt(searchResults) },
{ id: 2, role: "assistant", content: "Ok!" },
{ id: 3, role: "user", content: query },
]);
},
cacheType: "f16",
contextSize: 2048,
shouldIncludeUrlsOnPrompt: true,
sampling: {
top_p: defaultSettings.inferenceTopP,
temp: defaultSettings.inferenceTemperature,
penalty_freq: defaultSettings.inferenceFrequencyPenalty,
penalty_present: defaultSettings.inferencePresencePenalty,
penalty_repeat: defaultSettings.inferenceRepeatPenalty,
    // @ts-expect-error Wllama doesn't define the following properties yet, although they are supported by llama.cpp.
xtc_probability: 0.5,
dry_multiplier: 0.8,
sampling_seq: "ptxd",
},
};
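/**
 * Models selectable by the user, keyed by slug. Each single-URL entry points
 * at the first shard of a sharded GGUF; the remaining shards are expected to
 * be resolved from the `-00001-of-000NN` naming convention at load time.
 */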
export const wllamaModels: Record<string, WllamaModel> = {
"smollm2-135m": {
...defaultModelConfig,
label: "SmolLM 2 135M",
url: "https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/model.shard-00001-of-00005.gguf",
fileSizeInMegabytes: 145,
},
"smollm2-360m": {
...defaultModelConfig,
label: "SmolLM 2 360M",
url: "https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-360M-Instruct/resolve/main/model.shard-00001-of-00008.gguf",
fileSizeInMegabytes: 386,
},
"qwen-2.5-0.5b": {
...defaultModelConfig,
label: "Qwen 2.5 0.5B",
url: "https://huggingface.co/Felladrin/gguf-Q8_0-Qwen2.5-0.5B-Instruct/resolve/main/model.shard-00001-of-00004.gguf",
fileSizeInMegabytes: 531,
},
"danube-3-500m": {
...defaultModelConfig,
label: "Danube 3 500M",
url: "https://huggingface.co/Felladrin/gguf-q8_0-h2o-danube3-500m-chat/resolve/main/model.shard-00001-of-00011.gguf",
fileSizeInMegabytes: 547,
},
"amd-olmo-1b": {
...defaultModelConfig,
label: "AMD OLMo 1B",
url: "https://huggingface.co/Felladrin/gguf-Q5_K_L-AMD-OLMo-1B-SFT-DPO/resolve/main/model.shard-00001-of-00009.gguf",
fileSizeInMegabytes: 872,
},
"granite-3.0-1b": {
...defaultModelConfig,
label: "Granite 3.0 1B [400M]",
url: "https://huggingface.co/Felladrin/gguf-sharded-q5_k_l-granite-3.0-1b-a400m-instruct/resolve/main/model.shard-00001-of-00019.gguf",
fileSizeInMegabytes: 969,
buildPrompt: async (_, query, searchResults) =>
buildGranitePrompt(query, searchResults),
},
"llama-3.2-1b": {
...defaultModelConfig,
label: "Llama 3.2 1B",
url: "https://huggingface.co/Felladrin/gguf-sharded-Q5_K_L-Llama-3.2-1B-Instruct/resolve/main/model.shard-00001-of-00005.gguf",
fileSizeInMegabytes: 975,
},
"pythia-1.4b": {
...defaultModelConfig,
label: "Pythia 1.4B",
url: "https://huggingface.co/Felladrin/gguf-sharded-pythia-1.4b-sft-full/resolve/main/pythia-1.4b-sft-full.Q5_K_M.shard-00001-of-00011.gguf",
fileSizeInMegabytes: 1060,
},
"pints-1.5b": {
...defaultModelConfig,
label: "Pints 1.5B",
url: "https://huggingface.co/Felladrin/gguf-sharded-Q5_K-1.5-Pints-2K-v0.1/resolve/main/model.shard-00001-of-00018.gguf",
fileSizeInMegabytes: 1150,
},
"smollm2-1.7b": {
...defaultModelConfig,
label: "SmolLM 2 1.7B",
url: "https://huggingface.co/Felladrin/gguf-Q5_K_M-SmolLM2-1.7B-Instruct/resolve/main/model.shard-00001-of-00016.gguf",
fileSizeInMegabytes: 1230,
},
"arcee-lite": {
...defaultModelConfig,
label: "Arcee Lite 1.5B",
url: "https://huggingface.co/Felladrin/gguf-q5_k_l-imat-arcee-lite/resolve/main/arcee-lite-Q5_K_L.shard-00001-of-00006.gguf",
fileSizeInMegabytes: 1430,
},
"danube2-1.8b": {
...defaultModelConfig,
label: "Danube 2 1.8B",
url: "https://huggingface.co/Felladrin/gguf-q5_k_m-h2o-danube2-1.8b-chat/resolve/main/h2o-danube2-1-00001-of-00021.gguf",
fileSizeInMegabytes: 1300,
},
"granite-3.0-2b": {
...defaultModelConfig,
label: "Granite 3.0 2B",
url: "https://huggingface.co/Felladrin/gguf-q5_k_m-granite-3.0-2b-instruct/resolve/main/granite-3-00001-of-00023.gguf",
fileSizeInMegabytes: 1870,
buildPrompt: async (_, query, searchResults) =>
buildGranitePrompt(query, searchResults),
},
"gemma-2-2b": {
...defaultModelConfig,
label: "Gemma 2 2B",
url: "https://huggingface.co/Felladrin/gguf-sharded-gemma-2-2b-it-abliterated/resolve/main/gemma-2-2b-it-abliterated-q5_k_m-imat-00001-of-00009.gguf",
fileSizeInMegabytes: 1920,
},
"llama-3.2-3b": {
...defaultModelConfig,
label: "Llama 3.2 3B",
url: "https://huggingface.co/Felladrin/gguf-sharded-Q5_K_L-Llama-3.2-3B-Instruct/resolve/main/model.shard-00001-of-00007.gguf",
fileSizeInMegabytes: 2420,
},
"granite-3.0-3b": {
...defaultModelConfig,
label: "Granite 3.0 3B [800M]",
url: "https://huggingface.co/Felladrin/gguf-sharded-Q5_K_L-granite-3.0-3b-a800m-instruct/resolve/main/model.shard-00001-of-00034.gguf",
fileSizeInMegabytes: 2450,
buildPrompt: async (_, query, searchResults) =>
buildGranitePrompt(query, searchResults),
},
"minicpm3-4b": {
...defaultModelConfig,
label: "MiniCPM 3 4B",
url: "https://huggingface.co/Felladrin/gguf-Q4_K_M-MiniCPM3-4B/resolve/main/model.shard-00001-of-00017.gguf",
fileSizeInMegabytes: 2470,
contextSize: 1920,
},
"phi-3.5-mini-3.8b": {
...defaultModelConfig,
label: "Phi 3.5 Mini 3.8B",
url: "https://huggingface.co/Felladrin/gguf-q5_k_m-phi-3.5-mini-instruct/resolve/main/phi-3-00001-of-00025.gguf",
fileSizeInMegabytes: 2820,
},
"magpielm-4b": {
...defaultModelConfig,
label: "MagpieLM 4B",
url: "https://huggingface.co/Felladrin/gguf-Q5_K_M-MagpieLM-4B-Chat-v0.1/resolve/main/magpielm-4b-chat-v0-00001-of-00019.gguf",
fileSizeInMegabytes: 3230,
},
"nemotron-mini-4b": {
...defaultModelConfig,
label: "Nemotron Mini 4B",
url: "https://huggingface.co/Felladrin/gguf-Q5_K_M-Nemotron-Mini-4B-Instruct/resolve/main/nemotron-mini-4b-instruct-q5_k_m-imat-00001-of-00004.gguf",
fileSizeInMegabytes: 3550,
},
"olmoe-1b-7b": {
...defaultModelConfig,
label: "OLMoE 7B [1B]",
url: "https://huggingface.co/Felladrin/gguf-sharded-Q3_K_XL-OLMoE-1B-7B-0924-Instruct/resolve/main/OLMoE-1B-7B-0924-Instruct-Q3_K_XL.shard-00001-of-00050.gguf",
fileSizeInMegabytes: 3700,
},
};
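/**
 * The Granite entries above override buildPrompt to assemble IBM's
 * role-delimiter chat format directly, bypassing formatChat and the model's
 * embedded template.
 */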
function buildGranitePrompt(query: string, searchResults: string) {
return `<|start_of_role|>system<|end_of_role|>${getSystemPrompt(searchResults)}<|end_of_text|>
<|start_of_role|>user<|end_of_role|>${query}<|end_of_text|>
<|start_of_role|>assistant<|end_of_role|>`;
}
export interface Message {
id: number;
content: string;
role: "system" | "user" | "assistant";
}
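/**
 * Renders messages through the model's embedded Jinja chat template, falling
 * back to ChatML when the GGUF ships none. The template receives the BOS/EOS
 * token strings detokenized from the live model.
 *
 * A sketch of a call and the ChatML-fallback output it would produce:
 *
 * @example
 * // "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"
 * const prompt = await formatChat(wllama, [
 *   { id: 1, role: "user", content: "Hello" },
 * ]);
 */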
export const formatChat = async (wllama: Wllama, messages: Message[]) => {
const defaultChatTemplate =
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}";
const template = new Template(
wllama.getChatTemplate() ?? defaultChatTemplate,
);
const textDecoder = new TextDecoder();
return template.render({
messages,
bos_token: textDecoder.decode(await wllama.detokenize([wllama.getBOS()])),
eos_token: textDecoder.decode(await wllama.detokenize([wllama.getEOS()])),
add_generation_prompt: true,
});
};
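/**
 * CacheManager implementation backed by the browser Cache Storage API
 * (`caches`), replacing wllama's default cache manager. Entry metadata is
 * round-tripped through a custom `X-Metadata` response header.
 */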
class CustomCacheManager implements CacheManager {
private readonly cacheName: string;
constructor(cacheName: string) {
this.cacheName = cacheName;
}
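  // Derives a stable cache key: the SHA-1 of the full URL prefixed to the
  // file's basename, so shards sharing a filename across repos cannot collide.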
async getNameFromURL(url: string): Promise<string> {
const encoder = new TextEncoder();
const data = encoder.encode(url);
const hashBuffer = await crypto.subtle.digest("SHA-1", data);
const hashArray = Array.from(new Uint8Array(hashBuffer));
const hashHex = hashArray
.map((b) => b.toString(16).padStart(2, "0"))
.join("");
const fileName = url.split("/").pop() || "default";
return `${hashHex}_${fileName}`;
}
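  // Persists a download stream as a cached Response, carrying the entry's
  // metadata in the X-Metadata header so it is stored alongside the body.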
async write(
name: string,
stream: ReadableStream,
metadata: CacheEntryMetadata,
): Promise<void> {
const cache = await caches.open(this.cacheName);
const response = new Response(stream, {
headers: { "X-Metadata": JSON.stringify(metadata) },
});
await cache.put(name, response);
}
async open(name: string): Promise<ReadableStream | null> {
const cache = await caches.open(this.cacheName);
const response = await cache.match(name);
return response?.body ?? null;
}
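  // Returns -1 for a missing entry; prefers the Content-Length header and
  // only materializes the body as a blob when the header is absent or zero.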
async getSize(name: string): Promise<number> {
const cache = await caches.open(this.cacheName);
const response = await cache.match(name);
if (!response) return -1;
return (
Number(response.headers.get("Content-Length")) ||
(await response.blob()).size
);
}
async getMetadata(name: string): Promise<CacheEntryMetadata | null> {
const cache = await caches.open(this.cacheName);
const response = await cache.match(name);
if (!response) return null;
const metadata = response.headers.get("X-Metadata");
return metadata ? JSON.parse(metadata) : null;
}
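  // Enumerates cached entries. Names are reported as fully-resolved request
  // URLs, which cache.delete() in delete()/deleteMany() also accepts.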
async list(): Promise<CacheEntry[]> {
const cache = await caches.open(this.cacheName);
const keys = await cache.keys();
return Promise.all(
keys.map(async (request) => {
const response = await cache.match(request);
if (!response) throw new Error(`No response for ${request.url}`);
const metadata = await this.getMetadata(request.url);
const size = await this.getSize(request.url);
return {
name: request.url,
size,
metadata: metadata ?? { etag: "", originalSize: 0, originalURL: "" },
};
}),
);
}
async clear(): Promise<void> {
await caches.delete(this.cacheName);
}
async delete(nameOrURL: string): Promise<void> {
const cache = await caches.open(this.cacheName);
const success = await cache.delete(nameOrURL);
if (!success) {
throw new Error(`Failed to delete cache entry for ${nameOrURL}`);
}
}
async deleteMany(predicate: (e: CacheEntry) => boolean): Promise<void> {
const entries = await this.list();
const cache = await caches.open(this.cacheName);
const deletionPromises = entries
.filter(predicate)
.map((entry) => cache.delete(entry.name));
await Promise.all(deletionPromises);
}
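  // Replaces only the X-Metadata header, re-caching the existing body under
  // the same key.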
async writeMetadata(
name: string,
metadata: CacheEntryMetadata,
): Promise<void> {
const cache = await caches.open(this.cacheName);
const response = await cache.match(name);
if (!response) {
throw new Error(`Cache entry for ${name} not found`);
}
    // Note: spreading a Headers instance yields an empty object, so the
    // existing headers must be copied explicitly before overriding X-Metadata.
    const newHeaders = new Headers(response.headers);
    newHeaders.set("X-Metadata", JSON.stringify(metadata));
    const newResponse = new Response(response.body, { headers: newHeaders });
await cache.put(name, newResponse);
}
}