jvcgpt / src /utils /model.ts
Greums's picture
major improvements to the app
a417977
raw
history blame
5.81 kB
import {throttle} from "throttle-debounce";
import {Post, Topic} from "@/contexts/topics";
import {iso8601ToFrench, frenchToIso8601, getCurrentTimeIso8601} from "./dates"
import {Settings} from "@/contexts/settings";
import {LogAction} from "@/contexts/log";
import {streamAPI as beamStreamAPI} from "@/utils/beam";
// Captures the topic title: the text following `Sujet : "` up to an optional closing quote
// that is immediately followed by the `<|eot_id|>` marker. `.` does not match newlines,
// so the title must sit on a single line.
const titleRegex = /Sujet\s+:\s+"(.+?)"?<\|eot_id\|>/;
// Captures the author's username found between `<|im_pseudo|>` and `<|end_pseudo|>`.
const userRegex = /<\|im_pseudo\|>([^<]+)<\|end_pseudo\|>/;
// Captures the raw date string found between `<|im_date|>` and `<|end_date|>`.
const dateRegex = /<\|im_date\|>([^<]+)<\|end_date\|>/;
// Captures the body of a post: everything after `<|begin_of_post|>` up to the end of the
// string, excluding a trailing `<|end_of_post|>` marker when one is present.
// The quantifier is lazy on purpose: a greedy `[\s\S]+` would consume the end marker itself,
// leaving the optional group with nothing to match and leaking the marker into the capture.
const contentRegex = /<\|begin_of_post\|>([\s\S]+?)(?:<\|end_of_post\|>)?$/;
/**
 * Streams a brand-new topic from the model and reports progress through `feed`.
 *
 * The backend is called with an empty prompt. Every chunk it yields is appended to a
 * buffer; the buffer is re-parsed at most once every 250 ms and the resulting partial
 * topic is handed to `feed`. When the stream ends, the complete buffer is parsed one
 * last time and fed.
 *
 * @param settings - Generation settings; `settings.apiType` selects the streaming backend.
 * @param log - Log action forwarded to the backend.
 * @param topicId - Identifier given to every topic snapshot passed to `feed`.
 * @param feed - Callback receiving each parsed topic snapshot.
 * @throws Error when `settings.apiType` is not a supported backend, or when the final
 *         buffer cannot be parsed into a topic. Errors raised by the stream propagate.
 */
export async function feedTopic(
    settings: Settings,
    log: LogAction,
    topicId: string,
    feed: (topic: Topic) => void
): Promise<void> {
    let fetcher: (prompt: string, settings: Settings, log: LogAction) => AsyncGenerator<string>;
    if (settings.apiType === "beam") {
        fetcher = beamStreamAPI;
    } else {
        // Fail with a clear message instead of calling an unassigned variable.
        throw new Error(`Unsupported API type: ${String(settings.apiType)}`);
    }

    const throttledTokensToTopic = throttle(250, (buffer: string) => {
        try {
            feed(tokensToTopic(topicId, buffer));
        } catch {
            // Best effort: a partial buffer may not hold a parsable topic yet.
            // The final, unthrottled parse below reports real failures.
        }
    }, {noLeading: true, noTrailing: false, debounceMode: false});

    let buffer = "";
    try {
        for await (const tokens of fetcher("", settings, log)) {
            buffer += tokens;
            throttledTokensToTopic(buffer);
        }
    } finally {
        // Drop any pending trailing call, including when the stream fails, so that no
        // stale partial update is delivered afterwards.
        throttledTokensToTopic.cancel();
    }

    feed(tokensToTopic(topicId, buffer));
}
/**
 * Streams additional posts for an existing topic and reports progress through `feed`.
 *
 * The topic is serialized into a prompt, the backend continues it, and the prompt plus
 * the generated text is re-parsed at most once every 250 ms into a topic snapshot that
 * is handed to `feed`. When the stream ends, the complete buffer is parsed one last
 * time and fed.
 *
 * @param settings - Generation settings; `settings.apiType` selects the streaming backend.
 * @param log - Log action forwarded to the backend.
 * @param topic - Existing topic used as context; it must contain at least one post.
 * @param feed - Callback receiving the topic with the generated posts added.
 * @throws Error when the topic has no posts, when `settings.apiType` is not a supported
 *         backend, or when the final buffer cannot be parsed. Errors raised by the
 *         stream propagate.
 */
export async function feedPosts(
    settings: Settings,
    log: LogAction,
    topic: Topic,
    feed: (topic: Topic) => void // topic with posts added
): Promise<void> {
    // TODO: to avoid too long context:
    // If the topic exceeds a certain amount of posts -> only take the 3 first posts and
    // the 3 last ones to generate the context
    const context = tokenizeTopic(topic);

    let fetcher: (prompt: string, settings: Settings, log: LogAction) => AsyncGenerator<string>;
    if (settings.apiType === "beam") {
        fetcher = beamStreamAPI;
    } else {
        // Fail with a clear message instead of calling an unassigned variable.
        throw new Error(`Unsupported API type: ${String(settings.apiType)}`);
    }

    const throttledTokensToTopic = throttle(250, (buffer: string) => {
        try {
            feed(tokensToTopic(topic.id, buffer));
        } catch {
            // Best effort: a partial buffer may end in the middle of a post.
            // The final, unthrottled parse below reports real failures.
        }
    }, {noLeading: true, noTrailing: false, debounceMode: false});

    // The buffer starts with the prompt so that every snapshot contains the existing posts.
    let buffer = context;
    try {
        for await (const tokens of fetcher(context, settings, log)) {
            buffer += tokens;
            throttledTokensToTopic(buffer);
        }
    } finally {
        // Drop any pending trailing call, including when the stream fails, so that no
        // stale partial update is delivered afterwards.
        throttledTokensToTopic.cancel();
    }

    feed(tokensToTopic(topic.id, buffer));
}
/**
 * Parses raw model text into a topic.
 *
 * The text is cut on `<|end_of_post|>`. The piece after the last marker is either empty
 * or an unfinished post, so it is discarded. The title is read from the first piece.
 *
 * @param id - Identifier stored on the returned topic.
 * @param tokens - Raw text: topic header followed by zero or more posts.
 * @returns The topic with every complete post found in `tokens`.
 * @throws Error when the first piece holds no title, or when a post cannot be parsed.
 */
function tokensToTopic(id: string, tokens: string): Topic {
    const parsed: Topic = {id, title: "", posts: []};

    const segments = tokens.split("<|end_of_post|>");
    // The last element is empty or incomplete: drop it.
    segments.pop();

    segments.forEach((segment) => {
        // Only the first post carries the topic header.
        if (parsed.posts.length === 0) {
            const found = segment.match(titleRegex);
            if (found === null) {
                throw new Error("Impossible de trouver le titre du sujet");
            }
            parsed.title = found[1];
        }
        parsed.posts.push(...tokensToPosts(segment));
    });

    return parsed;
}
/**
 * Parses every non-empty `<|end_of_post|>`-separated piece of `tokens` into a post.
 *
 * @param tokens - Raw text holding one or more serialized posts.
 * @returns One post per non-empty piece, in order of appearance.
 * @throws Error when a piece lacks a username, a date or a content section.
 */
function tokensToPosts(tokens: string): Post[] {
    // TODO: remove the last piece instead of filtering on emptiness, because the last
    // one can be incomplete
    return tokens
        .split("<|end_of_post|>")
        .filter((segment) => segment.length > 0)
        .map((segment) => {
            const author = segment.match(userRegex);
            if (author === null) {
                throw new Error("Impossible de trouver le nom de l'auteur du message");
            }

            const postedOn = segment.match(dateRegex);
            if (postedOn === null) {
                throw new Error("Impossible de trouver la date du message");
            }

            const body = segment.match(contentRegex);
            if (body === null) {
                throw new Error("Impossible de trouver le contenu du message");
            }

            return {
                user: author[1],
                date: frenchToIso8601(postedOn[1]),
                generationDate: getCurrentTimeIso8601(),
                content: body[1],
            };
        });
}
/**
 * Serializes a topic into the prompt format: a subject header followed by every post.
 *
 * The user of the first post is passed to `tokenizePost` as the topic's poster.
 *
 * @param topic - Topic to serialize; it must contain at least one post.
 * @returns The serialized topic.
 * @throws Error when the topic has no posts.
 */
function tokenizeTopic(topic: Topic): string {
    const {title, posts} = topic;
    if (posts.length === 0) {
        throw new Error("Topic must have at least one post")
    }

    const poster = posts[0].user;
    const serializedPosts = posts.map((post) => tokenizePost(post, poster)).join("");

    const header =
        "<|start_header_id|><|sujet|><|end_header_id|>" +
        "\n" +
        "\n" +
        `Sujet : "${title}"`;

    return header + serializedPosts;
}
/**
 * Serializes one post into the prompt format.
 *
 * @param post - Post to serialize.
 * @param poster - Username of the topic's poster; a post by this user gets the
 *                 `autheur` role tag, any other post gets `khey`.
 * @returns The serialized post, starting with `<|eot_id|>` and ending with `<|end_of_post|>`.
 */
function tokenizePost(post: Post, poster: string): string {
    const role = post.user === poster ? "autheur" : "khey";

    // NOTE(review): the role tag and `<|end_header_id|>` are separated by a newline here,
    // whereas the topic header keeps them on one line - confirm this matches the model format.
    const header = `<|eot_id|><|start_header_id|><|${role}|>` + "\n" + "<|end_header_id|>";
    const pseudo = `<|im_pseudo|>${post.user}<|end_pseudo|>`;
    const date = `<|im_date|>Le ${iso8601ToFrench(post.date)}<|end_date|>`;
    const content = `<|begin_of_post|>${post.content}<|end_of_post|>`;

    // Blank lines separate the header, the metadata and the content.
    return header + "\n\n" + pseudo + "\n" + date + "\n\n" + content;
}