import {Post, Topic} from "./topics";
import {iso8601ToFrench, frenchToIso8601} from "./dates";
import {generateUUID} from "./uuids";

// Title line of the topic header. The closing quote is optional, but the title
// must be followed by <|eot_id|> (which tokenizePost emits at the start of
// every post).
const titleRegex = /Sujet\s+:\s+"(.+?)"?<\|eot_id\|>/;
// Author pseudo of a single post.
const userRegex = /<\|im_pseudo\|>([^<]+)<\|end_pseudo\|>/;
// Date of a single post, exactly as written between the date markers.
const dateRegex = /<\|im_date\|>([^<]+)<\|end_date\|>/;
// Body of a single post: the capture is greedy and runs to the end of the
// input, so callers are expected to have split on <|end_of_post|> beforehand.
const contentRegex = /<\|begin_of_post\|>([\s\S]+)(?:<\|end_of_post\|>)?$/;

/**
 * Returns the first capture group of `pattern` in `source`.
 *
 * Centralises the null check so that malformed input produces an error naming
 * the missing field instead of a "cannot read properties of null" TypeError.
 *
 * @param source - Text to search.
 * @param pattern - Regular expression with at least one capture group.
 * @param label - Human-readable name of the field, used in the error message.
 * @returns The text captured by the first group.
 * @throws Error if the pattern does not match or the group did not participate.
 */
function captureGroup(source: string, pattern: RegExp, label: string): string {
    const match = source.match(pattern);
    const captured = match === null ? undefined : match[1];
    if (captured === undefined) {
        throw new Error(`Malformed tokens: could not find ${label}`);
    }
    return captured;
}

/**
 * Parses a tokenized topic (the format produced by `tokenizeTopic`) back into
 * a `Topic` with a freshly generated id.
 *
 * Only posts terminated by `<|end_of_post|>` are parsed; whatever follows the
 * last terminator is discarded.
 *
 * @param tokens - Tokenized topic text.
 * @returns The parsed topic. Its title stays empty when no terminated post exists.
 * @throws Error if the first post segment has no title, or if any post lacks
 *         its pseudo, date or content markers.
 */
export function tokensToTopic(tokens: string): Topic {
    const topic: Topic = {
        id: generateUUID(),
        title: "",
        posts: [],
    };

    // Split the tokens into posts. The segment after the final
    // <|end_of_post|> is either empty or an unterminated (incomplete) post,
    // so it is dropped.
    const completePosts = tokens.split("<|end_of_post|>").slice(0, -1);

    for (const postTokens of completePosts) {
        console.log("Post tokens:")
        console.log(postTokens);

        // The topic header (and therefore the title) lives in the same
        // segment as the first post.
        if (topic.posts.length < 1) {
            const title = captureGroup(postTokens, titleRegex, "topic title");
            console.log(`title: ${title}`)
            topic.title = title;
        }

        topic.posts.push(...tokensToPosts(postTokens));
    }

    return topic;
}

/**
 * Parses every post found in `tokens`.
 *
 * The input is split on `<|end_of_post|>`; empty segments are skipped and each
 * remaining segment must contain pseudo, date and content markers.
 *
 * @param tokens - One or more tokenized posts.
 * @returns The parsed posts, in input order, with the date passed through
 *          `frenchToIso8601`.
 * @throws Error if a non-empty segment lacks a pseudo, date or content marker.
 */
export function tokensToPosts(tokens: string): Post[] {
    const posts: Post[] = [];

    for (const postTokens of tokens.split("<|end_of_post|>")) {
        // TODO: remove the last segment instead of doing this, because the
        // last one can be incomplete.
        if (postTokens.length < 1) {
            continue;
        }

        console.log("Post tokens:")
        console.log(postTokens);

        const user = captureGroup(postTokens, userRegex, "post pseudo");
        console.log(`user: ${user}`)

        const date = captureGroup(postTokens, dateRegex, "post date");
        console.log(`date: ${date}`)

        const content = captureGroup(postTokens, contentRegex, "post content");
        console.log(`content: ${content}`)

        posts.push({
            user,
            date: frenchToIso8601(date),
            content,
        });
    }

    return posts;
}

/**
 * Serializes a topic into its token representation: a subject header followed
 * by every post, each tagged relative to the author of the first post.
 *
 * @param topic - Topic to serialize.
 * @returns The tokenized topic.
 * @throws Error if the topic has no posts.
 */
export function tokenizeTopic(topic: Topic): string {
    const firstPost = topic.posts[0];
    if (firstPost === undefined) {
        throw new Error("Topic must have at least one post")
    }

    // The user of the first post is treated as the topic's original poster.
    const poster = firstPost.user;
    const tokenizedPosts = topic.posts
        .map(post => tokenizePost(post, poster))
        .join("");

    const lines = [
        "<|start_header_id|><|sujet|><|end_header_id|>",
        "",
        `Sujet : "${topic.title}"`,
    ];

    // No separator here: each tokenized post already starts with <|eot_id|>,
    // which closes the title line.
    return lines.join("\n") + tokenizedPosts;
}

/**
 * Serializes a single post.
 *
 * @param post - Post to serialize.
 * @param poster - User name of the topic's original poster; a post whose user
 *                 equals it is tagged `autheur`, any other post `khey`.
 * @returns The tokenized post, starting with `<|eot_id|>` and ending with
 *          `<|end_of_post|>`.
 */
export function tokenizePost(post: Post, poster: string): string {
    const lines = [
        `<|eot_id|><|start_header_id|><|${post.user === poster ? "autheur" : "khey"}|>`,
        "<|end_header_id|>",
        "",
        `<|im_pseudo|>${post.user}<|end_pseudo|>`,
        `<|im_date|>Le ${iso8601ToFrench(post.date)}<|end_date|>`,
        "",
        `<|begin_of_post|>${post.content}<|end_of_post|>`
    ];

    return lines.join("\n");
}