import {Post, Topic} from "./topics";
import {iso8601ToFrench, frenchToIso8601} from "./dates";
import {generateUUID} from "./uuids";

// Regexes for pulling the topic title, pseudo, date, and post body back out
// of a token string. The closing quote in the title is optional so a title
// whose quote was dropped by the model still parses.
const titleRegex = /Sujet\s+:\s+"(.+?)"?<\|eot_id\|>/;
const userRegex = /<\|im_pseudo\|>([^<]+)<\|end_pseudo\|>/;
const dateRegex = /<\|im_date\|>([^<]+)<\|end_date\|>/;
// Lazy capture so that a trailing <|end_of_post|> marker, if present, is not
// swallowed into the captured body.
const contentRegex = /<\|begin_of_post\|>([\s\S]+?)(?:<\|end_of_post\|>)?$/;
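
// For reference, one tokenized topic with a single post looks roughly like
// this (reconstructed from tokenizeTopic/tokenizePost below; the pseudo,
// date, and body are placeholder values, and the exact date string depends
// on what iso8601ToFrench produces):
//
//   <|start_header_id|><|sujet|><|end_header_id|>
//
//   Sujet : "Titre du sujet"<|eot_id|><|start_header_id|><|autheur|>
//   <|end_header_id|>
//
//   <|im_pseudo|>PseudoDuKhey<|end_pseudo|>
//   <|im_date|>Le 01 janvier 2024 à 12:00:00<|end_date|>
//
//   <|begin_of_post|>Contenu du post.<|end_of_post|>
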
export function tokensToTopic(tokens: string): Topic {
    const topic: Topic = {
        id: generateUUID(),
        title: "",
        posts: [],
    };

    // Posts are delimited by <|end_of_post|>; the final fragment produced by
    // split() is whatever trails the last delimiter, so drop it.
    for (const postTokens of tokens.split("<|end_of_post|>").slice(0, -1)) {
        console.log("Post tokens:");
        console.log(postTokens);

        // The first fragment also carries the topic header: extract the title.
        if (topic.posts.length < 1) {
            const titleMatch = postTokens.match(titleRegex);
            if (!titleMatch) {
                throw new Error("Topic tokens have no parsable title");
            }
            console.log(`title: ${titleMatch[1]}`);
            topic.title = titleMatch[1];
        }

        topic.posts = topic.posts.concat(tokensToPosts(postTokens));
    }

    return topic;
}
export function tokensToPosts(tokens: string): Post[] {
    const posts: Post[] = [];

    for (const postTokens of tokens.split("<|end_of_post|>")) {
        // Skip empty fragments, e.g. the remainder after the last delimiter.
        if (postTokens.length < 1) {
            continue;
        }

        console.log("Post tokens:");
        console.log(postTokens);

        const userMatch = postTokens.match(userRegex);
        const dateMatch = postTokens.match(dateRegex);
        const contentMatch = postTokens.match(contentRegex);
        if (!userMatch || !dateMatch || !contentMatch) {
            throw new Error("Post tokens are missing a pseudo, date, or body");
        }

        console.log(`user: ${userMatch[1]}`);
        console.log(`date: ${dateMatch[1]}`);
        console.log(`content: ${contentMatch[1]}`);

        posts.push({
            user: userMatch[1],
            date: frenchToIso8601(dateMatch[1]),
            content: contentMatch[1],
        });
    }

    return posts;
}
export function tokenizeTopic(topic: Topic): string {
    if (topic.posts.length === 0) {
        throw new Error("Topic must have at least one post");
    }

    // The topic author is whoever wrote the first post; tokenizePost tags each
    // post relative to them. map() already yields a flat string[], so the
    // parts can be joined directly.
    const tokenizedPosts = topic.posts.map(post => tokenizePost(post, topic.posts[0].user)).join("");

    const lines = [
        "<|start_header_id|><|sujet|><|end_header_id|>",
        "",
        `Sujet : "${topic.title}"`,
    ];

    // No separator: the first post begins with <|eot_id|>, which closes the
    // title line so that titleRegex can find it again when parsing.
    return lines.join("\n") + tokenizedPosts;
}
export function tokenizePost(post: Post, poster: string): string {
    // Posts by the topic author are tagged <|autheur|>, replies from other
    // users <|khey|>.
    const lines = [
        `<|eot_id|><|start_header_id|><|${post.user === poster ? "autheur" : "khey"}|>`,
        "<|end_header_id|>",
        "",
        `<|im_pseudo|>${post.user}<|end_pseudo|>`,
        `<|im_date|>Le ${iso8601ToFrench(post.date)}<|end_date|>`,
        "",
        `<|begin_of_post|>${post.content}<|end_of_post|>`,
    ];

    return lines.join("\n");
}
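
// A minimal round-trip sketch, assuming Post has {user, date, content} with an
// ISO 8601 date and Topic has {id, title, posts} as used above, and that
// frenchToIso8601 accepts the "Le …" date string emitted by tokenizePost.
// The values are hypothetical; this only illustrates that tokensToTopic is the
// inverse of tokenizeTopic (up to the freshly generated id).
//
//   const topic: Topic = {
//       id: generateUUID(),
//       title: "Exemple de sujet",
//       posts: [
//           {user: "PseudoA", date: "2024-01-01T12:00:00", content: "Premier post."},
//           {user: "PseudoB", date: "2024-01-01T12:05:00", content: "Une réponse."},
//       ],
//   };
//
//   const tokens = tokenizeTopic(topic);
//   const parsed = tokensToTopic(tokens);
//   console.log(parsed.title);         // "Exemple de sujet"
//   console.log(parsed.posts.length);  // 2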