File size: 3,145 Bytes
1813a37 1982de5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import {Post, Topic} from "./topics";
import {iso8601ToFrench, frenchToIso8601} from "./dates"
import {generateUUID} from "./uuids";
// Patterns used to pull individual fields out of a topic's token stream.
// None of them is global, so `String.prototype.match` returns capture groups.

// Topic title: the text after `Sujet : "`, up to an optional closing quote that
// is immediately followed by `<|eot_id|>`. `.` does not match line breaks, so
// the title has to sit on a single line.
const titleRegex = /Sujet\s+:\s+"(.+?)"?<\|eot_id\|>/;
// Poster name: the text between `<|im_pseudo|>` and `<|end_pseudo|>` (no `<` allowed inside).
const userRegex = /<\|im_pseudo\|>([^<]+)<\|end_pseudo\|>/;
// Post date: the text between `<|im_date|>` and `<|end_date|>` (no `<` allowed inside).
const dateRegex = /<\|im_date\|>([^<]+)<\|end_date\|>/;
// Post body: everything after `<|begin_of_post|>` up to the end of the input.
// The capture is lazy so that a trailing `<|end_of_post|>` is consumed by the
// optional group instead of ending up inside the captured content.
const contentRegex = /<\|begin_of_post\|>([\s\S]+?)(?:<\|end_of_post\|>)?$/;
/**
 * Parses a raw token stream into a {@link Topic}.
 *
 * The stream is split on `<|end_of_post|>`. The segment that follows the last
 * delimiter is discarded (it is empty when the stream ends with a delimiter,
 * otherwise it is an unterminated post), and every remaining segment is handed
 * to `tokensToPosts`. The title is read from the first segment with
 * `titleRegex`.
 *
 * @param tokens - Token stream holding the topic header followed by its posts.
 * @returns A topic with a freshly generated id, the parsed title and the parsed
 *   posts. When `tokens` contains no `<|end_of_post|>` delimiter the result has
 *   an empty title and no posts.
 * @throws Error when the first segment does not contain a title matching
 *   `titleRegex`. Errors raised by `tokensToPosts` propagate unchanged.
 */
export function tokensToTopic(tokens: string): Topic {
  // Whatever follows the final delimiter is not a complete post, so drop it.
  const segments = tokens.split("<|end_of_post|>").slice(0, -1);

  const posts: Post[] = [];
  let title = "";

  for (const segment of segments) {
    // The topic header, and therefore the title, is part of the first post's segment.
    if (posts.length === 0) {
      const parsedTitle = segment.match(titleRegex)?.[1];
      if (parsedTitle === undefined) {
        throw new Error("Malformed topic: no title found in the first post's tokens");
      }
      title = parsedTitle;
    }
    posts.push(...tokensToPosts(segment));
  }

  const topic: Topic = {
    id: generateUUID(),
    title,
    posts,
  };
  return topic;
}
/**
 * Parses every post found in a token stream.
 *
 * The stream is split on `<|end_of_post|>` and empty segments are skipped. Each
 * remaining segment must contain a user, a date and a content section matching
 * `userRegex`, `dateRegex` and `contentRegex`; the date text is converted with
 * `frenchToIso8601`.
 *
 * @param tokens - Token stream holding zero or more posts.
 * @returns The parsed posts, in stream order.
 * @throws Error when a non-empty segment lacks one of the three fields.
 */
export function tokensToPosts(tokens: string): Post[] {
  const posts: Post[] = [];
  for (const postTokens of tokens.split("<|end_of_post|>")) {
    // TODO: remove the last segment instead of doing this, because the last one can be incomplete
    if (postTokens.length === 0) {
      continue;
    }
    // Extract all three fields before converting anything, so a missing field
    // is always reported ahead of a date conversion failure.
    const user = extractPostField(postTokens, userRegex, "user");
    const rawDate = extractPostField(postTokens, dateRegex, "date");
    const content = extractPostField(postTokens, contentRegex, "content");
    posts.push({
      user,
      date: frenchToIso8601(rawDate),
      content,
    });
  }
  return posts;
}

/**
 * Returns the first capture group of `pattern` within `postTokens`.
 *
 * @param postTokens - Tokens of a single post.
 * @param pattern - Non-global pattern with at least one capture group.
 * @param field - Field name used in the error message.
 * @throws Error when the pattern does not match or its first group is unset.
 */
function extractPostField(postTokens: string, pattern: RegExp, field: string): string {
  const value = postTokens.match(pattern)?.[1];
  if (value === undefined) {
    throw new Error(`Malformed post: no ${field} found in the post's tokens`);
  }
  return value;
}
/**
 * Serializes a topic into its token-stream text form.
 *
 * The output is the topic header (the `<|sujet|>` header line, a blank line,
 * then `Sujet : "<title>"`) immediately followed by every post rendered with
 * `tokenizePost`. The user of the first post is passed to `tokenizePost` as the
 * poster for every post.
 *
 * @param topic - Topic to serialize; it must contain at least one post.
 * @returns The serialized topic.
 * @throws Error when the topic has no posts.
 */
export function tokenizeTopic(topic: Topic): string {
  if (topic.posts.length === 0) {
    throw new Error("Topic must have at least one post")
  }

  // Posts are appended back to back; each rendered post brings its own leading marker.
  const serializedPosts = topic.posts.reduce(
    (accumulated, post) => accumulated + tokenizePost(post, topic.posts[0].user),
    "",
  );

  const header = `<|start_header_id|><|sujet|><|end_header_id|>\n\nSujet : "${topic.title}"`;
  return header + serializedPosts;
}
/**
 * Serializes a single post into its token-stream text form.
 *
 * The role tag in the header is `autheur` when the post's user equals `poster`
 * and `khey` otherwise. The date is rendered with `iso8601ToFrench` and
 * prefixed with `Le `.
 *
 * @param post - Post to serialize.
 * @param poster - User name compared against `post.user` to choose the role tag.
 * @returns The serialized post, starting with `<|eot_id|>` and ending with
 *   `<|end_of_post|>`.
 */
export function tokenizePost(post: Post, poster: string): string {
  const role = post.user === poster ? "autheur" : "khey";

  // Each section ends with a line break; joining the sections with another one
  // yields the blank separator line between them.
  const header = `<|eot_id|><|start_header_id|><|${role}|>\n<|end_header_id|>\n`;
  const metadata =
    `<|im_pseudo|>${post.user}<|end_pseudo|>\n` +
    `<|im_date|>Le ${iso8601ToFrench(post.date)}<|end_date|>\n`;
  const body = `<|begin_of_post|>${post.content}<|end_of_post|>`;

  return [header, metadata, body].join("\n");
}