|
import {Post, Topic} from "./topics"; |
|
import {iso8601ToFrench, frenchToIso8601} from "./dates" |
|
import {generateUUID} from "./uuids"; |
|
|
|
|
|
// Patterns used to pull individual fields back out of a tokenized topic.
// Each pattern exposes the field's payload as capture group 1.

/**
 * Topic title: the text after `Sujet : "` up to the first `<|eot_id|>`.
 * The closing quote is optional. The capture is lazy and may be empty, so an
 * empty title (`Sujet : ""<|eot_id|>`) is read back as "" rather than as `"`.
 */
const titleRegex = /Sujet\s+:\s+"(.*?)"?<\|eot_id\|>/;

/** Poster name: the text between `<|im_pseudo|>` and `<|end_pseudo|>`. */
const userRegex = /<\|im_pseudo\|>([^<]+)<\|end_pseudo\|>/;

/**
 * Post date: the raw text between `<|im_date|>` and `<|end_date|>`.
 * Nothing is stripped here; any prefix written by the tokenizer is part of
 * the capture.
 */
const dateRegex = /<\|im_date\|>([^<]+)<\|end_date\|>/;

/**
 * Post body: everything after `<|begin_of_post|>` up to the end of the input,
 * excluding one optional trailing `<|end_of_post|>`.
 * The capture is lazy so that the optional terminator is actually consumed by
 * the non-capturing group instead of being swallowed into the body, and it
 * may be empty so that a post with no content still matches.
 */
const contentRegex = /<\|begin_of_post\|>([\s\S]*?)(?:<\|end_of_post\|>)?$/;
|
|
|
/**
 * Rebuilds a {@link Topic} from its tokenized text form.
 *
 * The input is split on `<|end_of_post|>` and the final segment is discarded:
 * it is whatever follows the last marker (empty for a complete stream, or an
 * unterminated post otherwise). As a consequence, input containing no
 * `<|end_of_post|>` marker at all yields a topic with an empty title and no
 * posts.
 *
 * @param tokens - Tokenized topic text.
 * @returns A topic with a freshly generated id, the parsed title and the
 *          parsed posts in input order.
 * @throws Error if the segment that should carry the topic header does not
 *         contain a `Sujet : "…"<|eot_id|>` title. Any error raised by
 *         `tokensToPosts` while parsing a segment propagates unchanged.
 */
export function tokensToTopic(tokens: string): Topic {
  const topic: Topic = {
    id: generateUUID(),
    title: "",
    posts: [],
  };

  const completeSegments = tokens.split("<|end_of_post|>").slice(0, -1);

  for (const segment of completeSegments) {
    // The title header precedes the first post, so it is only looked up while
    // no post has been parsed yet.
    if (topic.posts.length < 1) {
      const title = segment.match(titleRegex)?.[1];
      if (title === undefined) {
        throw new Error(
          'Malformed topic tokens: no `Sujet : "..."<|eot_id|>` title found before the first post',
        );
      }
      topic.title = title;
    }

    topic.posts.push(...tokensToPosts(segment));
  }

  return topic;
}
|
|
|
/**
 * Parses every post found in a tokenized string.
 *
 * The input is split on `<|end_of_post|>`; each non-blank segment must contain
 * a pseudo block, a date block and a `<|begin_of_post|>` body. Blank segments
 * (empty or whitespace only, e.g. the remainder after the final marker) are
 * skipped.
 *
 * @param tokens - Tokenized text holding zero or more posts.
 * @returns The parsed posts in input order. The captured date text is passed
 *          through `frenchToIso8601` before being stored.
 * @throws Error if a non-blank segment is missing its pseudo, date or body.
 */
export function tokensToPosts(tokens: string): Post[] {
  const posts: Post[] = [];

  for (const segment of tokens.split("<|end_of_post|>")) {
    // Nothing to parse: typically the tail after the last end-of-post marker.
    if (segment.trim().length === 0) {
      continue;
    }

    const user = segment.match(userRegex)?.[1];
    if (user === undefined) {
      throw new Error("Malformed post tokens: missing <|im_pseudo|>...<|end_pseudo|> block");
    }

    const rawDate = segment.match(dateRegex)?.[1];
    if (rawDate === undefined) {
      throw new Error("Malformed post tokens: missing <|im_date|>...<|end_date|> block");
    }

    const content = segment.match(contentRegex)?.[1];
    if (content === undefined) {
      throw new Error("Malformed post tokens: missing <|begin_of_post|> body");
    }

    posts.push({
      user,
      date: frenchToIso8601(rawDate),
      content,
    });
  }

  return posts;
}
|
|
|
|
|
/**
 * Serializes a topic into its tokenized text form: a subject header carrying
 * the quoted title, immediately followed by every post as rendered by
 * `tokenizePost`.
 *
 * The user of the first post is handed to `tokenizePost` as the topic's
 * original poster.
 *
 * @param topic - Topic to serialize; must contain at least one post.
 * @returns The header and all tokenized posts concatenated into one string.
 * @throws Error if the topic has no posts.
 */
export function tokenizeTopic(topic: Topic): string {
  const { posts, title } = topic;

  if (posts.length === 0) {
    throw new Error("Topic must have at least one post");
  }

  const originalPoster = posts[0].user;
  const body = posts.map((post) => tokenizePost(post, originalPoster)).join("");

  const header = `<|start_header_id|><|sujet|><|end_header_id|>\n\nSujet : "${title}"`;

  return header + body;
}
|
|
|
/**
 * Serializes a single post into its tokenized text form.
 *
 * The header role is `autheur` when the post's user equals `poster`, and
 * `khey` otherwise. The date is rendered through `iso8601ToFrench` and
 * prefixed with "Le ".
 *
 * @param post - Post to serialize.
 * @param poster - User name compared against `post.user` to pick the role.
 * @returns The tokenized post, starting with `<|eot_id|>` and ending with
 *          `<|end_of_post|>`.
 */
export function tokenizePost(post: Post, poster: string): string {
  const role = post.user === poster ? "autheur" : "khey";

  // Each section carries its own trailing newline; joining the sections with
  // "\n" then produces the blank line that separates them.
  const header = `<|eot_id|><|start_header_id|><|${role}|>\n<|end_header_id|>\n`;
  const metadata =
    `<|im_pseudo|>${post.user}<|end_pseudo|>\n` +
    `<|im_date|>Le ${iso8601ToFrench(post.date)}<|end_date|>\n`;
  const body = `<|begin_of_post|>${post.content}<|end_of_post|>`;

  return [header, metadata, body].join("\n");
}