import {Post, Topic} from "./topics";
import {iso8601ToFrench, frenchToIso8601} from "./dates";
import {generateUUID} from "./uuids";

// Regexes for extracting the title, pseudo, date, and body from a post's tokens.
const titleRegex = /Sujet\s+:\s+"(.+?)"?<\|eot_id\|>/;
const userRegex = /<\|im_pseudo\|>([^<]+)<\|end_pseudo\|>/;
const dateRegex = /<\|im_date\|>([^<]+)<\|end_date\|>/;
// Lazy match so a trailing <|end_of_post|>, if present, is excluded from the body.
const contentRegex = /<\|begin_of_post\|>([\s\S]+?)(?:<\|end_of_post\|>)?$/;
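
// For reference, a single serialized post, as produced by tokenizePost below,
// looks like this (example pseudo/date/body values; the exact date string
// comes from ./dates):
//
//   <|eot_id|><|start_header_id|><|autheur|>
//   <|end_header_id|>
//
//   <|im_pseudo|>SomeKhey<|end_pseudo|>
//   <|im_date|>Le 01 janvier 2023 à 12:00:00<|end_date|>
//
//   <|begin_of_post|>Post body…<|end_of_post|>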

export function tokensToTopic(tokens: string): Topic {
    const topic: Topic = {
        id: generateUUID(),
        title: "",
        posts: [],
    };

    // Split the token stream into posts. The last split element is whatever
    // trails the final <|end_of_post|> (empty or an incomplete post), so drop it.
    for (const postTokens of tokens.split("<|end_of_post|>").slice(0, -1)) {

        // The topic title only appears in the first post's header.
        if (topic.posts.length < 1) {
            const titleMatch = postTokens.match(titleRegex);
            if (titleMatch) {
                topic.title = titleMatch[1];
            }
        }

        topic.posts = topic.posts.concat(tokensToPosts(postTokens));
    }

    return topic;
}

export function tokensToPosts(tokens: string): Post[] {
    const posts: Post[] = [];

    for (const postTokens of tokens.split("<|end_of_post|>")) {
        // TODO: drop the last split element instead of skipping empties here,
        // because the last post can be incomplete.
        if (postTokens.length < 1) {
            continue;
        }

        const userMatch = postTokens.match(userRegex);
        const dateMatch = postTokens.match(dateRegex);
        const contentMatch = postTokens.match(contentRegex);

        // Skip malformed posts instead of crashing on a null match.
        if (!userMatch || !dateMatch || !contentMatch) {
            continue;
        }

        posts.push({
            user: userMatch[1],
            date: frenchToIso8601(dateMatch[1]),
            content: contentMatch[1],
        });
    }

    return posts;
}


export function tokenizeTopic(topic: Topic): string {
    if (topic.posts.length === 0) {
        throw new Error("Topic must have at least one post");
    }

    // The first post's author is the topic author; tokenizePost uses it to
    // choose between the <|autheur|> and <|khey|> headers.
    const tokenizedPosts = topic.posts
        .map(post => tokenizePost(post, topic.posts[0].user))
        .join("");

    const lines = [
        "<|start_header_id|><|sujet|><|end_header_id|>",
        "",
        `Sujet : "${topic.title}"`,
    ];

    return lines.join("\n") + tokenizedPosts;
}

export function tokenizePost(post: Post, poster: string): string {
    // Each post closes the previous block with <|eot_id|>, then opens a header:
    // <|autheur|> marks the topic author, <|khey|> any other poster.
    const lines = [
        `<|eot_id|><|start_header_id|><|${post.user === poster ? "autheur" : "khey"}|>`,
        "<|end_header_id|>",
        "",
        `<|im_pseudo|>${post.user}<|end_pseudo|>`,
        `<|im_date|>Le ${iso8601ToFrench(post.date)}<|end_date|>`,
        "",
        `<|begin_of_post|>${post.content}<|end_of_post|>`
    ];

    return lines.join("\n");
}
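
// Minimal round-trip sketch (hypothetical caller code, not part of this module):
//
//   const topic = tokensToTopic(generatedTokens); // parse model output
//   const prompt = tokenizeTopic(topic);          // serialize back to tokens
//
// where generatedTokens is assumed to be the raw string emitted by the model.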