File size: 2,435 Bytes
c8ab7ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import tensorflow as tf
import copy
import numpy as np


def generate_from_saved():
    """Generate pseudo-Lukashenko speech from a saved Keras model.

    Loads a saved model + weights from ``results/``, rebuilds the tokenizer
    from the training corpus (``data/source_text_lukash.txt``), then greedily
    predicts one word at a time starting from a fixed seed phrase.

    Returns:
        str: the generated text, one generated line per ``\\n``-terminated row.

    Raises:
        OSError: if the model, weights, or corpus files are missing.
    """
    # Seed phrase that starts the generation ("I will support without looking").
    seed_text = 'я не глядя поддержу'
    weights_path = 'results/weights_lukash.h5'
    model_path = 'results/Lukashenko_tarakan'

    model = tf.keras.models.load_model(model_path)
    model.load_weights(weights_path)
    # Show the model summary for inspection.
    model.summary()

    # Explicit UTF-8: the corpus is Cyrillic text and must not depend on the
    # platform's default encoding.
    with open('data/source_text_lukash.txt', 'r', encoding='utf-8') as source_text_file:
        data = source_text_file.read().splitlines()

    # BUG FIX: the original popped from a deep copy by the *iteration* index,
    # which drifts out of sync after the first removal and deletes the wrong
    # lines. A simple filter expresses the intent (drop lines shorter than
    # 5 characters) correctly.
    data = [line for line in data if len(line) >= 5]
    # Average words per kept line — used as the per-row generation budget.
    sent_length = sum(len(line.split()) for line in data)
    lstm_length = int(sent_length / len(data))

    token = tf.keras.preprocessing.text.Tokenizer()
    token.fit_on_texts(data)
    encoded_text = token.texts_to_sequences(data)
    # Vocabulary size (+1 because Keras word indices start at 1).
    vocab_size = len(token.word_counts) + 1

    # Build growing prefixes of each encoded line (training-style n-grams);
    # only needed here to recover the padded sequence length.
    datalist = []
    for d in encoded_text:
        if len(d) > 1:
            for i in range(2, len(d)):
                datalist.append(d[:i])

    max_length = 20
    sequences = tf.keras.preprocessing.sequence.pad_sequences(datalist, maxlen=max_length, padding='pre')

    # Only the model's expected input length is needed at inference time; the
    # original also built a one-hot target `y` (training leftover) — removed.
    seq_length = sequences[:, :-1].shape[1]
    print(f"Sequence length: {seq_length}")

    generated_text = ''
    number_lines = 3
    for _ in range(number_lines):
        text_word_list = []
        # Generate twice the average line length in words per output row.
        for _ in range(lstm_length * 2):
            encoded = token.texts_to_sequences([seed_text])
            encoded = tf.keras.preprocessing.sequence.pad_sequences(encoded, maxlen=seq_length, padding='pre')

            # Greedy decoding: take the most probable next word.
            y_pred = np.argmax(model.predict(encoded), axis=-1)

            predicted_word = ""
            for word, index in token.word_index.items():
                if index == y_pred:
                    predicted_word = word
                    break

            seed_text = seed_text + ' ' + predicted_word
            text_word_list.append(predicted_word)

        # Re-seed the next row from the last generated word.
        seed_text = text_word_list[-1]
        # BUG FIX: the original assigned (overwrote) here, so only the final
        # row survived the loop; accumulate all rows instead.
        generated_text += ' '.join(text_word_list) + '\n'

    print(f"Lukashenko are saying: {generated_text}")
    return generated_text