NebulasBellum committed on
Commit
fb6baf2
·
verified ·
1 Parent(s): 6ae10fd

Update the Space's app file to summarise Russian text

Browse files
Files changed (1) hide show
  1. app.py +24 -77
app.py CHANGED
@@ -1,79 +1,26 @@
1
  import gradio as gr
2
- import tensorflow as tf
3
- import copy
4
- import numpy as np
5
-
6
- def generate_from_saved(seed_text):
7
- # add the start generation of the lukashenko speech from the simple seed
8
- # seed_text = 'я не глядя поддержу'
9
- weights_path = 'results/weights_lukash.h5'
10
- model_path = 'results/Lukashenko_tarakan'
11
-
12
- model = tf.keras.models.load_model(model_path)
13
- model.load_weights(weights_path)
14
- # Show the Model summary
15
- model.summary()
16
-
17
- with open('data/source_text_lukash.txt', 'r') as source_text_file:
18
- data = source_text_file.read().splitlines()
19
-
20
- tmp_data = copy.deepcopy(data)
21
- sent_length = 0
22
- for idx, line in enumerate(data):
23
- if len(line) < 5:
24
- tmp_data.pop(idx)
25
- else:
26
- sent_length += len(line.split())
27
- data = tmp_data
28
- lstm_length = int(sent_length / len(data))
29
-
30
- token = tf.keras.preprocessing.text.Tokenizer()
31
- token.fit_on_texts(data)
32
- encoded_text = token.texts_to_sequences(data)
33
- # Vocabular size
34
- vocab_size = len(token.word_counts) + 1
35
-
36
- datalist = []
37
- for d in encoded_text:
38
- if len(d) > 1:
39
- for i in range(2, len(d)):
40
- datalist.append(d[:i])
41
-
42
- max_length = 20
43
- sequences = tf.keras.preprocessing.sequence.pad_sequences(datalist, maxlen=max_length, padding='pre')
44
-
45
- # X - input data, y - target data
46
- X = sequences[:, :-1]
47
- y = sequences[:, -1]
48
-
49
- y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
50
- seq_length = X.shape[1]
51
- print(f"Sequence length: {seq_length}")
52
-
53
- generated_text = ''
54
- number_lines = 3
55
- for i in range(number_lines):
56
- text_word_list = []
57
- for _ in range(lstm_length * 2):
58
- encoded = token.texts_to_sequences([seed_text])
59
- encoded = tf.keras.preprocessing.sequence.pad_sequences(encoded, maxlen=seq_length, padding='pre')
60
-
61
- y_pred = np.argmax(model.predict(encoded), axis=-1)
62
-
63
- predicted_word = ""
64
- for word, index in token.word_index.items():
65
- if index == y_pred:
66
- predicted_word = word
67
- break
68
-
69
- seed_text = seed_text + ' ' + predicted_word
70
- text_word_list.append(predicted_word)
71
-
72
- seed_text = text_word_list [-1]
73
- generated_text = ' '.join(text_word_list)
74
- generated_text += '\n'
75
-
76
- return generated_text
77
-
78
- demo = gr.Interface(fn=generate_from_saved, inputs="text", outputs="text")
79
  demo.launch(share=True)
 
1
  import gradio as gr
2
+ import torch
3
+ from transformers import GPT2Tokenizer, T5ForConditionalGeneration
4
+
5
+
6
# Load the tokenizer and the summarisation model once, at import time, so
# that every request served by the interface reuses the same objects.
# The end-of-sequence token is set explicitly to '</s>'.
tokenizer = GPT2Tokenizer.from_pretrained('RussianNLP/FRED-T5-Summarizer', eos_token='</s>')
model = T5ForConditionalGeneration.from_pretrained('RussianNLP/FRED-T5-Summarizer')

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# app still starts on CPU-only hardware instead of failing in model.to().
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Instruction prefix prepended to every user text before it is encoded
# (Russian for "Shorten the text.").
input_text = "<LM> Сократи текст.\n "
12
+
13
def make_summarization(user_text):
    """Summarise ``user_text`` with the module-level summarisation model.

    The text is prefixed with the module-level ``input_text`` instruction,
    encoded with ``tokenizer``, moved to ``device`` and passed to
    ``model.generate``.

    Args:
        user_text: The text to summarise, as received from the Gradio
            text input.

    Returns:
        The decoded summary with special tokens (such as ``</s>``) removed,
        or an empty string when ``user_text`` is empty or whitespace only.
    """
    # Nothing to summarise: skip the model call instead of forcing the model
    # to produce at least ``min_new_tokens`` tokens from the bare prompt.
    if not user_text or not user_text.strip():
        return ""

    processing_text = input_text + user_text
    input_ids = torch.tensor([tokenizer.encode(processing_text)]).to(device)
    outputs = model.generate(
        input_ids,
        eos_token_id=tokenizer.eos_token_id,
        num_beams=3,
        min_new_tokens=17,
        max_new_tokens=200,
        do_sample=True,
        no_repeat_ngram_size=4,
        top_p=0.9,
    )
    # The first generated id is dropped, as before (presumably the decoder
    # start token -- confirm against the model configuration).
    # skip_special_tokens keeps the '</s>' end marker out of the text shown
    # to the user.
    return tokenizer.decode(outputs[0][1:], skip_special_tokens=True)
24
+
25
# Plain text-in / text-out Gradio interface around make_summarization.
demo = gr.Interface(
    fn=make_summarization,
    inputs="text",
    outputs="text",
)

# Start the web UI; share=True additionally requests a public share link.
demo.launch(share=True)