vilarin committed on
Commit 537af88 · verified · 1 Parent(s): 66cd578

Update app/webui/process.py

Files changed (1)
  1. app/webui/process.py +249 -226
app/webui/process.py CHANGED
@@ -1,227 +1,250 @@
-import gradio as gr
-from simplemma import simple_tokenizer
-from difflib import Differ
-from icecream import ic
-from app.webui.patch import model_load,num_tokens_in_string,one_chunk_initial_translation, one_chunk_reflect_on_translation, one_chunk_improve_translation
-from app.webui.patch import calculate_chunk_size, multichunk_initial_translation, multichunk_reflect_on_translation, multichunk_improve_translation
-
-from llama_index.core.node_parser import SentenceSplitter
-
-progress=gr.Progress()
-
-def tokenize(text):
-    # Use nltk to tokenize the text
-    words = simple_tokenizer(text)
-    # Check if the text contains spaces
-    if ' ' in text:
-        # Create a list of words and spaces
-        tokens = []
-        for word in words:
-            tokens.append(word)
-            if not word.startswith("'") and not word.endswith("'"): # Avoid adding space after punctuation
-                tokens.append(' ') # Add space after each word
-        return tokens[:-1] # Remove the last space
-    else:
-        return words
-
-def diff_texts(text1, text2):
-    tokens1 = tokenize(text1)
-    tokens2 = tokenize(text2)
-
-    d = Differ()
-    diff_result = list(d.compare(tokens1, tokens2))
-
-    highlighted_text = []
-    for token in diff_result:
-        word = token[2:]
-        category = None
-        if token[0] == '+':
-            category = 'added'
-        elif token[0] == '-':
-            category = 'removed'
-        elif token[0] == '?':
-            continue # Ignore the hints line
-
-        highlighted_text.append((word, category))
-
-    return highlighted_text
-
-#modified from src.translaation-agent.utils.tranlsate
-def translator(
-    source_lang: str,
-    target_lang: str,
-    source_text: str,
-    country: str,
-    max_tokens:int = 1000,
-):
-    """Translate the source_text from source_lang to target_lang."""
-    num_tokens_in_text = num_tokens_in_string(source_text)
-
-    ic(num_tokens_in_text)
-
-    if num_tokens_in_text < max_tokens:
-        ic("Translating text as single chunk")
-
-        progress((1,3), desc="First translation...")
-        init_translation = one_chunk_initial_translation(
-            source_lang, target_lang, source_text
-        )
-
-        progress((2,3), desc="Reflecton...")
-        reflection = one_chunk_reflect_on_translation(
-            source_lang, target_lang, source_text, init_translation, country
-        )
-
-        progress((3,3), desc="Second translation...")
-        final_translation = one_chunk_improve_translation(
-            source_lang, target_lang, source_text, init_translation, reflection
-        )
-
-        return init_translation, reflection, final_translation
-
-    else:
-        ic("Translating text as multiple chunks")
-
-        progress((1,5), desc="Calculate chunk size...")
-        token_size = calculate_chunk_size(
-            token_count=num_tokens_in_text, token_limit=max_tokens
-        )
-
-        ic(token_size)
-
-        #using sentence splitter
-        text_parser = SentenceSplitter(
-            chunk_size=token_size,
-        )
-
-        progress((2,5), desc="Spilt source text...")
-        source_text_chunks = text_parser.split_text(source_text)
-
-        progress((3,5), desc="First translation...")
-        translation_1_chunks = multichunk_initial_translation(
-            source_lang, target_lang, source_text_chunks
-        )
-
-        init_translation = "".join(translation_1_chunks)
-
-        progress((4,5), desc="Reflection...")
-        reflection_chunks = multichunk_reflect_on_translation(
-            source_lang,
-            target_lang,
-            source_text_chunks,
-            translation_1_chunks,
-            country,
-        )
-
-        reflection = "".join(reflection_chunks)
-
-        progress((5,5), desc="Second translation...")
-        translation_2_chunks = multichunk_improve_translation(
-            source_lang,
-            target_lang,
-            source_text_chunks,
-            translation_1_chunks,
-            reflection_chunks,
-        )
-
-        final_translation = "".join(translation_2_chunks)
-
-        return init_translation, reflection, final_translation
-
-
-def translator_sec(
-    endpoint2: str,
-    model2: str,
-    api_key2: str,
-    context_window: int,
-    num_output: int,
-    source_lang: str,
-    target_lang: str,
-    source_text: str,
-    country: str,
-    max_tokens: int = 1000,
-):
-
-    """Translate the source_text from source_lang to target_lang."""
-    num_tokens_in_text = num_tokens_in_string(source_text)
-
-    ic(num_tokens_in_text)
-
-    if num_tokens_in_text < max_tokens:
-        ic("Translating text as single chunk")
-
-        progress((1,3), desc="First translation...")
-        init_translation = one_chunk_initial_translation(
-            source_lang, target_lang, source_text
-        )
-
-        try:
-            model_load(endpoint2, model2, api_key2, context_window, num_output)
-        except Exception as e:
-            raise gr.Error(f"An unexpected error occurred: {e}")
-
-        progress((2,3), desc="Reflecton...")
-        reflection = one_chunk_reflect_on_translation(
-            source_lang, target_lang, source_text, init_translation, country
-        )
-
-        progress((3,3), desc="Second translation...")
-        final_translation = one_chunk_improve_translation(
-            source_lang, target_lang, source_text, init_translation, reflection
-        )
-
-        return init_translation, reflection, final_translation
-
-    else:
-        ic("Translating text as multiple chunks")
-
-        progress((1,5), desc="Calculate chunk size...")
-        token_size = calculate_chunk_size(
-            token_count=num_tokens_in_text, token_limit=max_tokens
-        )
-
-        ic(token_size)
-
-        #using sentence splitter
-        text_parser = SentenceSplitter(
-            chunk_size=token_size,
-        )
-
-        progress((2,5), desc="Spilt source text...")
-        source_text_chunks = text_parser.split_text(source_text)
-
-        progress((3,5), desc="First translation...")
-        translation_1_chunks = multichunk_initial_translation(
-            source_lang, target_lang, source_text_chunks
-        )
-
-        init_translation = "".join(translation_1_chunks)
-
-        try:
-            model_load(endpoint2, model2, api_key2, context_window, num_output)
-        except Exception as e:
-            raise gr.Error(f"An unexpected error occurred: {e}")
-
-        progress((4,5), desc="Reflection...")
-        reflection_chunks = multichunk_reflect_on_translation(
-            source_lang,
-            target_lang,
-            source_text_chunks,
-            translation_1_chunks,
-            country,
-        )
-
-        reflection = "".join(reflection_chunks)
-
-        progress((5,5), desc="Second translation...")
-        translation_2_chunks = multichunk_improve_translation(
-            source_lang,
-            target_lang,
-            source_text_chunks,
-            translation_1_chunks,
-            reflection_chunks,
-        )
-
-        final_translation = "".join(translation_2_chunks)
-
+import gradio as gr
+from simplemma import simple_tokenizer
+from difflib import Differ
+from icecream import ic
+from app.webui.patch import model_load, num_tokens_in_string, one_chunk_initial_translation, one_chunk_reflect_on_translation, one_chunk_improve_translation
+from app.webui.patch import calculate_chunk_size, multichunk_initial_translation, multichunk_reflect_on_translation, multichunk_improve_translation
+
+from llama_index.core.node_parser import SentenceSplitter
+from translatepy import Language  # needed for the Language(target_lang) lookups below
+from translatepy.exceptions import UnknownLanguage
+from translatepy.translators.google import GoogleTranslate
+gtranslate = GoogleTranslate()
+
+progress = gr.Progress()
+
+def tokenize(text):
+    # Tokenize the text with simplemma's simple_tokenizer
+    words = simple_tokenizer(text)
+    # Check if the text contains spaces
+    if ' ' in text:
+        # Create a list of words and spaces
+        tokens = []
+        for word in words:
+            tokens.append(word)
+            if not word.startswith("'") and not word.endswith("'"):  # Avoid adding space after punctuation
+                tokens.append(' ')  # Add space after each word
+        return tokens[:-1]  # Remove the last space
+    else:
+        return words
+
+def diff_texts(text1, text2):
+    tokens1 = tokenize(text1)
+    tokens2 = tokenize(text2)
+
+    d = Differ()
+    diff_result = list(d.compare(tokens1, tokens2))
+
+    highlighted_text = []
+    for token in diff_result:
+        word = token[2:]
+        category = None
+        if token[0] == '+':
+            category = 'added'
+        elif token[0] == '-':
+            category = 'removed'
+        elif token[0] == '?':
+            continue  # Ignore the hints line
+
+        highlighted_text.append((word, category))
+
+    return highlighted_text
+
+# modified from src.translation-agent.utils.translate
+def translator(
+    source_lang: str,
+    target_lang: str,
+    source_text: str,
+    country: str,
+    max_tokens: int = 1000,
+):
+    """Translate the source_text from source_lang to target_lang."""
+    num_tokens_in_text = num_tokens_in_string(source_text)
+
+    ic(num_tokens_in_text)
+
+    if num_tokens_in_text < max_tokens:
+        ic("Translating text as single chunk")
+
+        progress((1,3), desc="First translation...")
+        init_translation = one_chunk_initial_translation(
+            source_lang, target_lang, source_text
+        )
+
+        progress((2,3), desc="Reflection...")
+        reflection = one_chunk_reflect_on_translation(
+            source_lang, target_lang, source_text, init_translation, country
+        )
+
+        progress((3,3), desc="Second translation...")
+        final_translation = one_chunk_improve_translation(
+            source_lang, target_lang, source_text, init_translation, reflection
+        )
+
+        return init_translation, reflection, final_translation
+
+    else:
+        ic("Translating text as multiple chunks")
+
+        progress((1,5), desc="Calculate chunk size...")
+        token_size = calculate_chunk_size(
+            token_count=num_tokens_in_text, token_limit=max_tokens
+        )
+
+        ic(token_size)
+
+        # split on sentence boundaries
+        text_parser = SentenceSplitter(
+            chunk_size=token_size,
+        )
+
+        progress((2,5), desc="Split source text...")
+        source_text_chunks = text_parser.split_text(source_text)
+
+        progress((3,5), desc="First translation...")
+        translation_1_chunks = multichunk_initial_translation(
+            source_lang, target_lang, source_text_chunks
+        )
+
+        init_translation = "".join(translation_1_chunks)
+
+        progress((4,5), desc="Reflection...")
+        reflection_chunks = multichunk_reflect_on_translation(
+            source_lang,
+            target_lang,
+            source_text_chunks,
+            translation_1_chunks,
+            country,
+        )
+
+        reflection = "".join(reflection_chunks)
+
+        progress((5,5), desc="Second translation...")
+        translation_2_chunks = multichunk_improve_translation(
+            source_lang,
+            target_lang,
+            source_text_chunks,
+            translation_1_chunks,
+            reflection_chunks,
+        )
+
+        final_translation = "".join(translation_2_chunks)
+
+        return init_translation, reflection, final_translation
+
+
+def translator_sec(
+    endpoint2: str,
+    model2: str,
+    api_key2: str,
+    context_window: int,
+    num_output: int,
+    source_lang: str,
+    target_lang: str,
+    source_text: str,
+    country: str,
+    max_tokens: int = 1000,
+    gt: bool = False,
+):
+
+    """Translate the source_text from source_lang to target_lang."""
+    num_tokens_in_text = num_tokens_in_string(source_text)
+
+    ic(num_tokens_in_text)
+
+    if num_tokens_in_text < max_tokens:
+        ic("Translating text as single chunk")
+
+        progress((1,3), desc="First translation...")
+        if gt:
+            try:
+                language = Language(target_lang)
+            except UnknownLanguage as e:
+                raise gr.Error(f"An unexpected error occurred: {e}")
+            # translatepy returns a TranslationResult; keep the plain string
+            init_translation = gtranslate.translate(source_text, language).result
+        else:
+            init_translation = one_chunk_initial_translation(
+                source_lang, target_lang, source_text
+            )
+        try:
+            model_load(endpoint2, model2, api_key2, context_window, num_output)
+        except Exception as e:
+            raise gr.Error(f"An unexpected error occurred: {e}")
+
+        progress((2,3), desc="Reflection...")
+        reflection = one_chunk_reflect_on_translation(
+            source_lang, target_lang, source_text, init_translation, country
+        )
+
+        progress((3,3), desc="Second translation...")
+        final_translation = one_chunk_improve_translation(
+            source_lang, target_lang, source_text, init_translation, reflection
+        )
+
+        return init_translation, reflection, final_translation
+
+    else:
+        ic("Translating text as multiple chunks")
+
+        progress((1,5), desc="Calculate chunk size...")
+        token_size = calculate_chunk_size(
+            token_count=num_tokens_in_text, token_limit=max_tokens
+        )
+
+        ic(token_size)
+
+        # split on sentence boundaries
+        text_parser = SentenceSplitter(
+            chunk_size=token_size,
+        )
+
+        progress((2,5), desc="Split source text...")
+        source_text_chunks = text_parser.split_text(source_text)
+
+        progress((3,5), desc="First translation...")
+        if gt:
+            try:
+                language = Language(target_lang)
+            except UnknownLanguage as e:
+                raise gr.Error(f"An unexpected error occurred: {e}")
+            # translate() takes a single string, so translate chunk by chunk
+            translation_1_chunks = [
+                gtranslate.translate(chunk, language).result
+                for chunk in source_text_chunks
+            ]
+        else:
+            translation_1_chunks = multichunk_initial_translation(
+                source_lang, target_lang, source_text_chunks
+            )
+
+        init_translation = "".join(translation_1_chunks)
+
+        try:
+            model_load(endpoint2, model2, api_key2, context_window, num_output)
+        except Exception as e:
+            raise gr.Error(f"An unexpected error occurred: {e}")
+
+        progress((4,5), desc="Reflection...")
+        reflection_chunks = multichunk_reflect_on_translation(
+            source_lang,
+            target_lang,
+            source_text_chunks,
+            translation_1_chunks,
+            country,
+        )
+
+        reflection = "".join(reflection_chunks)
+
+        progress((5,5), desc="Second translation...")
+        translation_2_chunks = multichunk_improve_translation(
+            source_lang,
+            target_lang,
+            source_text_chunks,
+            translation_1_chunks,
+            reflection_chunks,
+        )
+
+        final_translation = "".join(translation_2_chunks)
+
+        return init_translation, reflection, final_translation
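
For reference, a minimal sketch of how the new gt flag might be driven. The endpoint, model, and key values below are placeholders, not part of this commit; with gt=True the first draft comes from Google Translate via translatepy, while the reflection and improvement passes still run on the model configured through model_load.

# Hypothetical invocation of the updated translator_sec; endpoint2, model2,
# and api_key2 are placeholder values. With gt=True the initial translation
# is produced by Google Translate instead of the second LLM.
init, reflection, final = translator_sec(
    endpoint2="OpenAI",        # placeholder endpoint name
    model2="gpt-4o",           # placeholder model id
    api_key2="YOUR_API_KEY",   # placeholder key
    context_window=8192,
    num_output=1024,
    source_lang="English",
    target_lang="Spanish",
    source_text="The quick brown fox jumps over the lazy dog.",
    country="Mexico",
    gt=True,                   # seed the first draft with Google Translate
)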
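
The diff_texts helper, unchanged by this commit, yields (token, category) tuples that can be fed straight into Gradio's HighlightedText component. A small sketch using the standard gr.HighlightedText parameters:

# Render the word-level diff between two translations; the color_map keys
# match the 'added'/'removed' categories emitted by diff_texts.
diff_view = gr.HighlightedText(
    value=diff_texts("The cat sat.", "The black cat sat down."),
    combine_adjacent=True,
    color_map={"added": "green", "removed": "red"},
)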