vilarin committed on
Commit 55feb21 · verified · 1 Parent(s): cdebb6d

Delete process.py

Files changed (1)
  1. process.py +0 -213
process.py DELETED
@@ -1,213 +0,0 @@
- import gradio as gr
- from simplemma import simple_tokenizer
- from difflib import Differ
- from icecream import ic
- from app.webui.patch import model_load, num_tokens_in_string, one_chunk_initial_translation, one_chunk_reflect_on_translation, one_chunk_improve_translation
- from app.webui.patch import calculate_chunk_size, multichunk_initial_translation, multichunk_reflect_on_translation, multichunk_improve_translation
-
- from llama_index.core.node_parser import SentenceSplitter
-
- def tokenize(text):
-     # Use simplemma to tokenize the text
-     words = simple_tokenizer(text)
-     # Check if the text contains spaces
-     if ' ' in text:
-         # Create a list of words and spaces
-         tokens = []
-         for word in words:
-             tokens.append(word)
-             if not word.startswith("'") and not word.endswith("'"):  # Avoid adding a space around apostrophes
-                 tokens.append(' ')  # Add a space after each word
-         return tokens[:-1]  # Remove the trailing space
-     else:
-         return words
-
- def diff_texts(text1, text2):
-     tokens1 = tokenize(text1)
-     tokens2 = tokenize(text2)
-
-     d = Differ()
-     diff_result = list(d.compare(tokens1, tokens2))
-
-     highlighted_text = []
-     for token in diff_result:
-         word = token[2:]
-         category = None
-         if token[0] == '+':
-             category = 'added'
-         elif token[0] == '-':
-             category = 'removed'
-         elif token[0] == '?':
-             continue  # Skip Differ's hint lines
-
-         highlighted_text.append((word, category))
-
-     return highlighted_text
-
- # modified from src.translation-agent.utils.translate
- def translator(
-     source_lang: str,
-     target_lang: str,
-     source_text: str,
-     country: str,
-     max_tokens: int = 1000,
- ):
-     """Translate the source_text from source_lang to target_lang."""
-     num_tokens_in_text = num_tokens_in_string(source_text)
-
-     ic(num_tokens_in_text)
-
-     if num_tokens_in_text < max_tokens:
-         ic("Translating text as single chunk")
-
-         # Note: use `yield from B()` if B() itself contains a yield
-         init_translation = one_chunk_initial_translation(
-             source_lang, target_lang, source_text
-         )
-
-         reflection = one_chunk_reflect_on_translation(
-             source_lang, target_lang, source_text, init_translation, country
-         )
-
-         final_translation = one_chunk_improve_translation(
-             source_lang, target_lang, source_text, init_translation, reflection
-         )
-
-         return init_translation, reflection, final_translation
-
-     else:
-         ic("Translating text as multiple chunks")
-
-         token_size = calculate_chunk_size(
-             token_count=num_tokens_in_text, token_limit=max_tokens
-         )
-
-         ic(token_size)
-
-         # Split on sentence boundaries
-         text_parser = SentenceSplitter(
-             chunk_size=token_size,
-         )
-
-         source_text_chunks = text_parser.split_text(source_text)
-
-         translation_1_chunks = multichunk_initial_translation(
-             source_lang, target_lang, source_text_chunks
-         )
-
-         init_translation = "".join(translation_1_chunks)
-
-         reflection_chunks = multichunk_reflect_on_translation(
-             source_lang,
-             target_lang,
-             source_text_chunks,
-             translation_1_chunks,
-             country,
-         )
-
-         reflection = "".join(reflection_chunks)
-
-         translation_2_chunks = multichunk_improve_translation(
-             source_lang,
-             target_lang,
-             source_text_chunks,
-             translation_1_chunks,
-             reflection_chunks,
-         )
-
-         final_translation = "".join(translation_2_chunks)
-
-         return init_translation, reflection, final_translation
-
-
- def translator_sec(
-     endpoint2: str,
-     model2: str,
-     api_key2: str,
-     context_window: int,
-     num_output: int,
-     source_lang: str,
-     target_lang: str,
-     source_text: str,
-     country: str,
-     max_tokens: int = 1000,
- ):
-     """Translate the source_text from source_lang to target_lang."""
-     num_tokens_in_text = num_tokens_in_string(source_text)
-
-     ic(num_tokens_in_text)
-
-     if num_tokens_in_text < max_tokens:
-         ic("Translating text as single chunk")
-
-         # Note: use `yield from B()` if B() itself contains a yield
-         init_translation = one_chunk_initial_translation(
-             source_lang, target_lang, source_text
-         )
-
-         try:
-             model_load(endpoint2, model2, api_key2, context_window, num_output)
-         except Exception as e:
-             raise gr.Error(f"An unexpected error occurred: {e}")
-
-         reflection = one_chunk_reflect_on_translation(
-             source_lang, target_lang, source_text, init_translation, country
-         )
-
-         final_translation = one_chunk_improve_translation(
-             source_lang, target_lang, source_text, init_translation, reflection
-         )
-
-         return init_translation, reflection, final_translation
-
-     else:
-         ic("Translating text as multiple chunks")
-
-         token_size = calculate_chunk_size(
-             token_count=num_tokens_in_text, token_limit=max_tokens
-         )
-
-         ic(token_size)
-
-         # Split on sentence boundaries
-         text_parser = SentenceSplitter(
-             chunk_size=token_size,
-         )
-
-         source_text_chunks = text_parser.split_text(source_text)
-
-         translation_1_chunks = multichunk_initial_translation(
-             source_lang, target_lang, source_text_chunks
-         )
-
-         init_translation = "".join(translation_1_chunks)
-
-         try:
-             model_load(endpoint2, model2, api_key2, context_window, num_output)
-         except Exception as e:
-             raise gr.Error(f"An unexpected error occurred: {e}")
-
-         reflection_chunks = multichunk_reflect_on_translation(
-             source_lang,
-             target_lang,
-             source_text_chunks,
-             translation_1_chunks,
-             country,
-         )
-
-         reflection = "".join(reflection_chunks)
-
-         translation_2_chunks = multichunk_improve_translation(
-             source_lang,
-             target_lang,
-             source_text_chunks,
-             translation_1_chunks,
-             reflection_chunks,
-         )
-
-         final_translation = "".join(translation_2_chunks)
-
-         return init_translation, reflection, final_translation
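
For reference, the deleted `diff_texts` returns a list of `(token, category)` tuples, which is the input shape gradio's `HighlightedText` component accepts. A minimal sketch of how it could be wired into a UI follows; the import path, labels, and color map are assumptions for illustration, not taken from this commit:

```python
# Sketch only: assumes diff_texts is importable from the (now deleted) process.py.
import gradio as gr

from process import diff_texts  # hypothetical import path

with gr.Blocks() as demo:
    original = gr.Textbox(label="Original text")
    revised = gr.Textbox(label="Revised text")
    diff_view = gr.HighlightedText(
        label="Diff",
        combine_adjacent=True,  # merge consecutive tokens sharing a category
        color_map={"added": "green", "removed": "red"},  # assumed palette
    )
    gr.Button("Compare").click(diff_texts, inputs=[original, revised], outputs=diff_view)

demo.launch()
```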