akarshan11 committed on
Commit
b113724
·
verified ·
1 Parent(s): 476dd48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -86
app.py CHANGED
@@ -1,117 +1,266 @@
1
  import os
2
  import gradio as gr
3
  import torch
4
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 
 
 
 
 
 
 
 
 
5
 
6
- # First, let's create a simpler interface without complex schema handling
7
- # Define languages
8
- LANGUAGES = {
9
- "English": "en_XX",
10
- "Hindi": "hi_IN",
11
- "Bengali": "bn_IN",
12
- "Tamil": "ta_IN",
13
- "Telugu": "te_IN",
14
- "Malayalam": "ml_IN",
15
- "Urdu": "ur_PK"
16
  }
17
 
18
- # Initialize model and tokenizer
19
- model_name = "facebook/mbart-large-50-many-to-many-mmt"
20
- tokenizer = None
21
- model = None
22
 
23
- def load_model():
24
- global tokenizer, model
25
- if tokenizer is None:
26
- tokenizer = AutoTokenizer.from_pretrained(model_name)
27
- if model is None:
28
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
29
- if torch.cuda.is_available():
30
- model = model.to("cuda")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def translate_text(text, source_lang, target_lang):
33
- """Simple translation function"""
34
  if not text:
35
- return "Please enter some text to translate."
36
-
37
  try:
38
- load_model()
 
 
 
 
 
 
39
 
40
- # Get language codes
41
- src_lang = LANGUAGES.get(source_lang)
42
- tgt_lang = LANGUAGES.get(target_lang)
 
 
 
 
 
 
 
 
 
 
43
 
44
- # Set source language
45
- tokenizer.src_lang = src_lang
46
 
47
- # Tokenize
48
- inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
49
- if torch.cuda.is_available():
50
- inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
 
 
51
 
52
- # Generate translation
53
- with torch.no_grad():
54
- generated_tokens = model.generate(
55
- **inputs,
56
- forced_bos_token_id=tokenizer.lang_to_id[tgt_lang],
57
- max_length=512,
58
- num_beams=4,
59
- early_stopping=True
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- # Decode
63
- translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
64
- return translation
65
-
66
  except Exception as e:
67
- return f"Translation Error: {str(e)}"
68
 
69
  # Create Gradio interface
70
- with gr.Blocks(title="Simple Language Translator") as demo:
71
- gr.Markdown("# Simple Language Translator")
72
 
73
- with gr.Row():
74
- with gr.Column():
75
- input_text = gr.Textbox(
76
- label="Input Text",
77
- placeholder="Enter text to translate...",
78
- lines=5
79
- )
80
- source_lang = gr.Dropdown(
81
- choices=list(LANGUAGES.keys()),
82
- value="English",
83
- label="Source Language"
84
- )
85
- target_lang = gr.Dropdown(
86
- choices=list(LANGUAGES.keys()),
87
- value="Hindi",
88
- label="Target Language"
89
- )
90
- translate_btn = gr.Button("Translate")
 
 
 
 
 
 
 
 
91
 
92
- with gr.Column():
93
- output_text = gr.Textbox(
94
- label="Translation",
95
- lines=5
96
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
- # Set up translation event
99
  translate_btn.click(
100
  fn=translate_text,
101
  inputs=[input_text, source_lang, target_lang],
102
  outputs=output_text
103
  )
104
 
105
- # Add examples
106
- gr.Examples(
107
- examples=[
108
- ["Hello, how are you?", "English", "Hindi"],
109
- ["नमस्ते, कैसे हैं आप?", "Hindi", "English"],
110
- ],
111
- inputs=[input_text, source_lang, target_lang],
112
- outputs=output_text,
113
- fn=translate_text,
114
- cache_examples=True,
115
  )
116
 
117
  if __name__ == "__main__":
 
1
  import os
2
  import gradio as gr
3
  import torch
4
+ from transformers import (
5
+ MarianMTModel, MarianTokenizer,
6
+ T5Tokenizer, T5ForConditionalGeneration,
7
+ pipeline
8
+ )
9
+ import fitz # PyMuPDF
10
+ import docx2txt
11
+ from fpdf import FPDF
12
+ from transformers import AutoModelForSeq2SeqLegacy, AutoTokenizer
13
+ import spacy
14
+ import re
15
 
16
# Language mappings for MarianMT models.
# Keys use the "<Source>-<Target>" form that load_model_for_pair builds from
# the two dropdown values; values are the checkpoint ids passed to
# from_pretrained.
# NOTE(review): confirm that every checkpoint id below is actually published
# on the Hugging Face Hub; a missing id only surfaces at first use.
LANGUAGE_PAIRS = {
    "English-Hindi": "Helsinki-NLP/opus-mt-en-hi",
    "Hindi-English": "Helsinki-NLP/opus-mt-hi-en",
    "English-Tamil": "Helsinki-NLP/opus-mt-en-tam",
    "Tamil-English": "Helsinki-NLP/opus-mt-tam-en",
    "English-Telugu": "Helsinki-NLP/opus-mt-en-tel",
    "Telugu-English": "Helsinki-NLP/opus-mt-tel-en",
}

# Initialize models dictionary.
# Both caches are keyed by the same "<Source>-<Target>" pair string and are
# filled lazily the first time a pair is requested.
models = {}
tokenizers = {}
 
29
 
30
def load_model_for_pair(source_lang, target_lang):
    """Return the cached (model, tokenizer) for one translation direction.

    The pair key is "<source_lang>-<target_lang>" and is looked up in
    LANGUAGE_PAIRS. The first request for a pair loads the MarianMT
    checkpoint (moving the model to CUDA when available) and stores both
    objects in the module-level ``models`` / ``tokenizers`` caches.

    Args:
        source_lang: Display name of the source language, e.g. "English".
        target_lang: Display name of the target language, e.g. "Hindi".

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` when the pair
        has no configured checkpoint or loading fails.
    """
    pair = f"{source_lang}-{target_lang}"
    if pair in models and pair in tokenizers:
        return models[pair], tokenizers[pair]

    model_name = LANGUAGE_PAIRS.get(pair)
    if not model_name:
        # Unsupported pairs used to fall back to a bare t5-base checkpoint.
        # T5's translation tasks need a task prefix that translate_text never
        # adds, so the fallback could only produce non-translations. Report
        # the pair as unavailable and let the caller show its own message.
        print(f"No translation model configured for {pair}")
        return None, None

    try:
        # Load into locals first so a failure part-way through never leaves
        # one cache populated without the other.
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        if torch.cuda.is_available():
            model = model.to("cuda")
    except Exception as e:
        print(f"Error loading model for {pair}: {str(e)}")
        return None, None

    tokenizers[pair] = tokenizer
    models[pair] = model
    return model, tokenizer
52
+
53
+ # Text extraction functions
54
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF, one entry per layout block.

    Blocks on each page are ordered top-to-bottom, then left-to-right, and
    joined with blank lines so downstream code can treat them as paragraphs.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The extracted text, or a string starting with
        "Error extracting PDF text:" if the file cannot be read.
    """
    try:
        text_blocks = []
        # The context manager closes the document even when a page fails.
        with fitz.open(file_path) as doc:
            for page in doc:
                # Each block is (x0, y0, x1, y1, text, block_no, block_type).
                blocks = page.get_text("blocks")
                # Sort by vertical position, then horizontal.
                blocks.sort(key=lambda b: (b[1], b[0]))
                # block_type 0 is text; image blocks (type 1) carry only a
                # placeholder description that must not be translated.
                text_blocks.extend(b[4] for b in blocks if b[6] == 0)
        return "\n\n".join(text_blocks)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
69
+
70
def extract_text_from_docx(file_path):
    """Read a DOCX file and return its text with normalized paragraph gaps.

    Any run of blank (or whitespace-only) lines between paragraphs is
    collapsed to exactly one empty line.

    Returns:
        The cleaned text, or a string starting with
        "Error extracting DOCX text:" if extraction fails.
    """
    try:
        # Both extraction and clean-up stay inside the try so that any
        # failure is reported through the same error string.
        return re.sub(r'\n\s*\n', '\n\n', docx2txt.process(file_path))
    except Exception as e:
        return f"Error extracting DOCX text: {str(e)}"
79
+
80
def save_as_pdf(text, output_path):
    """Write ``text`` to a PDF file, one cell per blank-line-separated paragraph.

    Args:
        text: Text to render; paragraphs are separated by two newlines.
        output_path: Destination path of the PDF.

    Returns:
        ``output_path`` on success, or a string starting with
        "Error creating PDF:" if rendering or writing fails.
    """
    try:
        document = FPDF()
        document.add_page()
        document.set_auto_page_break(auto=True, margin=15)
        # NOTE(review): "Arial" is a built-in core font and presumably covers
        # only Latin-1 text; confirm that Hindi/Tamil/Telugu output renders,
        # otherwise a Unicode TTF font must be registered here.
        document.set_font("Arial", size=12)

        for block in text.split('\n\n'):
            document.multi_cell(0, 10, block.strip())
            document.ln(5)  # vertical gap between paragraphs

        document.output(output_path)
    except Exception as e:
        return f"Error creating PDF: {str(e)}"
    return output_path
100
+
101
def preprocess_text(text, max_chars=512):
    """Split text into sentence-aligned chunks of at most ``max_chars`` characters.

    Sentences are detected at ``.``, ``!``, ``?`` or the Devanagari danda
    followed by whitespace, so decimals such as "3.14" stay intact and the
    original terminator of each sentence is kept (nothing is appended).
    Consecutive sentences are packed into one chunk, separated by a single
    space, while the chunk stays within ``max_chars``. A single sentence
    longer than ``max_chars`` is wrapped at whitespace into several pieces.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length in characters (not model tokens).

    Returns:
        A list of non-empty chunk strings; empty for blank input.
    """
    import textwrap  # local import: only needed for over-long sentences

    if not text or not text.strip():
        return []

    sentences = re.split(r'(?<=[.!?\u0964])\s+', text.strip())

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) <= max_chars:
            pieces = [sentence]
        else:
            pieces = textwrap.wrap(sentence, max_chars)

        for piece in pieces:
            # Count the joining space so the emitted chunk really fits.
            needed = len(piece) + (1 if current_chunk else 0)
            if current_chunk and current_length + needed > max_chars:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
                needed = len(piece)
            current_chunk.append(piece)
            current_length += needed

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
127
 
128
  def translate_text(text, source_lang, target_lang):
129
+ """Translate text with context preservation"""
130
  if not text:
131
+ return "Please provide text to translate."
132
+
133
  try:
134
+ model, tokenizer = load_model_for_pair(source_lang, target_lang)
135
+ if not model or not tokenizer:
136
+ return "Translation model not available for this language pair."
137
+
138
+ # Preprocess and chunk the text
139
+ chunks = preprocess_text(text)
140
+ translated_chunks = []
141
 
142
+ for chunk in chunks:
143
+ # Prepare input
144
+ inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
145
+ if torch.cuda.is_available():
146
+ inputs = {k: v.to("cuda") for k, v in inputs.items()}
147
+
148
+ # Generate translation
149
+ with torch.no_grad():
150
+ outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
151
+
152
+ # Decode translation
153
+ translated_chunk = tokenizer.decode(outputs[0], skip_special_tokens=True)
154
+ translated_chunks.append(translated_chunk)
155
 
156
+ # Combine translations
157
+ return " ".join(translated_chunks)
158
 
159
+ except Exception as e:
160
+ return f"Translation Error: {str(e)}"
161
+
162
def process_document(file, source_lang, target_lang):
    """Extract, translate and re-export an uploaded document as a PDF.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        source_lang: Display name of the source language.
        target_lang: Display name of the target language.

    Returns:
        ``(output_path, translated_text)`` on success, or
        ``(None, message)`` when any stage fails.
    """
    if file is None:
        return None, "No file uploaded."

    try:
        # Accept both a temp-file wrapper (.name) and a bare path string.
        file_path = getattr(file, "name", file)
        lowered = file_path.lower()

        if lowered.endswith(('.pdf', '.docx')):
            if lowered.endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            else:
                text = extract_text_from_docx(file_path)
            # The extractors report failure by returning an error string;
            # surface it instead of translating it as document content.
            if text.startswith("Error extracting"):
                return None, text
        elif lowered.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."

        translated_text = translate_text(text, source_lang, target_lang)

        # translate_text also signals failure through its return string;
        # do not write those messages into the output PDF.
        failure_prefixes = (
            "Please provide text to translate.",
            "Translation model not available for this language pair.",
            "Translation Error:",
        )
        if translated_text.startswith(failure_prefixes):
            return None, translated_text

        # Drop the source extension so "report.docx" becomes
        # "translated_report.pdf" rather than "translated_report.docx.pdf".
        stem = os.path.splitext(os.path.basename(file_path))[0]
        output_path = os.path.join(os.path.dirname(file_path), f"translated_{stem}.pdf")
        result = save_as_pdf(translated_text, output_path)

        if isinstance(result, str) and result.startswith("Error creating PDF:"):
            return None, result

        return output_path, translated_text

    except Exception as e:
        return None, f"Error processing document: {str(e)}"
195
 
196
  # Create Gradio interface
197
with gr.Blocks(title="Document and Text Translator") as demo:
    gr.Markdown("# Advanced Document and Text Translator")

    # Derive the dropdown entries once from the configured pairs. Sorting
    # keeps the order stable between runs (plain set order is not).
    source_choices = sorted({pair.split('-')[0] for pair in LANGUAGE_PAIRS})
    target_choices = sorted({pair.split('-')[1] for pair in LANGUAGE_PAIRS})

    with gr.Tabs():
        with gr.TabItem("Text Translation"):
            with gr.Row():
                with gr.Column():
                    input_text = gr.Textbox(
                        label="Input Text",
                        placeholder="Enter text to translate...",
                        lines=5
                    )
                    source_lang = gr.Dropdown(
                        choices=source_choices,
                        value="English",
                        label="Source Language"
                    )
                    target_lang = gr.Dropdown(
                        choices=target_choices,
                        value="Hindi",
                        label="Target Language"
                    )
                    translate_btn = gr.Button("Translate")

                with gr.Column():
                    output_text = gr.Textbox(
                        label="Translation",
                        lines=5
                    )

        with gr.TabItem("Document Translation"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload Document",
                        file_types=[".pdf", ".docx", ".txt"]
                    )
                    doc_source_lang = gr.Dropdown(
                        choices=source_choices,
                        value="English",
                        label="Source Language"
                    )
                    doc_target_lang = gr.Dropdown(
                        choices=target_choices,
                        value="Hindi",
                        label="Target Language"
                    )
                    translate_doc_btn = gr.Button("Translate Document")

                with gr.Column():
                    output_file = gr.File(label="Translated PDF")
                    output_preview = gr.Textbox(
                        label="Translation Preview",
                        lines=8
                    )

    # Set up event handlers (registered inside the Blocks context).
    translate_btn.click(
        fn=translate_text,
        inputs=[input_text, source_lang, target_lang],
        outputs=output_text
    )

    translate_doc_btn.click(
        fn=process_document,
        inputs=[file_input, doc_source_lang, doc_target_lang],
        outputs=[output_file, output_preview]
    )
265
 
266
  if __name__ == "__main__":