akarshan11 committed
Commit f7aa0f6 · verified · 1 Parent(s): a784937

Update app.py

Files changed (1)
  1. app.py +92 -258
app.py CHANGED
@@ -1,273 +1,107 @@
- import os
  import gradio as gr
- import torch
- from transformers import (
-     MarianMTModel, MarianTokenizer,
-     T5Tokenizer, T5ForConditionalGeneration,
-     pipeline,
-     AutoModelForSeq2SeqLM,  # Changed from AutoModelForSeq2SeqLegacy
-     AutoTokenizer
- )
- import fitz  # PyMuPDF
- import docx2txt
- from fpdf import FPDF
- import spacy
- import re
-
- # Language mappings for MarianMT models
- LANGUAGE_PAIRS = {
-     "English-Hindi": "Helsinki-NLP/opus-mt-en-hi",
-     "Hindi-English": "Helsinki-NLP/opus-mt-hi-en",
-     "English-Tamil": "Helsinki-NLP/opus-mt-en-tam",
-     "Tamil-English": "Helsinki-NLP/opus-mt-tam-en",
-     "English-Telugu": "Helsinki-NLP/opus-mt-en-tel",
-     "Telugu-English": "Helsinki-NLP/opus-mt-tel-en",
- }
-
- # Initialize models dictionary
- models = {}
- tokenizers = {}
-
- def load_model_for_pair(source_lang, target_lang):
-     """Load appropriate model for language pair"""
-     pair = f"{source_lang}-{target_lang}"
-     if pair not in models:
-         try:
-             model_name = LANGUAGE_PAIRS.get(pair)
-             if model_name:
-                 tokenizers[pair] = MarianTokenizer.from_pretrained(model_name)
-                 models[pair] = MarianMTModel.from_pretrained(model_name)
-                 if torch.cuda.is_available():
-                     models[pair] = models[pair].to("cuda")
-             else:
-                 # Fallback to T5 for unsupported language pairs
-                 tokenizers[pair] = T5Tokenizer.from_pretrained("t5-base")
-                 models[pair] = T5ForConditionalGeneration.from_pretrained("t5-base")
-                 if torch.cuda.is_available():
-                     models[pair] = models[pair].to("cuda")
-         except Exception as e:
-             print(f"Error loading model for {pair}: {str(e)}")
-             return None, None
-
-     return models.get(pair), tokenizers.get(pair)
-
- # Text extraction functions
- def extract_text_from_pdf(file_path):
-     """Extract text from PDF while preserving structure"""
-     try:
-         text_blocks = []
-         doc = fitz.open(file_path)
          for page in doc:
-             # Get text blocks with position information
-             blocks = page.get_text("blocks")
-             # Sort blocks by vertical position then horizontal
-             blocks.sort(key=lambda b: (b[1], b[0]))
-             for b in blocks:
-                 text_blocks.append(b[4])  # b[4] contains the text
-         return "\n\n".join(text_blocks)
-     except Exception as e:
-         return f"Error extracting PDF text: {str(e)}"
-
- def extract_text_from_docx(file_path):
-     """Extract text from DOCX with structure preservation"""
-     try:
-         text = docx2txt.process(file_path)
-         # Clean up excessive newlines while preserving paragraphs
-         text = re.sub(r'\n\s*\n', '\n\n', text)
-         return text
-     except Exception as e:
-         return f"Error extracting DOCX text: {str(e)}"
-
- def save_as_pdf(text, output_path):
-     """Save translated text as PDF with formatting"""
-     try:
-         pdf = FPDF()
-         pdf.add_page()
-         pdf.set_auto_page_break(auto=True, margin=15)
-         pdf.add_font('DejaVu', '', '/usr/share/fonts/truetype/dejavu/DejaVuSansCondensed.ttf', uni=True)
-         pdf.set_font('DejaVu', size=12)
-
-         # Split text into paragraphs
-         paragraphs = text.split('\n\n')
-
-         for para in paragraphs:
-             # Add paragraph with spacing
-             try:
-                 pdf.multi_cell(0, 10, para.strip())
-                 pdf.ln(5)  # Add some space between paragraphs
-             except Exception as e:
-                 print(f"Error writing paragraph: {str(e)}")
-                 continue
-
-         pdf.output(output_path)
-         return output_path
-     except Exception as e:
-         return f"Error creating PDF: {str(e)}"
-
- def preprocess_text(text):
-     """Preprocess text to handle idioms and maintain context"""
-     # Split into manageable chunks while preserving context
-     chunks = []
-     sentences = text.split('.')
-     current_chunk = []
-     current_length = 0
-
-     for sentence in sentences:
-         sentence = sentence.strip()
-         if not sentence:
-             continue
-
-         if current_length + len(sentence) < 512:
-             current_chunk.append(sentence)
-             current_length += len(sentence)
-         else:
-             if current_chunk:
-                 chunks.append('. '.join(current_chunk) + '.')
-             current_chunk = [sentence]
-             current_length = len(sentence)
-
-     if current_chunk:
-         chunks.append('. '.join(current_chunk) + '.')
-
-     return chunks
-
- def translate_text(text, source_lang, target_lang):
-     """Translate text with context preservation"""
-     if not text:
-         return "Please provide text to translate."
-
-     try:
-         model, tokenizer = load_model_for_pair(source_lang, target_lang)
-         if not model or not tokenizer:
-             return "Translation model not available for this language pair."
-
-         # Preprocess and chunk the text
-         chunks = preprocess_text(text)
-         translated_chunks = []
-
-         for chunk in chunks:
-             # Prepare input
-             inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
-             if torch.cuda.is_available():
-                 inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-             # Generate translation
-             with torch.no_grad():
-                 outputs = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
-
-             # Decode translation
-             translated_chunk = tokenizer.decode(outputs[0], skip_special_tokens=True)
-             translated_chunks.append(translated_chunk)
-
-         # Combine translations
-         return " ".join(translated_chunks)
-
-     except Exception as e:
-         return f"Translation Error: {str(e)}"
-
- def process_document(file, source_lang, target_lang):
-     """Process and translate document"""
-     if file is None:
-         return None, "No file uploaded."
-
-     try:
-         # Extract text based on file type
-         file_path = file.name
-         if file_path.lower().endswith('.pdf'):
-             text = extract_text_from_pdf(file_path)
-         elif file_path.lower().endswith('.docx'):
-             text = extract_text_from_docx(file_path)
-         elif file_path.lower().endswith('.txt'):
-             with open(file_path, 'r', encoding='utf-8') as f:
-                 text = f.read()
-         else:
-             return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
-
-         # Translate the extracted text
-         translated_text = translate_text(text, source_lang, target_lang)
-
-         # Save translation as PDF
-         output_path = os.path.join(os.path.dirname(file_path),
-                                    f"translated_{os.path.basename(file_path)}.pdf")
-         result = save_as_pdf(translated_text, output_path)
-
-         if isinstance(result, str) and result.startswith("Error"):
-             return None, result
-
-         return output_path, translated_text
-
-     except Exception as e:
-         return None, f"Error processing document: {str(e)}"
-
- # Create Gradio interface
- with gr.Blocks(title="Document and Text Translator") as demo:
-     gr.Markdown("# Advanced Document and Text Translator")
-
-     with gr.Tabs():
-         with gr.TabItem("Text Translation"):
-             with gr.Row():
-                 with gr.Column():
-                     input_text = gr.Textbox(
-                         label="Input Text",
-                         placeholder="Enter text to translate...",
-                         lines=5
-                     )
-                     source_lang = gr.Dropdown(
-                         choices=list(set(lang.split('-')[0] for lang in LANGUAGE_PAIRS.keys())),
-                         value="English",
-                         label="Source Language"
-                     )
-                     target_lang = gr.Dropdown(
-                         choices=list(set(lang.split('-')[1] for lang in LANGUAGE_PAIRS.keys())),
-                         value="Hindi",
-                         label="Target Language"
-                     )
-                     translate_btn = gr.Button("Translate")
-
-                 with gr.Column():
-                     output_text = gr.Textbox(
-                         label="Translation",
-                         lines=5
-                     )
-
-         with gr.TabItem("Document Translation"):
-             with gr.Row():
-                 with gr.Column():
-                     file_input = gr.File(
-                         label="Upload Document",
-                         file_types=[".pdf", ".docx", ".txt"]
-                     )
-                     doc_source_lang = gr.Dropdown(
-                         choices=list(set(lang.split('-')[0] for lang in LANGUAGE_PAIRS.keys())),
-                         value="English",
-                         label="Source Language"
-                     )
-                     doc_target_lang = gr.Dropdown(
-                         choices=list(set(lang.split('-')[1] for lang in LANGUAGE_PAIRS.keys())),
-                         value="Hindi",
-                         label="Target Language"
-                     )
-                     translate_doc_btn = gr.Button("Translate Document")
-
-                 with gr.Column():
-                     output_file = gr.File(label="Translated PDF")
-                     output_preview = gr.Textbox(
-                         label="Translation Preview",
-                         lines=8
-                     )
-
-     # Set up event handlers
-     translate_btn.click(
-         fn=translate_text,
-         inputs=[input_text, source_lang, target_lang],
-         outputs=output_text
      )
-
-     translate_doc_btn.click(
-         fn=process_document,
-         inputs=[file_input, doc_source_lang, doc_target_lang],
-         outputs=[output_file, output_preview]
      )
-
- if __name__ == "__main__":
-     demo.launch(share=True)
  import gradio as gr
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ import fitz  # PyMuPDF for PDF handling
+ import tempfile  # translated PDFs are written to temp files for gr.File output
+
+ # Load IndicTrans2 for Indian languages (example: English to Hindi).
+ # The checkpoint ships custom modeling code, so trust_remote_code=True is required.
+ model_name = "ai4bharat/indictrans2-en-indic-1b"  # Supports multiple Indian languages
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
+ translator = pipeline("translation", model=model, tokenizer=tokenizer)
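+ # IndicTrans2 expects FLORES-style language tags such as "hin_Deva" rather than
+ # bare ISO codes; the mapping below is an assumption based on the model card.
+ indic_tags = {
+     "hi": "hin_Deva", "ta": "tam_Taml", "te": "tel_Telu",
+     "bn": "ben_Beng", "gu": "guj_Gujr", "mr": "mar_Deva",
+     "kn": "kan_Knda", "ml": "mal_Mlym", "pa": "pan_Guru",
+ }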
 
+ # Language mapping for Indian languages and others
+ language_map = {
+     "French": "fr",  # Using OPUS model for non-Indian languages
+     "Spanish": "es",
+     "German": "de",
+     "Hindi": "hi",
+     "Tamil": "ta",
+     "Telugu": "te",
+     "Bengali": "bn",
+     "Gujarati": "gu",
+     "Marathi": "mr",
+     "Kannada": "kn",
+     "Malayalam": "ml",
+     "Punjabi": "pa",
+ }
 
+ def translate_text(input_text, target_language):
+     """Translate text with context awareness"""
+     target_lang_code = language_map[target_language]
+
+     # For Indian languages, use IndicTrans2 with FLORES-style source/target tags
+     # (assumes the translation pipeline forwards these tags to the tokenizer)
+     if target_lang_code in indic_tags:
+         translated = translator(input_text, src_lang="eng_Latn",
+                                 tgt_lang=indic_tags[target_lang_code])[0]['translation_text']
+     else:
+         # For non-Indian languages, switch to OPUS model (example: English to French)
+         opus_translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_lang_code}")
+         translated = opus_translator(input_text)[0]['translation_text']
+
+     return translated
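+ # Usage sketch (assumed outputs, untested):
+ #   translate_text("Good morning", "Hindi")  -> Devanagari string from IndicTrans2
+ #   translate_text("Good morning", "French") -> e.g. "Bonjour" via opus-mt-en-fr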
 
+ def process_input(input_choice, text, file, target_language):
+     """Handle both text and document inputs"""
+     if input_choice == "Text":  # Direct text input
+         source_text = text
+     else:  # File input (assuming a PDF document)
+         # gr.File hands over a temp file on disk; open it by path
+         doc = fitz.open(file.name)
+         source_text = ""
          for page in doc:
+             source_text += page.get_text()
+
+     # Translate the extracted text
+     translated_text = translate_text(source_text, target_language)
+
+     # Create PDF output; insert_textbox wraps long text inside the rectangle
+     # (note: the default base font may not cover Indic scripts)
+     pdf_output = fitz.open()
+     page = pdf_output.new_page()
+     rect = fitz.Rect(50, 50, page.rect.width - 50, page.rect.height - 50)
+     page.insert_textbox(rect, translated_text)
+
+     # Save to a temporary file; gr.File outputs expect a filepath
+     out_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
+     pdf_output.save(out_path)
+     return out_path
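+ # e.g. process_input("Text", "Hello world", None, "Hindi") returns a path to a
+ # one-page PDF containing the translated text (sketch, untested).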
 
+ # Gradio Interface
+ with gr.Blocks(title="Context-Aware Translator with Indian Languages") as demo:
+     gr.Markdown("# Context-Aware Language Translator")
+     gr.Markdown("Translate text or upload a PDF document into Indian and other languages, and download the result as a PDF.")
+
+     with gr.Row():
+         with gr.Column():
+             input_type = gr.Radio(["Text", "Document"], label="Input Type", value="Text")
+             text_input = gr.Textbox(lines=5, label="Enter Text", visible=True)
+             file_input = gr.File(label="Upload Document", file_types=[".pdf"], visible=False)  # handler assumes PDF input
+             target_lang = gr.Dropdown(
+                 choices=list(language_map.keys()),
+                 label="Target Language",
+                 value="Hindi"
+             )
+             submit_btn = gr.Button("Translate")
+
+         with gr.Column():
+             output_pdf = gr.File(label="Download Translated PDF")
+
+     # Dynamic visibility based on input type
+     def update_visibility(choice):
+         return (
+             gr.update(visible=(choice == "Text")),
+             gr.update(visible=(choice == "Document"))
+         )
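+     # e.g. update_visibility("Document") returns
+     # (gr.update(visible=False), gr.update(visible=True)):
+     # hide the textbox, show the file picker.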
+     input_type.change(
+         fn=update_visibility,
+         inputs=input_type,
+         outputs=[text_input, file_input]
      )
+
+     # Process the input and generate output; component values are passed
+     # positionally, matching process_input(input_choice, text, file, target_language)
+     submit_btn.click(
+         fn=process_input,
+         inputs=[input_type, text_input, file_input, target_lang],
+         outputs=output_pdf
      )
+
+ demo.launch()
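+ # Note: the 1B IndicTrans2 checkpoint is heavy to load on CPU-only hardware; the
+ # distilled ai4bharat/indictrans2-en-indic-dist-200M checkpoint is an assumed
+ # lighter-weight alternative if memory is tight.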