akarshan11 committed on
Commit
895c980
·
verified ·
1 Parent(s): e1983d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -153
app.py CHANGED
@@ -1,174 +1,207 @@
1
- import gradio as gr
2
- import sys
3
- import pkg_resources
4
- import tempfile
5
  import os
6
- from pathlib import Path
 
 
 
 
 
7
 
8
def check_dependencies(required_packages=None):
    """Exit the process when a required distribution is not installed.

    Args:
        required_packages: Optional mapping of an install name to the
            distribution names that satisfy it. When omitted, the packages
            this application needs are checked.

    Raises:
        SystemExit: With status 1, after printing ``pip install`` hints, when
            at least one requirement has no installed variation.
    """
    # Imported locally so the block is self-contained. importlib.metadata is
    # the standard-library replacement for the deprecated pkg_resources API.
    import re
    from importlib import metadata

    if required_packages is None:
        required_packages = {
            'gradio': ['gradio'],
            'transformers': ['transformers'],
            'python-docx': ['python-docx', 'python_docx', 'docx'],
            'PyPDF2': ['PyPDF2', 'pypdf2', 'pypdf'],
            'torch': ['torch'],
            'sentencepiece': ['sentencepiece'],
            'tf-keras': ['tf-keras'],
        }

    def _normalize(name):
        # PEP 503 normalization: "python_docx", "Python.Docx" and
        # "python-docx" all name the same distribution.
        return re.sub(r"[-_.]+", "-", name).lower()

    installed = set()
    for dist in metadata.distributions():
        dist_name = dist.metadata.get("Name") if dist.metadata is not None else None
        if dist_name:
            installed.add(_normalize(dist_name))

    missing = [
        package
        for package, variations in required_packages.items()
        if not any(_normalize(variation) in installed for variation in variations)
    ]

    if missing:
        print("Missing required packages. Please install:")
        for pkg in missing:
            print(f"pip install {pkg}")
        sys.exit(1)
 
 
 
 
 
 
 
31
 
32
- # Check dependencies before importing
33
- check_dependencies()
 
 
 
 
 
 
 
 
 
34
 
35
- import torch
36
- from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
37
- import docx
38
- import PyPDF2
39
- import io
 
40
 
41
class DocumentTranslator:
    """Translate English .docx/.pdf documents with Helsinki-NLP pipelines.

    Two English-source models are loaded: a multilingual English-to-Romance
    model and a dedicated English-to-Hindi model. The translated text is
    written to a new .docx file.
    """

    # Target codes routed to the multilingual en->ROMANCE model. That model
    # needs a ">>code<<" prefix to know which language to produce.
    _ROMANCE_TARGETS = frozenset({"fr", "es", "pt", "it"})

    def __init__(self):
        """Load both translation pipelines and the supported-language table.

        Raises:
            Exception: Re-raised unchanged after printing install hints when a
                pipeline cannot be created.
        """
        try:
            # Initialize translation models
            self.romance_translator = pipeline(
                "translation",
                model="Helsinki-NLP/opus-mt-en-ROMANCE",
                framework="pt"
            )

            # Initialize Hindi translator
            self.hindi_translator = pipeline(
                "translation",
                model="Helsinki-NLP/opus-mt-en-hi",
                framework="pt"
            )

            # Supported languages (display name -> language code)
            self.languages = {
                "English": "en",
                "French": "fr",
                "Spanish": "es",
                "Portuguese": "pt",
                "Italian": "it",
                "Hindi": "hi"
            }
        except Exception as e:
            print(f"Error initializing translator: {str(e)}")
            print("Please make sure all required packages are installed:")
            print("pip install transformers torch sentencepiece python-docx PyPDF2 gradio tf-keras")
            raise

    def extract_text_from_docx(self, file):
        """Return the paragraphs of a .docx file joined by newlines.

        Args:
            file: Path or binary file-like object accepted by ``docx.Document``.
        """
        doc = docx.Document(file)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def extract_text_from_pdf(self, file):
        """Return the text of every PDF page joined by newlines.

        Args:
            file: Path or binary file-like object accepted by
                ``PyPDF2.PdfReader``.
        """
        pdf_reader = PyPDF2.PdfReader(file)
        # extract_text() can yield None for pages without a text layer;
        # substitute an empty string so the join cannot fail.
        return "\n".join((page.extract_text() or "") for page in pdf_reader.pages)

    def create_translated_docx(self, original_text, translated_text, output_filename):
        """Write ``translated_text`` to a new .docx, one paragraph per line.

        Args:
            original_text: Source text; accepted for interface compatibility
                and not written to the output.
            translated_text: Text to store; blank lines are skipped.
            output_filename: Destination path of the .docx file.

        Returns:
            ``output_filename``.
        """
        doc = docx.Document()
        for para in translated_text.split("\n"):
            if para.strip():
                doc.add_paragraph(para)

        doc.save(output_filename)
        return output_filename

    @staticmethod
    def _split_into_chunks(text, chunk_size=500):
        """Greedily pack whole words into chunks of at most ``chunk_size`` chars.

        A single word longer than ``chunk_size`` becomes its own chunk rather
        than being cut in half.
        """
        chunks = []
        current = ""
        for word in text.split():
            candidate = f"{current} {word}" if current else word
            if current and len(candidate) > chunk_size:
                chunks.append(current)
                current = word
            else:
                current = candidate
        if current:
            chunks.append(current)
        return chunks

    def translate_text(self, text, target_lang):
        """Translate English ``text`` into the language code ``target_lang``.

        Raises:
            ValueError: If no loaded model can produce ``target_lang``.
        """
        if target_lang == "hi":
            return self.hindi_translator(text)[0]['translation_text']
        if target_lang in self._ROMANCE_TARGETS:
            # Without the target token the multilingual model has no way to
            # know which Romance language is wanted.
            tagged = f">>{target_lang}<< {text}"
            return self.romance_translator(tagged)[0]['translation_text']
        raise ValueError(f"Unsupported target language code: {target_lang!r}")

    def translate_document(self, file, source_lang, target_lang):
        """Translate an uploaded document and save the result as .docx.

        Args:
            file: Uploaded file object exposing a ``name`` path attribute.
            source_lang: Display name of the source language; only English
                (or no selection) is accepted because both models translate
                from English.
            target_lang: Display name of the target language.

        Returns:
            ``(output_path, status_message)``; ``output_path`` is ``None`` on
            failure.
        """
        try:
            if target_lang not in self.languages:
                return None, "Please select a supported target language"
            if source_lang not in (None, "English"):
                return None, "Only English source documents are supported"
            target_code = self.languages[target_lang]
            if target_code == "en":
                return None, "Source and target language are both English"

            # Extract text based on file type (case-insensitive extension)
            file_path = file.name
            lowered = file_path.lower()
            if lowered.endswith('.docx'):
                text = self.extract_text_from_docx(file_path)
            elif lowered.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            else:
                return None, "Unsupported file format. Please use .docx or .pdf"

            # Translate paragraph by paragraph, chunking on word boundaries,
            # so paragraph breaks survive and no word is split mid-way.
            chunk_size = 500
            translated_paragraphs = []
            for paragraph in text.split("\n"):
                pieces = self._split_into_chunks(paragraph, chunk_size)
                if not pieces:
                    continue
                translated_paragraphs.append(
                    " ".join(self.translate_text(piece, target_code) for piece in pieces)
                )

            if not translated_paragraphs:
                return None, "No text could be extracted from the document"

            translated_text = "\n".join(translated_paragraphs)

            # Create the output directory only once there is something to
            # write; it must outlive this call so the file can be downloaded.
            temp_dir = tempfile.mkdtemp()
            output_filename = os.path.join(temp_dir, "translated_document.docx")
            output_file = self.create_translated_docx(text, translated_text, output_filename)

            return output_file, "Translation completed successfully!"

        except Exception as e:
            return None, f"Error during translation: {str(e)}"
137
-
138
def create_translation_interface():
    """Build the Gradio interface that wraps a ``DocumentTranslator``.

    Returns:
        The configured ``gr.Interface``.

    Raises:
        SystemExit: With status 1, after printing the error, when the
            translator or the interface cannot be created.
    """
    # NOTE(review): nesting was reconstructed from a flattened diff view.
    try:
        translator = DocumentTranslator()

        def translate_file(file, source_lang, target_lang):
            # Reject a missing upload before handing off to the translator.
            if file is not None:
                return translator.translate_document(file, source_lang, target_lang)
            return None, "Please upload a file"

        input_components = [
            gr.File(label="Upload Document (.docx or .pdf)"),
            gr.Dropdown(choices=list(translator.languages), label="Source Language"),
            gr.Dropdown(choices=list(translator.languages), label="Target Language"),
        ]
        output_components = [
            gr.File(label="Download Translated Document"),
            gr.Textbox(label="Status"),
        ]

        return gr.Interface(
            fn=translate_file,
            inputs=input_components,
            outputs=output_components,
            title="Document Translation System",
            description="Upload a document (.docx or .pdf) and select source and target languages for translation.",
            theme="default",
        )
    except Exception as error:
        print(f"Error creating interface: {str(error)}")
        sys.exit(1)
 
 
 
167
 
 
if __name__ == "__main__":
    # Startup banner, then the dependency check, then the UI.
    for banner in ("Initializing translation system...", "Checking dependencies..."):
        print(banner)
    check_dependencies()
    print("Starting Gradio interface...")
    create_translation_interface().launch(share=True)
 
 
 
 
 
1
  import os
2
+ import gradio as gr
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
+ import fitz # PyMuPDF for PDF processing
6
+ import docx2txt # For DOCX processing
7
+ from fpdf import FPDF # For creating PDF outputs
8
 
9
# Checkpoint used for every translation direction offered by the app.
model_name = "facebook/mbart-large-50-many-to-many-mmt"  # Powerful translation model that can handle idioms well

# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the model, placing the model on the chosen device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
17
+
18
# Display name -> mBART-50 language code for the languages offered in the UI.
# Every code must be one of the language tokens of the mBART-50 checkpoint.
# Punjabi ("pa_IN") and Kannada ("kn_IN") are not among them, so they are not
# offered: selecting them could never produce a translation.
LANGUAGES = {
    # Major Global Languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",

    # Major Indian Languages
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Urdu": "ur_PK",
}
42
 
43
+ # File extraction functions
44
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The concatenated page text, or a string starting with
        ``"Error extracting PDF text:"`` when the file cannot be read.
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
54
 
55
def extract_text_from_docx(file_path):
    """Return the text of a DOCX file as produced by ``docx2txt.process``.

    On any failure the function returns a message beginning with
    ``"Error extracting DOCX text:"`` instead of raising.
    """
    try:
        document_text = docx2txt.process(file_path)
    except Exception as error:
        return f"Error extracting DOCX text: {str(error)}"
    return document_text
61
 
62
def extract_text_from_txt(file_path):
    """Read a text file, trying UTF-8 first and Latin-1 as a fallback.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file content, or a string starting with
        ``"Error extracting TXT text:"`` when the file cannot be read.
    """
    decode_error = None
    # "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order mark,
    # which would otherwise be sent to the translator as a stray character.
    for encoding in ("utf-8-sig", "latin-1"):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError as e:
            # Remember the failure and try the next, more permissive encoding.
            decode_error = e
        except Exception as e:
            return f"Error extracting TXT text: {str(e)}"
    return f"Error extracting TXT text: {str(decode_error)}"
 
 
 
 
 
 
 
 
75
 
76
def save_as_pdf(text, output_path, font_path=None):
    """Write ``text`` to a single-column PDF file.

    Args:
        text: The text to write.
        output_path: Destination path of the PDF.
        font_path: Optional path to a Unicode TrueType font. When given, the
            text is written unmodified with that font. When omitted, the
            built-in Arial core font is used, which only covers Latin-1, so
            every other character (for example all Devanagari, Bengali or
            Tamil output) is replaced by ``?``.

    Returns:
        ``output_path``.
    """
    pdf = FPDF()
    pdf.add_page()

    if font_path:
        # NOTE(review): registering a TTF makes the glyphs available, but
        # complex-script shaping may need further library support - verify
        # the rendering for Indic scripts with the installed fpdf version.
        pdf.add_font("DocumentFont", "", font_path, uni=True)
        pdf.set_font("DocumentFont", size=12)
        printable_text = text
    else:
        pdf.set_font("Arial", size=12)
        # Core fonts cannot represent characters outside Latin-1; replace
        # them instead of letting the PDF writer raise.
        printable_text = text.encode('latin-1', 'replace').decode('latin-1')

    pdf.multi_cell(0, 10, printable_text)

    pdf.output(output_path)
    return output_path
89
 
90
+ # Translation function
91
def translate(text, source_lang, target_lang, max_length=1024):
    """Translate ``text`` between two languages listed in ``LANGUAGES``.

    The text is split on blank lines and translated paragraph by paragraph,
    so a long document is not cut off after the first ``max_length`` tokens
    and paragraph breaks are preserved. A single paragraph longer than
    ``max_length`` tokens is still truncated.

    Args:
        text: Text to translate.
        source_lang: Display name of the source language (key of LANGUAGES).
        target_lang: Display name of the target language (key of LANGUAGES).
        max_length: Token limit per paragraph, for both input and output.

    Returns:
        The translation, or a human-readable message when there is nothing to
        translate, a language is unsupported, or translation fails.
    """
    import re

    if not text or not text.strip():
        return "No text provided for translation."

    try:
        # Set source and target language
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)

        if not src_lang or not tgt_lang:
            return "Source or target language not supported."

        # Set tokenizer source language
        tokenizer.src_lang = src_lang

        # Language codes are ordinary vocabulary tokens, so look the id up
        # through the generic tokenizer API rather than a tokenizer-specific
        # attribute. An unknown code resolves to the unknown-token id.
        target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
        if target_token_id is None or target_token_id == tokenizer.unk_token_id:
            return "Source or target language not supported."

        translated_paragraphs = []
        for paragraph in re.split(r"\n\s*\n", text):
            if not paragraph.strip():
                continue

            # Prepare input
            inputs = tokenizer(paragraph, return_tensors="pt", max_length=max_length, truncation=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Generate translation
            with torch.no_grad():
                generated_tokens = model.generate(
                    **inputs,
                    forced_bos_token_id=target_token_id,
                    max_length=max_length,
                    num_beams=5,
                    early_stopping=True
                )

            # Decode translation
            translated_paragraphs.append(
                tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
            )

        return "\n\n".join(translated_paragraphs)

    except Exception as e:
        return f"Translation error: {str(e)}"
127
 
128
+ # Process uploads and handle translation
129
def process_file(file, source_lang, target_lang):
    """Extract the text of an uploaded document, translate it, save a PDF.

    Args:
        file: The upload, either a path string or an object whose ``name``
            attribute is the path of the uploaded file.
        source_lang: Display name of the source language.
        target_lang: Display name of the target language.

    Returns:
        ``(output_pdf_path, translated_text)`` on success, or
        ``(None, message)`` when there is no upload, the format is
        unsupported, or extraction or translation failed.
    """
    if file is None:
        return None, "Please upload a file to translate."

    # The extraction and translation helpers report failures as message
    # strings. These prefixes identify such messages so they are shown to the
    # user instead of being translated or written into a PDF.
    # NOTE(review): a document that itself begins with one of these phrases
    # would be misread as an error.
    extraction_errors = (
        "Error extracting PDF text:",
        "Error extracting DOCX text:",
        "Error extracting TXT text:",
    )
    translation_errors = (
        "Translation error:",
        "No text provided for translation.",
        "Source or target language not supported.",
    )

    try:
        # Accept both a plain path and an object carrying the path in .name
        temp_file_path = getattr(file, "name", file)
        lowered_path = temp_file_path.lower()

        # Extract text based on file type
        if lowered_path.endswith('.pdf'):
            text = extract_text_from_pdf(temp_file_path)
        elif lowered_path.endswith('.docx'):
            text = extract_text_from_docx(temp_file_path)
        elif lowered_path.endswith('.txt'):
            text = extract_text_from_txt(temp_file_path)
        else:
            return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."

        if not text or not text.strip():
            return None, "No text could be extracted from the document."
        if text.startswith(extraction_errors):
            return None, text

        # Translate the extracted text
        translated_text = translate(text, source_lang, target_lang)
        if translated_text.startswith(translation_errors):
            return None, translated_text

        # Save translation as PDF
        output_pdf_path = temp_file_path + "_translated.pdf"
        save_as_pdf(translated_text, output_pdf_path)

        return output_pdf_path, translated_text

    except Exception as e:
        return None, f"Error processing file: {str(e)}"
 
 
156
 
157
+ # Gradio interface
158
def gradio_interface():
    """Assemble the two-tab Gradio UI (text and document translation).

    Returns:
        The ``gr.Blocks`` application; the caller is expected to launch it.
    """
    # NOTE(review): the component nesting was reconstructed from a flattened
    # diff view - confirm the layout against the deployed app.

    def language_dropdown(default, label):
        # Each dropdown receives its own fresh list of language names.
        return gr.Dropdown(list(LANGUAGES), value=default, label=label)

    with gr.Blocks(title="Indian Language Translator", theme=gr.themes.Soft()) as interface:
        gr.Markdown("# Indian & Global Language Translator")
        gr.Markdown("Translate text with understanding of idioms and cultural expressions")

        with gr.Tab("Text Translation"):
            with gr.Row():
                text_source = language_dropdown("English", "Source Language")
                text_target = language_dropdown("Hindi", "Target Language")

            with gr.Row():
                text_in = gr.Textbox(
                    label="Enter text to translate",
                    lines=5,
                    placeholder="Type or paste text here...",
                )
                text_out = gr.Textbox(label="Translation", lines=5)

            text_button = gr.Button("Translate Text", variant="primary")
            text_button.click(
                fn=translate,
                inputs=[text_in, text_source, text_target],
                outputs=text_out,
            )

        with gr.Tab("Document Translation"):
            with gr.Row():
                doc_source = language_dropdown("English", "Source Language")
                doc_target = language_dropdown("Hindi", "Target Language")

            upload = gr.File(
                label="Upload Document (PDF, DOCX, TXT)",
                file_types=[".pdf", ".docx", ".txt"],
            )
            with gr.Row():
                translated_pdf = gr.File(label="Translated PDF")
                preview = gr.Textbox(label="Translation Preview", lines=8)

            doc_button = gr.Button("Translate Document", variant="primary")
            doc_button.click(
                fn=process_file,
                inputs=[upload, doc_source, doc_target],
                outputs=[translated_pdf, preview],
            )

        # Footer notes, rendered as separate Markdown components in order.
        footer_lines = (
            "### Supported File Types: PDF, DOCX, TXT",
            "### Features:",
            "- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam",
            "- Context-aware translation that understands idioms and cultural expressions",
            "- Document translation with PDF output",
        )
        for footer_line in footer_lines:
            gr.Markdown(footer_line)

    return interface
203
 
204
# Build the UI and start serving it when the file is run as a script.
if __name__ == "__main__":
    # Remove share=True in production
    gradio_interface().launch(share=True)