akarshan11 committed on
Commit
476dd48
·
verified ·
1 Parent(s): 14515e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -180
app.py CHANGED
@@ -2,225 +2,117 @@ import os
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
- import fitz # PyMuPDF for PDF processing
6
- import docx2txt # For DOCX processing
7
- from fpdf import FPDF # For creating PDF outputs
8
 
9
# Checkpoint shared by the tokenizer and the translation model.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pick the device first so the model can be loaded and placed in one step.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Display name shown in the UI -> language code handed to the tokenizer.
LANGUAGES = {
    # Major global languages
    "English": "en_XX",
    "Spanish": "es_XX",
    "French": "fr_XX",
    "German": "de_DE",
    "Russian": "ru_RU",
    "Chinese": "zh_CN",
    "Japanese": "ja_XX",
    "Arabic": "ar_AR",

    # Major Indian languages
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Gujarati": "gu_IN",
    "Marathi": "mr_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Urdu": "ur_PK",
}
40
 
41
- # Define translation function first
42
def translate(text: str, source_lang: str, target_lang: str, max_length: int = 1024) -> str:
    """Translate text from the source language to the target language.

    Args:
        text: Text to translate.
        source_lang: Source language name (a key of ``LANGUAGES``).
        target_lang: Target language name (a key of ``LANGUAGES``).
        max_length: Token budget. The tokenized input is truncated to this
            many tokens and generation stops at the same limit, so longer
            inputs are translated only partially.

    Returns:
        str: The translated text, or a human-readable message when the input
        is empty, a language is unsupported, or translation fails. Failures
        are returned rather than raised so the UI can display them.
    """
    # Whitespace-only input has nothing to translate either.
    if not text or not text.strip():
        return "No text provided for translation."

    try:
        # Get language codes
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)

        if not src_lang or not tgt_lang:
            return "Source or target language not supported."

        # The tokenizer prefixes the input with the source-language token.
        tokenizer.src_lang = src_lang

        # Prepare input on the same device as the model.
        inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Language codes are tokens in the vocabulary, so the generic lookup
        # yields the id that must start the generated sequence. The previous
        # ``tokenizer.lang_to_id`` attribute does not exist on the mBART-50
        # tokenizers and sent every call into the ``except`` branch.
        target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

        # Generate translation
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=target_token_id,
                max_length=max_length,
                num_beams=5,
                early_stopping=True,
            )

        # Decode translation (single input, so take the first sequence).
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    except Exception as e:
        return f"Translation error: {str(e)}"
89
-
90
- # File handling functions
91
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The concatenated page texts, or a string starting with
        ``"Error extracting PDF text:"`` when the file cannot be read.
    """
    try:
        # The context manager closes the document (and its file handle) even
        # when a page fails to parse; the original never closed it.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
101
-
102
def extract_text_from_docx(file_path: str) -> str:
    """Return the text of a DOCX file, or an error message string on failure."""
    try:
        extracted = docx2txt.process(file_path)
    except Exception as exc:
        return f"Error extracting DOCX text: {str(exc)}"
    return extracted
108
-
109
def extract_text_from_txt(file_path: str) -> str:
    """Read a text file as UTF-8, falling back to Latin-1 if decoding fails.

    Returns the file contents, or a string starting with
    ``"Error extracting TXT text:"`` when the file cannot be read.
    """

    def _read_as(encoding: str) -> str:
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        try:
            return _read_as('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: retry with Latin-1.
            return _read_as('latin-1')
    except Exception as exc:
        return f"Error extracting TXT text: {str(exc)}"
122
-
123
def save_as_pdf(text: str, output_path: str) -> str:
    """Write ``text`` to a single-font PDF at ``output_path``.

    Returns ``output_path`` on success, or a string starting with
    ``"Error"`` describing why the PDF could not be created or saved.
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)

    try:
        # First attempt: render the text unchanged.
        document.multi_cell(0, 10, text)
    except Exception:
        try:
            # Second attempt: characters outside Latin-1 are replaced with
            # '?', so the text is rendered lossily rather than not at all.
            latin1_text = text.encode('latin-1', 'replace').decode('latin-1')
            document.multi_cell(0, 10, latin1_text)
        except Exception as exc:
            return f"Error creating PDF: {str(exc)}"

    try:
        document.output(output_path)
    except Exception as exc:
        return f"Error saving PDF: {str(exc)}"
    return output_path
145
-
146
- def process_file(file, source_lang: str, target_lang: str) -> tuple[str | None, str]:
147
- """Process uploaded file and translate its content"""
148
- if file is None:
149
- return None, "No file uploaded."
150
-
151
- try:
152
- # Save uploaded file temporarily
153
- temp_file_path = file.name
154
-
155
- # Extract text based on file type
156
- if temp_file_path.lower().endswith('.pdf'):
157
- text = extract_text_from_pdf(temp_file_path)
158
- elif temp_file_path.lower().endswith('.docx'):
159
- text = extract_text_from_docx(temp_file_path)
160
- elif temp_file_path.lower().endswith('.txt'):
161
- text = extract_text_from_txt(temp_file_path)
162
- else:
163
- return None, "Unsupported file format. Please upload PDF, DOCX, or TXT files."
164
-
165
- # Translate the extracted text
166
- translated_text = translate(text, source_lang, target_lang)
167
-
168
- # Save translation as PDF
169
- output_pdf_path = os.path.join(os.path.dirname(temp_file_path),
170
- f"translated_{os.path.basename(temp_file_path)}.pdf")
171
- result = save_as_pdf(translated_text, output_pdf_path)
172
-
173
- if isinstance(result, str) and result.startswith("Error"):
174
- return None, result
175
-
176
- return output_pdf_path, translated_text
177
-
178
- except Exception as e:
179
- return None, f"Error processing file: {str(e)}"
180
 
181
# Build the Gradio UI: one tab for plain text, one for uploaded documents.
# NOTE(review): nesting was reconstructed from statement order; confirm the
# footer Markdown lines belong at the Blocks level rather than inside a tab.
with gr.Blocks(title="Indian Language Translator") as demo:
    gr.Markdown("# Indian & Global Language Translator")
    gr.Markdown("Translate text with understanding of idioms and cultural expressions")

    with gr.Tab("Text Translation"):
        with gr.Row():
            source_lang_text = gr.Dropdown(
                list(LANGUAGES),
                value="English",
                label="Source Language",
            )
            target_lang_text = gr.Dropdown(
                list(LANGUAGES),
                value="Hindi",
                label="Target Language",
            )

        with gr.Row():
            input_text = gr.Textbox(
                label="Enter text to translate",
                lines=5,
                placeholder="Type or paste text here...",
            )
            output_text = gr.Textbox(label="Translation", lines=5)

        translate_btn = gr.Button("Translate Text", variant="primary")
        translate_btn.click(
            fn=translate,
            inputs=[input_text, source_lang_text, target_lang_text],
            outputs=output_text,
        )

    with gr.Tab("Document Translation"):
        with gr.Row():
            source_lang_doc = gr.Dropdown(
                list(LANGUAGES),
                value="English",
                label="Source Language",
            )
            target_lang_doc = gr.Dropdown(
                list(LANGUAGES),
                value="Hindi",
                label="Target Language",
            )

        file_input = gr.File(
            label="Upload Document (PDF, DOCX, TXT)",
            file_types=[".pdf", ".docx", ".txt"],
        )
        with gr.Row():
            output_file = gr.File(label="Translated PDF")
            output_preview = gr.Textbox(label="Translation Preview", lines=8)

        translate_doc_btn = gr.Button("Translate Document", variant="primary")
        translate_doc_btn.click(
            fn=process_file,
            inputs=[file_input, source_lang_doc, target_lang_doc],
            outputs=[output_file, output_preview],
        )

    gr.Markdown("### Supported File Types: PDF, DOCX, TXT")
    gr.Markdown("### Features:")
    gr.Markdown("- Supports major Indian languages including Hindi, Bengali, Tamil, Telugu, Malayalam")
    gr.Markdown("- Context-aware translation that understands idioms and cultural expressions")
    gr.Markdown("- Document translation with PDF output")

if __name__ == "__main__":
    demo.launch(share=True)
 
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 
 
5
 
6
# Keep the interface simple: no complex schema handling.
# Display name shown in the UI -> language code handed to the tokenizer.
LANGUAGES = {
    "English": "en_XX",
    "Hindi": "hi_IN",
    "Bengali": "bn_IN",
    "Tamil": "ta_IN",
    "Telugu": "te_IN",
    "Malayalam": "ml_IN",
    "Urdu": "ur_PK",
}

# Checkpoint to load. The tokenizer and model start out unset and are
# populated by load_model() the first time a translation is requested.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = model = None
22
+
23
def load_model():
    """Load the tokenizer and model on first use and keep them in module globals.

    Later calls are cheap: whichever of the two is already set is reused.
    The model is moved to CUDA when a GPU is available.
    """
    global tokenizer, model

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    if model is not None:
        return

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    if torch.cuda.is_available():
        model = model.to("cuda")
31
+
32
def translate_text(text, source_lang, target_lang):
    """Translate ``text`` between two of the languages listed in ``LANGUAGES``.

    Args:
        text: Text to translate.
        source_lang: Display name of the source language (a key of ``LANGUAGES``).
        target_lang: Display name of the target language (a key of ``LANGUAGES``).

    Returns:
        The translated text, or a human-readable message when the input is
        empty, a language is not supported, or translation fails. Failures
        are returned rather than raised so the UI can show them in the
        output box. Input longer than 512 tokens is truncated.
    """
    # Whitespace-only input has nothing to translate either.
    if not text or not text.strip():
        return "Please enter some text to translate."

    try:
        # Get language codes; validate before paying for a model load.
        src_lang = LANGUAGES.get(source_lang)
        tgt_lang = LANGUAGES.get(target_lang)
        if not src_lang or not tgt_lang:
            return "Source or target language not supported."

        load_model()

        # The tokenizer prefixes the input with the source-language token.
        tokenizer.src_lang = src_lang

        # Tokenize and move the tensors to wherever the model actually lives.
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Language codes are tokens in the vocabulary, so the generic lookup
        # yields the id that must start the generated sequence. The previous
        # ``tokenizer.lang_to_id`` attribute does not exist on the mBART-50
        # tokenizers and sent every call into the ``except`` branch.
        target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

        # Generate translation
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=target_token_id,
                max_length=512,
                num_beams=4,
                early_stopping=True,
            )

        # Decode (single input, so take the first sequence).
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    except Exception as e:
        return f"Translation Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
# Build the Gradio UI: inputs and controls on the left, translation on the right.
with gr.Blocks(title="Simple Language Translator") as demo:
    gr.Markdown("# Simple Language Translator")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", placeholder="Enter text to translate...", lines=5)
            source_lang = gr.Dropdown(choices=list(LANGUAGES), value="English", label="Source Language")
            target_lang = gr.Dropdown(choices=list(LANGUAGES), value="Hindi", label="Target Language")
            translate_btn = gr.Button("Translate")

        with gr.Column():
            output_text = gr.Textbox(label="Translation", lines=5)

    # Clicking the button runs the translation and fills the output box.
    translate_btn.click(fn=translate_text, inputs=[input_text, source_lang, target_lang], outputs=output_text)

    # Clickable sample inputs; their outputs are cached via translate_text.
    gr.Examples(
        examples=[
            ["Hello, how are you?", "English", "Hindi"],
            ["नमस्ते, कैसे हैं आप?", "Hindi", "English"],
        ],
        inputs=[input_text, source_lang, target_lang],
        outputs=output_text,
        fn=translate_text,
        cache_examples=True,
    )

if __name__ == "__main__":
    demo.launch(share=True)