akarshan11 committed on
Commit
d98542e
·
verified ·
1 Parent(s): 1096d31

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +159 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import sys
3
+ import pkg_resources
4
+ import tempfile
5
+ import os
6
+ from pathlib import Path
7
+
8
+
9
def check_dependencies():
    """Exit the process if a required third-party distribution is missing.

    Installed distributions are listed with ``importlib.metadata`` and their
    names are compared after PEP 503 normalisation, so ``python_docx`` and
    ``python-docx`` (or ``PyPDF2`` and ``pypdf2``) count as the same project.

    Only the distributions that actually provide the modules this
    application imports are accepted. Similarly named projects such as
    ``docx`` or ``pypdf`` are different distributions and do not make
    ``import docx`` / ``import PyPDF2`` usable, so they do not satisfy the
    check.

    Side effects:
        When something is missing, prints one ``pip install`` hint per
        package to stdout and calls ``sys.exit(1)``. Returns ``None``
        silently otherwise.
    """
    # Local imports keep this function self-contained.
    import re
    from importlib import metadata

    def canonical(name):
        # PEP 503: runs of "-", "_" and "." are equivalent; case is ignored.
        return re.sub(r"[-_.]+", "-", name).lower()

    required_packages = (
        'gradio',
        'transformers',
        'python-docx',
        'PyPDF2',
        'torch',
        'sentencepiece',
    )

    installed = set()
    for dist in metadata.distributions():
        meta = dist.metadata
        # A damaged installation may lack a usable Name field; skip it
        # instead of aborting the whole check.
        name = meta.get("Name") if meta is not None else None
        if name:
            installed.add(canonical(name))

    missing = [pkg for pkg in required_packages if canonical(pkg) not in installed]

    if missing:
        print("Missing required packages. Please install:")
        for pkg in missing:
            print(f"pip install {pkg}")
        sys.exit(1)
31
+
32
+
33
+ # Check dependencies before importing
34
+ check_dependencies()
35
+
36
+ from transformers import pipeline
37
+ import docx
38
+ import PyPDF2
39
+ import io
40
+
41
+
42
class DocumentTranslator:
    """Translate the text of a .docx or .pdf file and save it as a .docx.

    Translation is delegated to a ``transformers`` translation pipeline
    loaded with ``Helsinki-NLP/opus-mt-en-ROMANCE``. That model translates
    from English only and, per its model card, selects the output language
    from a ``>>id<<`` token placed at the start of the input text.
    """

    # Language code of the only source language the loaded model handles.
    SOURCE_LANGUAGE_CODE = "en"

    # Upper bound, in characters, on the text passed to the model per call.
    CHUNK_SIZE = 500

    def __init__(self):
        """Set up the language table and load the translation pipeline.

        Raises:
            Exception: any error raised while creating the pipeline is
                re-raised after installation hints have been printed.
        """
        # Display name -> language code. "English" is listed because the
        # same table feeds both the source and the target selector.
        self.languages = {
            "English": "en",
            "French": "fr",
            "Spanish": "es",
            "Portuguese": "pt",
            "Italian": "it"
        }
        try:
            self.translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ROMANCE")
        except Exception as e:
            print(f"Error initializing translator: {str(e)}")
            print("Please make sure all required packages are installed:")
            print("pip install transformers torch sentencepiece python-docx PyPDF2 gradio")
            raise

    def extract_text_from_docx(self, file):
        """Return the text of every paragraph in *file*, one per line."""
        return "\n".join(paragraph.text for paragraph in docx.Document(file).paragraphs)

    def extract_text_from_pdf(self, file):
        """Return the text of every page in *file*, joined by newlines.

        A page for which the reader yields no text contributes an empty
        string instead of breaking the join.
        """
        pdf_reader = PyPDF2.PdfReader(file)
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    def create_translated_docx(self, original_text, translated_text, output_filename):
        """Write *translated_text* to a new .docx and return its path.

        Each non-blank line of *translated_text* becomes one paragraph.
        *original_text* is accepted for interface compatibility and is not
        used.
        """
        doc = docx.Document()
        for para in translated_text.split("\n"):
            if para.strip():
                doc.add_paragraph(para)

        doc.save(output_filename)
        return output_filename

    @staticmethod
    def _split_text(text, max_chars=500):
        """Split *text* into pieces of at most *max_chars* characters.

        Pieces end at whitespace whenever the window contains any, so words
        are never cut in half; a single run longer than *max_chars* without
        whitespace is cut hard. Surrounding whitespace is stripped from
        every piece and blank input yields an empty list.

        Raises:
            ValueError: if *max_chars* is smaller than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        pieces = []
        remaining = text.strip()
        while len(remaining) > max_chars:
            # Search backwards for the last whitespace that still keeps the
            # piece within the limit; fall back to a hard cut.
            cut = next(
                (i for i in range(max_chars, 0, -1) if remaining[i].isspace()),
                max_chars,
            )
            pieces.append(remaining[:cut].rstrip())
            remaining = remaining[cut:].lstrip()
        if remaining:
            pieces.append(remaining)
        return pieces

    def translate_document(self, file, source_lang, target_lang):
        """Translate an uploaded document and save the result as .docx.

        Args:
            file: a path string, or an object whose ``name`` attribute is
                the file name (as upload widgets provide). It is handed
                unchanged to the extractor for its format.
            source_lang: display name of the source language; must map to
                English in ``self.languages``.
            target_lang: display name of the target language; must be one
                of the non-English entries of ``self.languages``.

        Returns:
            ``(output_path, status_message)``. ``output_path`` is ``None``
            whenever the translation was not produced; the message then
            explains why. Errors are reported this way rather than raised
            so the UI can display them.
        """
        try:
            filename = str(getattr(file, "name", file))
            lowered = filename.lower()  # accept ".DOCX" / ".PDF" as well
            if lowered.endswith('.docx'):
                extract = self.extract_text_from_docx
            elif lowered.endswith('.pdf'):
                extract = self.extract_text_from_pdf
            else:
                return None, "Unsupported file format. Please use .docx or .pdf"

            # Validate the language pair before doing any expensive work.
            if self.languages.get(source_lang) != self.SOURCE_LANGUAGE_CODE:
                return None, "Unsupported source language. This model only translates from English."

            target_code = self.languages.get(target_lang)
            if target_code is None or target_code == self.SOURCE_LANGUAGE_CODE:
                choices = ", ".join(
                    name
                    for name, code in self.languages.items()
                    if code != self.SOURCE_LANGUAGE_CODE
                )
                return None, f"Unsupported target language. Please choose one of: {choices}"

            text = extract(file)
            chunks = self._split_text(text, self.CHUNK_SIZE)
            if not chunks:
                return None, "No text could be extracted from the document."

            # The target-language token tells the multilingual model which
            # language to produce.
            prefix = f">>{target_code}<< "
            translated_chunks = [
                self.translator(prefix + chunk)[0]['translation_text']
                for chunk in chunks
            ]
            translated_text = " ".join(translated_chunks)

            # Create the output directory only once there is something to
            # save, so early returns leave nothing behind on disk.
            temp_dir = tempfile.mkdtemp()
            output_filename = os.path.join(temp_dir, "translated_document.docx")
            output_file = self.create_translated_docx(text, translated_text, output_filename)

            return output_file, "Translation completed successfully!"

        except Exception as e:
            return None, f"Error during translation: {str(e)}"
119
+
120
+
121
+ # Create Gradio interface
122
def create_translation_interface():
    """Build and return the Gradio interface for document translation.

    A ``DocumentTranslator`` is created once and shared by every request.
    The interface takes an uploaded file plus a source and a target
    language, and shows a downloadable file and a status message.

    If anything fails while building the interface, the error is printed
    and the process exits with status 1.
    """
    try:
        doc_translator = DocumentTranslator()

        def translate_file(file, source_lang, target_lang):
            # Forward to the translator only when a file was uploaded.
            if file is not None:
                return doc_translator.translate_document(file, source_lang, target_lang)
            return None, "Please upload a file"

        input_components = [
            gr.File(label="Upload Document (.docx or .pdf)"),
            gr.Dropdown(choices=list(doc_translator.languages), label="Source Language"),
            gr.Dropdown(choices=list(doc_translator.languages), label="Target Language"),
        ]
        output_components = [
            gr.File(label="Download Translated Document"),
            gr.Textbox(label="Status"),
        ]

        return gr.Interface(
            fn=translate_file,
            inputs=input_components,
            outputs=output_components,
            title="Document Translation System",
            description="Upload a document (.docx or .pdf) and select source and target languages for translation.",
            theme="default",
        )
    except Exception as e:
        print(f"Error creating interface: {str(e)}")
        sys.exit(1)
151
+
152
+
153
if __name__ == "__main__":
    # Script entry point: announce start-up, verify dependencies, then
    # build and serve the UI.
    print("Initializing translation system...", "Checking dependencies...", sep="\n")
    check_dependencies()

    print("Starting Gradio interface...")
    iface = create_translation_interface()

    # share=True asks Gradio to create a public link for the app.
    iface.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ soundfile
4
+ gradio
5
+ pydub
6
+ ffmpeg-python
7
+ soundfile