Berzelius255 committed on
Commit 630de19 · verified · 1 Parent(s): dd53d59

Uploaded 3 files

Files changed (3)
  1. packages.txt +2 -0
  2. polyglot.py +496 -0
  3. requirements-txt.txt +29 -0
packages.txt ADDED
@@ -0,0 +1,2 @@
1
+ libgl1-mesa-glx
2
+ libglib2.0-0
polyglot.py ADDED
@@ -0,0 +1,496 @@
1
+ import streamlit as st
3
+ import whisper
4
+ from googletrans import Translator
5
+ import googletrans
6
+ from gtts import gTTS
7
+ from io import BytesIO
8
+ import os
9
+ import tempfile
10
+ from PIL import Image
11
+ import easyocr
12
+ import cv2
13
+ import numpy as np
14
+ import asyncio
15
+ import time
16
+
17
+ # Page configuration
18
+ st.set_page_config(
19
+ page_title="Polyglot - Multilingual Communication Suite",
20
+ page_icon="🌍",
21
+ layout="wide",
22
+ initial_sidebar_state="expanded"
23
+ )
24
+
25
+ # Custom CSS for better UI
26
+ st.markdown("""
27
+ <style>
28
+ .main-header {
29
+ font-size: 2.5rem;
30
+ font-weight: 700;
31
+ color: #1E88E5;
32
+ margin-bottom: 1rem;
33
+ text-align: center;
34
+ }
35
+ .sub-header {
36
+ font-size: 1.5rem;
37
+ font-weight: 500;
38
+ color: #424242;
39
+ margin-bottom: 1rem;
40
+ }
41
+ .card {
42
+ padding: 1.5rem;
43
+ border-radius: 0.7rem;
44
+ background-color: #f8f9fa;
45
+ box-shadow: 0 0.15rem 1.75rem 0 rgba(58, 59, 69, 0.15);
46
+ margin-bottom: 1.5rem;
47
+ }
48
+ .action-button {
49
+ background-color: #1E88E5;
50
+ color: white;
51
+ border-radius: 0.3rem;
52
+ padding: 0.5rem 1rem;
53
+ text-align: center;
54
+ text-decoration: none;
55
+ display: inline-block;
56
+ font-size: 1rem;
57
+ margin: 0.5rem 0;
58
+ cursor: pointer;
59
+ }
60
+ .stProgress .st-eb {
61
+ background-color: #1E88E5;
62
+ }
63
+ .stTabs [data-baseweb="tab-list"] {
64
+ gap: 1px;
65
+ }
66
+ .stTabs [data-baseweb="tab"] {
67
+ background-color: #F0F2F6;
68
+ border-radius: 4px 4px 0 0;
69
+ border: none;
70
+ padding: 10px 16px;
71
+ color: #424242;
72
+ }
73
+ .stTabs [aria-selected="true"] {
74
+ background-color: #1E88E5 !important;
75
+ color: white !important;
76
+ }
77
+ .stMarkdown a {
78
+ color: #1E88E5;
79
+ }
80
+ footer {visibility: hidden;}
81
+ </style>
82
+ """, unsafe_allow_html=True)
83
+
84
+ # Initialize app states
85
+ if "transcription" not in st.session_state:
86
+ st.session_state.transcription = ""
87
+ if "translated_text" not in st.session_state:
88
+ st.session_state.translated_text = ""
89
+ if "extracted_text" not in st.session_state:
90
+ st.session_state.extracted_text = ""
91
+ if "translated_ocr_text" not in st.session_state:
92
+ st.session_state.translated_ocr_text = ""
93
+ if "progress" not in st.session_state:
94
+ st.session_state.progress = 0
95
+ if "processing" not in st.session_state:
96
+ st.session_state.processing = False
97
+
98
+ # App Header
99
+ col1, col2, col3 = st.columns([1, 2, 1])
100
+ with col2:
101
+ st.markdown('<div class="main-header">🌍 Polyglot</div>', unsafe_allow_html=True)
102
+ st.markdown("""
103
+ <div style="text-align: center; margin-bottom: 2rem;">
104
+ Your all-in-one solution for transcription, translation, and text extraction
105
+ </div>
106
+ """, unsafe_allow_html=True)
107
+
108
+ # Sidebar with improved UI
109
+ with st.sidebar:
110
+ st.image("https://via.placeholder.com/150x150.png?text=Polyglot", width=150)
111
+ st.markdown("### Settings")
112
+
113
+ with st.expander("Translation Settings", expanded=True):
114
+ language = st.selectbox(
115
+ "Target Language",
116
+ options=list(googletrans.LANGUAGES.keys()),
117
+ format_func=lambda x: f"{googletrans.LANGUAGES[x].capitalize()} ({x})",
118
+ index=list(googletrans.LANGUAGES.keys()).index('en')
119
+ )
120
+
121
+ with st.expander("OCR Settings", expanded=True):
122
+ available_languages = ['en', 'es', 'fr', 'de', 'zh', 'ja', 'ko', 'ar', 'hi']
123
+ language_names = {
124
+ 'en': 'English', 'es': 'Spanish', 'fr': 'French',
125
+ 'de': 'German', 'zh': 'Chinese', 'ja': 'Japanese',
126
+ 'ko': 'Korean', 'ar': 'Arabic', 'hi': 'Hindi'
127
+ }
128
+
129
+ ocr_languages = st.multiselect(
130
+ "OCR Languages",
131
+ options=available_languages,
132
+ format_func=lambda x: language_names.get(x, x.capitalize()),
133
+ default=['en']
134
+ )
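+ # Note: EasyOCR limits which languages can share one Reader; Chinese, Japanese and Korean, for example, can only be combined with English.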
135
+
136
+ st.markdown("### About")
137
+ st.markdown("""
138
+ Polyglot helps break language barriers with AI-powered
139
+ translation and transcription tools.
140
+
141
+ **Features:**
142
+ - Audio transcription
143
+ - Text translation
144
+ - OCR text extraction
145
+ - Text-to-speech conversion
146
+ """)
147
+
148
+ # Initialize models (with loading spinners)
149
+ @st.cache_resource(show_spinner=False)
150
+ def load_whisper_model():
151
+ with st.spinner("Loading speech recognition model..."):
152
+ return whisper.load_model("base")
153
+
154
+ @st.cache_resource(show_spinner=False)
155
+ def load_ocr_reader(languages):
156
+ with st.spinner("Loading OCR model..."):
157
+ return easyocr.Reader(languages if languages else ['en'])
158
+
159
+ whisper_model = load_whisper_model()
160
+ translator = Translator()
161
+
162
+ # Navigation
163
+ icons = {"Audio Transcription": "🎤", "Image OCR": "📄", "Help": "❓"}
164
+ selected_tab = st.radio(
165
+ "Choose functionality:",
166
+ ["Audio Transcription", "Image OCR", "Help"],
167
+ format_func=lambda x: f"{icons[x]} {x}",
168
+ horizontal=True
169
+ )
170
+
171
+ # Helper functions
172
+ def transcribe_audio(audio_file):
+     """Transcribe an uploaded audio file with Whisper."""
+     # Write the upload to a real file (fully flushed and closed) so Whisper's
+     # ffmpeg loader can read it, then clean up even if transcription fails.
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+         temp_audio.write(audio_file.read())
+         temp_path = temp_audio.name
+     try:
+         result = whisper_model.transcribe(temp_path)
+     finally:
+         os.remove(temp_path)
+     return result['text']
179
+
180
+ def text_to_speech(text, lang='en'):
181
+ """Convert text to speech using gTTS."""
182
+ tts = gTTS(text=text, lang=lang)
183
+ audio_bytes = BytesIO()
184
+ tts.write_to_fp(audio_bytes)
185
+ return audio_bytes.getvalue()
186
+
187
+ def draw_boxes(image, results):
188
+ """Draw bounding boxes around detected text."""
189
+ image_np = np.array(image)
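+ # The array comes from PIL in RGB order, so the (0, 0, 255) colour below renders as blue rather than OpenCV's usual BGR red.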
190
+ for (bbox, text, prob) in results:
191
+ # Unpack the bounding box
192
+ (top_left, top_right, bottom_right, bottom_left) = bbox
193
+ top_left = tuple(map(int, top_left))
194
+ bottom_right = tuple(map(int, bottom_right))
195
+
196
+ # Draw the bounding box
197
+ cv2.rectangle(image_np, top_left, bottom_right, (0, 0, 255), 2)
198
+
199
+ # Add text label with confidence score
200
+ label = f"{text} ({prob:.2f})"
201
+ cv2.putText(
202
+ image_np, label, (top_left[0], top_left[1] - 10),
203
+ cv2.FONT_HERSHEY_SIMPLEX, 1.4, (0, 0, 255), 2
204
+ )
205
+ return image_np
206
+
207
+ def simulate_progress():
208
+ """Simulate progress for better user experience."""
209
+ if st.session_state.processing:
210
+ progress_bar = st.progress(0)
211
+ for i in range(100):
212
+ time.sleep(0.01)
213
+ progress_bar.progress(i + 1)
214
+ st.session_state.processing = False
215
+ return progress_bar
216
+ return None
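+
+ # Version-tolerant translation wrapper: googletrans 4.0.0rc1 (as pinned in the
+ # requirements) exposes a synchronous Translator.translate, while newer 4.x
+ # releases return a coroutine. Checking for a coroutine keeps both working, so
+ # the tabs below call this helper instead of wrapping the call in asyncio.run().
+ def translate_text(text, dest):
+     """Translate text to the target language with googletrans."""
+     result = translator.translate(text, dest=dest)
+     if asyncio.iscoroutine(result):
+         result = asyncio.run(result)
+     return result.text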
217
+
218
+ # Tab 1: Audio Transcription
219
+ if selected_tab == "Audio Transcription":
220
+ st.markdown('<div class="sub-header">🎤 Audio Transcription and Translation</div>', unsafe_allow_html=True)
221
+
222
+ col1, col2 = st.columns([1, 1])
223
+
224
+ with col1:
225
+ st.markdown('<div class="card">', unsafe_allow_html=True)
226
+ st.markdown("### Upload Audio")
227
+ st.write("Supported formats: WAV, MP3, M4A")
228
+
229
+ uploaded_audio = st.file_uploader(
230
+ "Drag and drop your audio file here",
231
+ type=["wav", "mp3", "m4a"],
232
+ key="audio_uploader"
233
+ )
234
+
235
+ if uploaded_audio:
236
+ st.audio(uploaded_audio, format=f"audio/{uploaded_audio.name.split('.')[-1]}")
237
+
238
+ if st.button("🔍 Transcribe Audio", key="transcribe_btn"):
239
+ st.session_state.processing = True
240
+ progress_bar = simulate_progress()
241
+
242
+ try:
243
+ st.session_state.transcription = transcribe_audio(uploaded_audio)
244
+ st.success("✅ Transcription complete!")
245
+ except Exception as e:
246
+ st.error(f"Error during transcription: {str(e)}")
247
+ st.markdown('</div>', unsafe_allow_html=True)
248
+
249
+ with col2:
250
+ st.markdown('<div class="card">', unsafe_allow_html=True)
251
+ st.markdown("### Results")
252
+
253
+ tabs = st.tabs(["Transcription", "Translation", "Audio Output"])
254
+
255
+ with tabs[0]:
256
+ if st.session_state.transcription:
257
+ st.text_area(
258
+ "Original Text:",
259
+ st.session_state.transcription,
260
+ height=200
261
+ )
262
+ else:
263
+ st.info("Transcribed text will appear here after processing.")
264
+
265
+ with tabs[1]:
266
+ if st.session_state.transcription:
267
+ if st.button("🌍 Translate Text", key="translate_btn"):
268
+ st.session_state.processing = True
269
+ progress_bar = simulate_progress()
270
+
271
+ try:
272
+ st.session_state.translated_text = translate_text(
+ st.session_state.transcription, dest=language
+ )
276
+ st.success("✅ Translation complete!")
277
+ except Exception as e:
278
+ st.error(f"Error during translation: {str(e)}")
279
+
280
+ if st.session_state.translated_text:
281
+ st.text_area(
282
+ f"Translated to {googletrans.LANGUAGES[language].capitalize()}:",
283
+ st.session_state.translated_text,
284
+ height=200
285
+ )
286
+ else:
287
+ st.info("Translated text will appear here after processing.")
288
+
289
+ with tabs[2]:
290
+ if st.session_state.translated_text:
291
+ try:
292
+ audio_output = text_to_speech(st.session_state.translated_text, language)
293
+ st.audio(audio_output, format="audio/mp3")
294
+ st.download_button(
295
+ label="Download Audio",
296
+ data=audio_output,
297
+ file_name=f"translated_audio_{language}.mp3",
298
+ mime="audio/mp3"
299
+ )
300
+ except Exception as e:
301
+ st.error(f"Error generating speech: {str(e)}")
302
+ else:
303
+ st.info("Audio output will be available after translation.")
304
+
305
+ st.markdown('</div>', unsafe_allow_html=True)
306
+
307
+ # Tab 2: Image OCR
308
+ elif selected_tab == "Image OCR":
309
+ st.markdown('<div class="sub-header">📄 Image OCR and Translation</div>', unsafe_allow_html=True)
310
+
311
+ reader = load_ocr_reader(ocr_languages)
312
+
313
+ col1, col2 = st.columns([1, 1])
314
+
315
+ with col1:
316
+ st.markdown('<div class="card">', unsafe_allow_html=True)
317
+ st.markdown("### Upload Image")
318
+ st.write("Supported formats: JPG, PNG, JPEG")
319
+
320
+ uploaded_image = st.file_uploader(
321
+ "Drag and drop your image file here",
322
+ type=["jpg", "jpeg", "png"],
323
+ key="image_uploader"
324
+ )
325
+
326
+ if uploaded_image:
327
+ image = Image.open(uploaded_image).convert("RGB")  # normalise RGBA/greyscale uploads for EasyOCR and OpenCV
328
+ st.image(image, caption="Uploaded Image", use_container_width=True)
329
+
330
+ if st.button("🔍 Extract Text", key="extract_btn"):
331
+ st.session_state.processing = True
332
+ progress_bar = simulate_progress()
333
+
334
+ try:
335
+ image_np = np.array(image)
336
+ results = reader.readtext(image_np)
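+ # readtext returns a list of (bounding_box, text, confidence) tuples.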
337
+ st.session_state.extracted_text = " ".join([result[1] for result in results])
338
+
339
+ # Store the image with boxes for display later
340
+ if results:
341
+ st.session_state.image_with_boxes = draw_boxes(image, results)
342
+
343
+ st.success("✅ Text extraction complete!")
344
+ except Exception as e:
345
+ st.error(f"Error during text extraction: {str(e)}")
346
+ st.markdown('</div>', unsafe_allow_html=True)
347
+
348
+ with col2:
349
+ st.markdown('<div class="card">', unsafe_allow_html=True)
350
+ st.markdown("### Results")
351
+
352
+ tabs = st.tabs(["Extracted Text", "Text Detection", "Translation", "Audio Output"])
353
+
354
+ with tabs[0]:
355
+ if st.session_state.extracted_text:
356
+ st.text_area(
357
+ "Extracted Text:",
358
+ st.session_state.extracted_text,
359
+ height=150
360
+ )
361
+ else:
362
+ st.info("Extracted text will appear here after processing.")
363
+
364
+ with tabs[1]:
365
+ if hasattr(st.session_state, 'image_with_boxes'):
366
+ st.image(
367
+ st.session_state.image_with_boxes,
368
+ caption="Text Detection Visualization",
369
+ use_container_width=True
370
+ )
371
+ else:
372
+ st.info("Text detection visualization will appear here after processing.")
373
+
374
+ with tabs[2]:
375
+ if st.session_state.extracted_text:
376
+ if st.button("🌍 Translate Extracted Text", key="translate_ocr_btn"):
377
+ st.session_state.processing = True
378
+ progress_bar = simulate_progress()
379
+
380
+ try:
381
+ st.session_state.translated_ocr_text = translate_text(
+ st.session_state.extracted_text, dest=language
+ )
385
+ st.success("✅ Translation complete!")
386
+ except Exception as e:
387
+ st.error(f"Error during translation: {str(e)}")
388
+
389
+ if st.session_state.translated_ocr_text:
390
+ st.text_area(
391
+ f"Translated to {googletrans.LANGUAGES[language].capitalize()}:",
392
+ st.session_state.translated_ocr_text,
393
+ height=150
394
+ )
395
+ else:
396
+ st.info("Translated text will appear here after processing.")
397
+
398
+ with tabs[3]:
399
+ if st.session_state.translated_ocr_text:
400
+ try:
401
+ audio_output = text_to_speech(st.session_state.translated_ocr_text, language)
402
+ st.audio(audio_output, format="audio/mp3")
403
+ st.download_button(
404
+ label="Download Audio",
405
+ data=audio_output,
406
+ file_name=f"ocr_translated_audio_{language}.mp3",
407
+ mime="audio/mp3"
408
+ )
409
+ except Exception as e:
410
+ st.error(f"Error generating speech: {str(e)}")
411
+ else:
412
+ st.info("Audio output will be available after translation.")
413
+
414
+ st.markdown('</div>', unsafe_allow_html=True)
415
+
416
+ # Tab 3: Help
417
+ elif selected_tab == "Help":
418
+ st.markdown('<div class="sub-header">❓ Help & Documentation</div>', unsafe_allow_html=True)
419
+
420
+ st.markdown('<div class="card">', unsafe_allow_html=True)
421
+ st.markdown("### Getting Started")
422
+ st.markdown("""
423
+ Welcome to Polyglot! This app helps you break language barriers with AI-powered transcription, translation, and OCR capabilities.
424
+
425
+ **Quick Start Guide:**
426
+ 1. Choose a feature from the navigation bar (Audio Transcription or Image OCR)
427
+ 2. Upload your file (audio or image)
428
+ 3. Process the file (transcribe or extract text)
429
+ 4. Translate the extracted text to your target language
430
+ 5. Generate and download speech from the translated text
431
+
432
+ For best results, use clear audio recordings and high-quality images with legible text.
433
+ """)
434
+ st.markdown('</div>', unsafe_allow_html=True)
435
+
436
+ col1, col2 = st.columns([1, 1])
437
+
438
+ with col1:
439
+ st.markdown('<div class="card">', unsafe_allow_html=True)
440
+ st.markdown("### Audio Transcription Tips")
441
+ st.markdown("""
442
+ - **Supported formats:** WAV, MP3, M4A
443
+ - **Best audio quality:** Clear speech, minimal background noise
444
+ - **Recommended duration:** 5 seconds to 10 minutes
445
+ - **Languages:** Multiple languages supported via Whisper model
446
+
447
+ **Troubleshooting:**
448
+ - If transcription is inaccurate, try reducing background noise
449
+ - For long files, allow extra processing time
450
+ - If you encounter errors, try a different audio format
451
+ """)
452
+ st.markdown('</div>', unsafe_allow_html=True)
453
+
454
+ with col2:
455
+ st.markdown('<div class="card">', unsafe_allow_html=True)
456
+ st.markdown("### Image OCR Tips")
457
+ st.markdown("""
458
+ - **Supported formats:** JPG, PNG, JPEG
459
+ - **Best image quality:** High resolution, good lighting, clear contrast
460
+ - **OCR languages:** Select appropriate language(s) for your text
461
+ - **Text styles:** Works with printed text and some handwriting
462
+
463
+ **Troubleshooting:**
464
+ - If text detection fails, try improving image contrast
465
+ - For complex layouts, results may vary
466
+ - Multi-language documents may require multiple language selections
467
+ """)
468
+ st.markdown('</div>', unsafe_allow_html=True)
469
+
470
+ st.markdown('<div class="card">', unsafe_allow_html=True)
471
+ st.markdown("### Frequently Asked Questions")
472
+
473
+ with st.expander("What languages are supported for translation?"):
474
+ st.write("Polyglot supports translation to and from over 100 languages using Google Translate's API.")
475
+
476
+ with st.expander("How accurate is the audio transcription?"):
477
+ st.write("The app uses OpenAI's Whisper model which provides good accuracy for clear speech. Performance may vary with accents, background noise, and audio quality.")
478
+
479
+ with st.expander("Can I process handwritten text?"):
480
+ st.write("Yes, EasyOCR can detect some handwritten text, but performance is best with printed text. Results depend on handwriting clarity and image quality.")
481
+
482
+ with st.expander("Is there a file size limit?"):
483
+ st.write("Streamlit has a default file size limit of 200MB, but we recommend keeping audio files under 10MB and images under 5MB for optimal performance.")
484
+
485
+ with st.expander("How can I improve the translation quality?"):
486
+ st.write("Ensure the transcription or text extraction is accurate first. Clear, grammatically correct source text leads to better translations.")
487
+
488
+ st.markdown('</div>', unsafe_allow_html=True)
489
+
490
+ # Footer
491
+ st.markdown("""
492
+ <div style="text-align: center; margin-top: 2rem; padding: 1rem; background-color: #f0f2f6; border-radius: 0.7rem;">
493
+ <p>Created with ❤️ using Streamlit, Whisper, EasyOCR, and Google Translate</p>
494
+ <p>© 2025 Polyglot - Breaking language barriers with AI</p>
495
+ </div>
496
+ """, unsafe_allow_html=True)
requirements-txt.txt ADDED
@@ -0,0 +1,29 @@
1
+ # Core dependencies
2
+ streamlit>=1.28.0
3
+ numpy>=1.20.0
4
+ pandas>=1.3.0
5
+ Pillow>=9.0.0
6
+
7
+ # Audio processing
8
+ openai-whisper>=20230314
9
+ ffmpeg-python>=0.2.0
10
+ pydub>=0.25.1
11
+
12
+ # Text translation
13
+ googletrans==4.0.0rc1
14
+ gTTS>=2.3.1
15
+
16
+ # Image processing and OCR
17
+ easyocr>=1.6.2
18
+ opencv-python>=4.5.5
19
+ torch>=1.10.0
20
+ torchvision>=0.11.0
21
+
22
+ # Additional utilities
23
+ python-dotenv>=0.19.0
24
+ requests>=2.27.0
26
+
27
+ # Optional - for improved UI
28
+ matplotlib>=3.5.0
29
+ plotly>=5.5.0