import base64
import hashlib
import os
import re
import tempfile
from io import BytesIO
from pathlib import Path

import cohere
import gradio as gr
import pandas as pd
import pydub
import requests
from dotenv import load_dotenv
from gradio_client import Client
from gradio_rich_textbox import RichTextbox
from PIL import Image
from pydub import AudioSegment
from pydub.utils import make_chunks
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
from surya.ocr import run_ocr

from lang_list import TEXT_SOURCE_LANGUAGE_NAMES, LANGUAGE_NAME_TO_CODE, text_source_language_codes
# Heading and tagline handed to the Gradio interface.
title = "# Welcome to AyaTonic"
description = "Learn a New Language With Aya"
# Load environment variables
load_dotenv()
# Cohere API key and SeamlessM4T endpoint, read from the environment.
# os.getenv returns None for a variable that is not set.
COHERE_API_KEY = os.getenv('CO_API_KEY')
SEAMLESSM4T = os.getenv('SEAMLESSM4T')
# Language table: the "name" column fills the dropdowns and the "code" column
# is looked up by get_language_code().
df = pd.read_csv("lang_list.csv")
choices = df["name"].to_list()
# NOTE(review): this module-level value appears unused; the functions below
# take their own `inputlanguage` parameter -- confirm before removing.
inputlanguage = ""
# Prompt suffixes appended to the text sent to the language model.
# `producetext` and `translatetextinst` are str.format templates.
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
# NOTE(review): the "Example: (.?)" fragments read as if surrounding markup
# (presumably colored span tags) was lost -- confirm against the intended prompt.
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs: Example: (.?). Don't change other format of span tag other than color and the (.?)."
translatetextinst = "\n\nthe above text is a learning aid. you must use markdown format to translate the above into {inputlanguage} :'"
# Regex per color, used as the default pattern set of TaggedPhraseExtractor.
# NOTE(review): all three are the same bare lazy group, which only ever
# matches empty strings; the color-specific delimiters were presumably
# stripped -- restore them from the original source.
patterns = {
"red": r'(.*?)',
"blue": r'(.*?)',
"green": r'(.*?)',
}
# Dictionaries to hold the matches
# NOTE(review): appears unused; the code below binds its own local `matches`.
matches = {
"red": [],
"blue": [],
"green": [],
}
# Service clients are created at import time from the values read above.
co = cohere.Client(COHERE_API_KEY)
audio_client = Client(SEAMLESSM4T)
def get_language_code(language_name):
    """
    Look up the language code for a language name, ignoring case.

    Args:
        language_name: Name to match against the "name" column of ``df``.

    Returns:
        The "code" value of the first matching row, or None when
        ``language_name`` is not a string or no row matches.
    """
    if not isinstance(language_name, str):
        # A missing selection arrives as None; calling .lower() on it would
        # raise AttributeError instead of reporting "not found".
        print(f"Language name '{language_name}' not found.")
        return None

    wanted = language_name.lower()
    matching_codes = df.loc[df['name'].str.lower() == wanted, 'code'].values
    if len(matching_codes) == 0:
        print(f"Language name '{language_name}' not found.")
        return None
    return matching_codes[0]
def translate_text(text, inputlanguage, target_language):
    """
    Send ``text`` plus the module's prompt templates to the Aya model.

    The prompt is ``text`` followed by ``producetext`` (formatted with
    ``target_language``), a newline, and ``translatetextinst`` (formatted
    with ``inputlanguage``). Returns the text of the first generation.
    """
    blog_request = producetext.format(target_language=target_language)
    translate_request = translatetextinst.format(inputlanguage=inputlanguage)

    generation_options = {
        "model": 'c4ai-aya',
        "prompt": f"{text}{blog_request}\n{translate_request}",
        "max_tokens": 2986,
        "temperature": 0.6,
        "k": 0,
        "stop_sequences": [],
        "return_likelihoods": 'NONE',
    }
    completion = co.generate(**generation_options)
    return completion.generations[0].text
class LongAudioProcessor:
    """Convert a long recording to text by processing it in fixed-length chunks."""

    def __init__(self, audio_client, api_key=None):
        """
        Args:
            audio_client: Client object kept on ``self.client``.
            api_key: Optional key kept on ``self.api_key``.
        """
        self.client = audio_client
        # Per-chunk conversion is delegated to the module-level function.
        self.process_audio_to_text = process_audio_to_text
        self.api_key = api_key

    def process_long_audio(self, audio_path, inputlanguage, outputlanguage, chunk_length_ms=20000):
        """
        Split the audio file into chunks and convert each chunk to text.

        Args:
            audio_path: Path of the audio file to load.
            inputlanguage: Source language passed to the per-chunk converter.
            outputlanguage: Target language passed to the per-chunk converter.
            chunk_length_ms: Length of each chunk in milliseconds.

        Returns:
            The chunk texts joined with single spaces, in order. A chunk
            whose conversion fails is logged and left out.
        """
        audio = AudioSegment.from_file(audio_path)
        chunks = make_chunks(audio, chunk_length_ms)
        texts = []
        # A private temporary directory keeps concurrent requests from
        # overwriting or deleting each other's "chunk{i}.wav" files, and is
        # removed even if an export fails part-way through.
        with tempfile.TemporaryDirectory(prefix="audio_chunks_") as chunk_dir:
            for i, chunk in enumerate(chunks):
                chunk_name = os.path.join(chunk_dir, f"chunk{i}.wav")
                with open(chunk_name, 'wb') as file:
                    chunk.export(file, format="wav")
                try:
                    result = self.process_audio_to_text(
                        chunk_name,
                        inputlanguage=inputlanguage,
                        outputlanguage=outputlanguage,
                    )
                    texts.append(result.strip())
                except Exception as e:
                    # Best effort: one bad chunk must not discard the rest.
                    print(f"Error processing {chunk_name}: {e}")
                finally:
                    # Free disk space as soon as the chunk is no longer needed.
                    if os.path.exists(chunk_name):
                        os.remove(chunk_name)
        return " ".join(texts).strip()
class TaggedPhraseExtractor:
    """Find phrases in a text using one regular expression per color."""

    def __init__(self, text=''):
        """Start with ``text`` and a private copy of the module-level ``patterns``."""
        self.text = text
        # Copy so add_pattern() does not mutate the shared module-level dict.
        self.patterns = dict(patterns)

    def set_text(self, text):
        """Set the text to search within."""
        self.text = text

    def add_pattern(self, color, pattern):
        """Add a new color and its associated pattern."""
        self.patterns[color] = pattern

    def _find_all(self):
        """Return ``{color: [phrase, ...]}`` with every match in text order."""
        return {
            color: re.findall(pattern, self.text)
            for color, pattern in self.patterns.items()
        }

    def extract_phrases(self):
        """Return ``{color: phrases}`` holding the three longest phrases per color."""
        return {
            color: sorted(found, key=len, reverse=True)[:3]
            for color, found in self._find_all().items()
        }

    def print_phrases(self):
        """Print every phrase per color, followed by the three longest ones."""
        for color, found in self._find_all().items():
            print(f"Phrases with color {color}:")
            for phrase in found:
                print(f"- {phrase}")
            print(f"\nThree longest phrases for color {color}:")
            for phrase in sorted(found, key=len, reverse=True)[:3]:
                print(f"- {phrase}")
            print()
def process_audio_to_text(audio_path, inputlanguage="English", outputlanguage="English"):
    """
    Convert audio input to text using the Gradio client.

    Args:
        audio_path: Path of the audio file to send.
        inputlanguage: Source language name passed to the endpoint.
        outputlanguage: Target language name passed to the endpoint.

    Returns:
        The endpoint's result when it is a plain string, otherwise the first
        element of the returned sequence.
    """
    audio_client = Client(SEAMLESSM4T)
    result = audio_client.predict(
        audio_path,
        inputlanguage,
        outputlanguage,
        api_name="/s2tt"
    )
    print("Audio Result: ", result)
    if isinstance(result, str):
        # A single-output endpoint yields a bare string; indexing it would
        # return only its first character.
        return result
    return result[0]
# A WAV header alone is 44 bytes (about 60 base64 characters), so anything
# shorter cannot be an encoded audio payload; this also keeps short plain
# words that happen to be valid base64 from being mistaken for audio.
_MIN_BASE64_AUDIO_CHARS = 64


def _is_base64(candidate):
    """Return True when ``candidate`` is a strictly valid base64 payload of plausible audio size."""
    payload = candidate.strip()
    if payload.startswith("data:") and "," in payload:
        # Drop a "data:<mime>;base64," prefix if present.
        payload = payload.split(",", 1)[1]
    if len(payload) < _MIN_BASE64_AUDIO_CHARS or len(payload) % 4:
        return False
    try:
        base64.b64decode(payload, validate=True)
    except ValueError:  # binascii.Error is a ValueError subclass
        return False
    return True


def process_text_to_audio(text, translatefrom="English", translateto="English"):
    """
    Convert text input to audio using the Gradio client.

    Args:
        text: Text to send to the "/t2st" endpoint.
        translatefrom: Source language name passed to the endpoint.
        translateto: Target language name passed to the endpoint.

    Returns:
        The first string in the response that names an audio file
        (contains ".mp3", ".wav" or ".ogg") or is a base64 payload.
        Returns "" when the call fails or no such item is present; the
        error is printed rather than raised.
    """
    try:
        result = audio_client.predict(
            text,
            translatefrom,
            translateto,
            api_name="/t2st"
        )
        if not isinstance(result, (tuple, list)) or len(result) < 2:
            raise ValueError("Unexpected result format from audio_client.predict")
        # Print or log the raw API response for inspection
        print("Raw API Response:", result)

        for item in result:
            if not isinstance(item, str):
                continue
            lowered = item.lower()
            if any(ext in lowered for ext in ['.mp3', '.wav', '.ogg']) or _is_base64(item):
                # Returned as-is: callers receive the path/URL or payload directly.
                return item
        raise ValueError("No audio file path found in the response")
    except Exception as e:
        print(f"Error processing text to audio: {e}")
        return ""
def save_audio_data_to_file(audio_data, directory="audio_files", filename="output_audio.wav"):
    """
    Write ``audio_data`` (bytes) to ``directory/filename`` and return that path.

    The directory is created first if it does not exist yet.
    """
    os.makedirs(directory, exist_ok=True)
    destination = os.path.join(directory, filename)
    with open(destination, 'wb') as sink:
        sink.write(audio_data)
    return destination
# Ensure the function that reads the audio file checks if the path is a file
def read_audio_file(file_path):
    """
    Read and return the audio file content as bytes.

    Args:
        file_path: Path of the file to read.

    Raises:
        ValueError: If ``file_path`` is a directory, or does not refer to an
            existing regular file.
    """
    if os.path.isdir(file_path):
        raise ValueError(f"Expected a file path, got a directory: {file_path}")
    if not os.path.isfile(file_path):
        # Previously a missing path was misreported as a directory.
        raise ValueError(f"Expected a file path, but no such file exists: {file_path}")
    with open(file_path, 'rb') as file:
        return file.read()
def initialize_ocr_models():
    """
    Load the OCR detection and recognition components.

    Returns a 4-tuple in the order
    ``(det_processor, det_model, rec_model, rec_processor)``.
    """
    detection_processor = load_det_processor()
    detection_model = load_det_model()
    recognition_model = load_rec_model()
    recognition_processor = load_rec_processor()
    return (
        detection_processor,
        detection_model,
        recognition_model,
        recognition_processor,
    )
class OCRProcessor:
    """Run OCR on images or PDFs with models loaded once per instance."""

    def __init__(self, lang_code=None):
        """
        Args:
            lang_code: Language code(s) for recognition: a list of codes, a
                single code string, or None for the default ``["en"]``.
        """
        self.lang_code = self._normalize_langs(lang_code)
        self.det_processor, self.det_model, self.rec_model, self.rec_processor = initialize_ocr_models()

    @staticmethod
    def _normalize_langs(lang_code):
        """Return ``lang_code`` as a fresh list of codes (``["en"]`` for None)."""
        if lang_code is None:
            # Replaces the former mutable default argument ["en"].
            return ["en"]
        if isinstance(lang_code, str):
            # run_ocr receives [self.lang_code]; a bare string there would be
            # treated as a sequence of single characters, not as one code.
            return [lang_code]
        return list(lang_code)

    def process_image(self, image):
        """
        Process a PIL image and return the first OCR prediction.
        """
        predictions = run_ocr([image], [self.lang_code], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]

    def process_pdf(self, pdf_path):
        """
        Pass ``pdf_path`` to run_ocr and return the first OCR prediction.
        """
        # NOTE(review): the path is passed through unchanged; confirm that
        # the installed run_ocr accepts a PDF path here.
        predictions = run_ocr([pdf_path], [self.lang_code], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]
def _append_ocr_text(base_text, ocr_prediction):
    """Return ``base_text`` with every recognized OCR line appended after a space."""
    # Same positional walk as the original code: iterating the prediction
    # yields pairs, the first pair's value is the sequence of lines, and the
    # second pair of each line carries the text.
    # NOTE(review): this relies on the field order of the OCR result objects
    # -- confirm against the installed surya version.
    lines = list(ocr_prediction)[0][1]
    return base_text + "".join(" " + list(line)[1][1] for line in lines)


def process_input(image=None, file=None, audio=None, text="", translateto = "English", translatefrom = "English" ):
    """
    Turn the user's inputs into a color-tagged learning text plus audio.

    Text is collected from ``text``, OCR of ``image`` and/or ``file``, and
    speech-to-text of ``audio``. A post in ``translateto`` is generated from
    it, rewritten with color tags, and the tagged phrases are extracted,
    translated and spoken.

    Args:
        image: PIL image to OCR, or None.
        file: Uploaded file as a path or an object exposing ``.name``;
            .png/.jpg/.jpeg and .pdf are OCR'd, anything else is reported
            as unsupported in the text.
        audio: Path of a recording to transcribe, or None.
        text: Text typed by the user.
        translateto: Name of the language being learned.
        translatefrom: Name of the user's native language.

    Returns:
        ``(processed_text, audio_output, top_phrases, translations,
        audio_outputs)``. ``top_phrases`` and ``translations`` are parallel
        lists padded to at least three entries with ""; ``audio_outputs``
        holds one ``(target_audio, native_audio)`` pair per phrase.
    """
    final_text = text or ""
    print("Image :", image)

    # The upload may arrive as a plain path or as a file-like wrapper.
    file_path = None
    if file is not None:
        file_path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
    lowered_path = file_path.lower() if file_path is not None else ""
    file_is_image = lowered_path.endswith(('.png', '.jpg', '.jpeg'))
    file_is_pdf = lowered_path.endswith('.pdf')

    # Loading the OCR models is expensive, so only do it when OCR will run.
    ocr_processor = None
    if image is not None or file_is_image or file_is_pdf:
        lang_code = get_language_code(translatefrom)
        # OCR expects a list of codes; fall back to English when the
        # language name is unknown.
        ocr_processor = OCRProcessor([lang_code] if lang_code else ["en"])

    if image is not None:
        final_text = _append_ocr_text(final_text, ocr_processor.process_image(image))

    if file is not None:
        if file_is_image:
            with Image.open(file_path) as pil_image:
                final_text = _append_ocr_text(final_text, ocr_processor.process_image(pil_image))
        elif file_is_pdf:
            final_text = _append_ocr_text(final_text, ocr_processor.process_pdf(file_path))
        else:
            final_text += "\nUnsupported file type."
    print("OCR Text: ", final_text)

    if audio is not None:
        long_audio_processor = LongAudioProcessor(audio_client)
        audio_text = long_audio_processor.process_long_audio(audio, inputlanguage=translatefrom, outputlanguage=translateto)
        final_text += "\n" + audio_text

    # Step 1: generate the learning text in the target language.
    final_text_with_producetext = final_text + producetext.format(target_language=translateto)
    response = co.generate(
        model='c4ai-aya',
        prompt=final_text_with_producetext,
        max_tokens=1024,
        temperature=0.5
    )
    # add graceful handling for errors (overflow)
    generated_text = response.generations[0].text
    print("Generated Text: ", generated_text)

    # Step 2: have the model add the color tags.
    generated_text_with_format = generated_text + "\n" + formatinputstring
    response = co.generate(
        model='command-nightly',
        prompt=generated_text_with_format,
        max_tokens=4000,
        temperature=0.5
    )
    processed_text = response.generations[0].text
    audio_output = process_text_to_audio(processed_text, translateto, translateto)

    # The color tags exist only in the model's rewritten text, so phrases
    # must be extracted from it rather than from the raw input.
    extractor = TaggedPhraseExtractor(processed_text)
    matches = extractor.extract_phrases()
    top_phrases = [phrase for phrases in matches.values() for phrase in phrases]
    top_phrases.extend([""] * max(0, 3 - len(top_phrases)))

    audio_outputs = []
    translations = []
    for phrase in top_phrases:
        if not phrase:
            translations.append("")
            audio_outputs.append(("", ""))
            continue
        # translate_text takes (text, inputlanguage, target_language).
        translated_phrase = translate_text(phrase, inputlanguage=translatefrom, target_language=translateto)
        translations.append(translated_phrase)
        target_audio = process_text_to_audio(phrase, translatefrom=translateto, translateto=translateto)
        native_audio = process_text_to_audio(translated_phrase, translatefrom=translatefrom, translateto=translatefrom)
        audio_outputs.append((target_audio, native_audio))

    return processed_text, audio_output, top_phrases, translations, audio_outputs
# Input components, in the positional order of interface_func's parameters:
# inputlanguage, target_language, audio, image, text, file.
inputs = [
gr.Dropdown(choices=choices, label="Your Native Language"),
gr.Dropdown(choices=choices, label="Language To Learn"),
gr.Audio(sources="microphone", type="filepath", label="Mic Input"),
gr.Image(type="pil", label="Camera Input"),
gr.Textbox(lines=2, label="Text Input"),
gr.File(label="File Upload")
]
# Output components: the main text and audio, then three groups of
# (focus phrase, translation, native-language audio, target-language audio).
# The 14-element tuple returned by update_outputs must follow this order.
outputs = [
RichTextbox(label="Processed Text"),
gr.Audio(label="Audio"),
gr.Textbox(label="Focus 1"),
gr.Textbox(label="Translated Phrases 1"),
gr.Audio(label="Audio Output (Native Language) 1"),
gr.Audio(label="Audio Output (Target Language) 1"),
gr.Textbox(label="Focus 2"),
gr.Textbox(label="Translated Phrases 2"),
gr.Audio(label="Audio Output (Native Language) 2"),
gr.Audio(label="Audio Output (Target Language) 2"),
gr.Textbox(label="Focus 3"),
gr.Textbox(label="Translated Phrases 3"),
gr.Audio(label="Audio Output (Native Language) 3"),
gr.Audio(label="Audio Output (Target Language) 3")
]
def update_outputs(inputlanguage, target_language, audio, image, text, file):
    """
    Run process_input and lay its results out for the 14 output components.

    Args:
        inputlanguage: Name of the user's native language.
        target_language: Name of the language being learned.
        audio: Path of the recorded audio, or None.
        image: PIL image, or None.
        text: Text typed by the user.
        file: Uploaded file, or None.

    Returns:
        A 14-tuple: processed text, main audio, then for each of three
        phrases (focus phrase, translation, native-language audio,
        target-language audio). Missing text slots are "", missing audio
        slots are None.
    """
    processed_text, audio_output_path, top_phrases, translations, audio_outputs = process_input(
        image=image, file=file, audio=audio, text=text,
        translateto=target_language, translatefrom=inputlanguage
    )

    # Audio slots use None for "nothing to play" rather than "".
    # NOTE(review): presumably an empty string would be treated by the Audio
    # component as a file path -- confirm against the installed Gradio version.
    output_values = [processed_text, audio_output_path or None]

    for slot in range(3):
        focus = top_phrases[slot] if slot < len(top_phrases) else ""
        translation = translations[slot] if slot < len(translations) else ""
        # process_input stores each pair as (target_audio, native_audio),
        # while the components are ordered native first, then target.
        if slot < len(audio_outputs):
            target_audio, native_audio = audio_outputs[slot]
        else:
            target_audio, native_audio = "", ""
        output_values.extend([
            focus,                  # Focus N
            translation,            # Translated Phrases N
            native_audio or None,   # Audio Output (Native Language) N
            target_audio or None,   # Audio Output (Target Language) N
        ])

    return tuple(output_values)
def interface_func(inputlanguage, target_language, audio, image, text, file):
    """Gradio callback: forward the six input values, unchanged, to update_outputs."""
    forwarded = (inputlanguage, target_language, audio, image, text, file)
    return update_outputs(*forwarded)
# Build the interface at import time from the component lists and callback above.
iface = gr.Interface(fn=interface_func, inputs=inputs, outputs=outputs, title=title, description=description)
# Only start the server when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()