# AyaTonic — Learn a New Language With Aya (Hugging Face Space)
# Commit 5701b30: improve the interface, add parsing for longest phrases,
# language-code quick fix for surya, add translation with Aya.
import os
import re
from io import BytesIO

import cohere
import gradio as gr
import pandas as pd
import requests
from dotenv import load_dotenv
from gradio_client import Client
from gradio_rich_textbox import RichTextbox
from PIL import Image

from lang_list import TEXT_SOURCE_LANGUAGE_NAMES, LANGUAGE_NAME_TO_CODE, text_source_language_codes
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
from surya.ocr import run_ocr
# UI copy shown at the top of the Gradio app.
title = "# Welcome to AyaTonic"
description = "Learn a New Language With Aya"

# Load environment variables from a local .env file (API key + endpoint).
load_dotenv()
COHERE_API_KEY = os.getenv('CO_API_KEY')
SEAMLESSM4T = os.getenv('SEAMLESSM4T')

# Language table; presumably has `name` and `code` columns — verify CSV schema.
df = pd.read_csv("lang_list.csv")
choices = df["name"].to_list()

inputlanguage = ""

# Prompt templates appended to user text before calling the models.
# NOTE(review): the `{target_language}` / `{inputlanguage}` placeholders must be
# filled with str.format() before use.
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
translatetextinst = "\n\nthe above text is a learning aid. you must use markdown format to translate the above into {inputlanguage} :'"

# Regular expression patterns for each POS-highlight color.
patterns = {
    "red": r'<span style="color: red;">(.*?)</span>',
    "blue": r'<span style="color: blue;">(.*?)</span>',
    "green": r'<span style="color: green;">(.*?)</span>',
}

# Dictionaries to hold the matches per color.
matches = {
    "red": [],
    "blue": [],
    "green": [],
}

# API clients: Cohere for text generation, SeamlessM4T (Gradio Space) for speech.
co = cohere.Client(COHERE_API_KEY)
audio_client = Client(SEAMLESSM4T)
def get_language_code(language_name):
    """Look up the two-letter language code for a human-readable name.

    Args:
        language_name: A value from the ``name`` column of the language table.

    Returns:
        str: The first two letters of the matching ``code`` column entry.

    Raises:
        ValueError: If the language name is not present in the table
            (the original raised an opaque IndexError here).
    """
    codes = df.loc[df['name'] == language_name, 'code'].values
    if len(codes) == 0:
        raise ValueError(f"Unknown language name: {language_name!r}")
    return codes[0][:2]
def translate_text(text, translatefrom="English", translateto="English", instructions=None):
    """Translate *text* with the Aya model.

    The signature now matches the call site in ``process_input``, which passes
    ``(phrase, source_language, target_language)`` positionally — the original
    ``(text, instructions)`` signature crashed on that call.

    Args:
        text: The text to translate.
        translatefrom: Language the text is currently in.
        translateto: Language to translate into.
        instructions: Optional prompt suffix; defaults to the module template
            with the target language substituted in.

    Returns:
        str: The model's translated text.
    """
    if instructions is None:
        # Fill the "{inputlanguage}" placeholder the original left unformatted.
        instructions = translatetextinst.format(inputlanguage=translateto)
    prompt = f"{text}{instructions}"
    response = co.generate(
        model='c4ai-aya',
        prompt=prompt,
        max_tokens=2986,
        temperature=0.6,
        k=0,
        stop_sequences=[],
        return_likelihoods='NONE'
    )
    return response.generations[0].text
class TaggedPhraseExtractor:
    """Extracts color-tagged phrases (e.g. ``<span style="color: red;">...</span>``)
    from a text using per-color regex patterns."""

    def __init__(self, text=''):
        self.text = text
        self.patterns = {}

    def set_text(self, text):
        """Set the text to search within."""
        self.text = text

    def add_pattern(self, color, pattern):
        """Add a new color and its associated regex pattern."""
        self.patterns[color] = pattern

    def extract_phrases(self):
        """Extract phrases for every registered color.

        Returns:
            tuple: ``(matches, top_three)`` where *matches* maps each color to
            ``{'all_phrases': [...], 'top_three_longest': [...]}`` and
            *top_three* holds the three longest phrases across ALL colors
            (the original overwrote it each iteration, keeping only the
            last color's phrases).
        """
        matches = {}
        all_phrases = []
        for color, pattern in self.patterns.items():
            found_phrases = re.findall(pattern, self.text)
            sorted_phrases = sorted(found_phrases, key=len, reverse=True)
            matches[color] = {
                'all_phrases': found_phrases,
                'top_three_longest': sorted_phrases[:3],
            }
            all_phrases.extend(found_phrases)
        top_three = sorted(all_phrases, key=len, reverse=True)[:3]
        return matches, top_three

    def get_longest_phrases(self):
        """Return the three longest tagged phrases across all colors.

        Added because ``process_input`` calls this method, which did not exist.
        Falls back to the module-level default color patterns when none have
        been registered, since ``process_input`` never calls ``add_pattern``.
        """
        if not self.patterns:
            self.patterns = dict(patterns)
        _, top_three = self.extract_phrases()
        return top_three

    def print_phrases(self):
        """Extract phrases and print them, including the three longest per color.

        The original iterated ``.items()`` directly on the tuple returned by
        ``extract_phrases`` and raised AttributeError; unpack it instead.
        """
        matches, _ = self.extract_phrases()
        for color, data in matches.items():
            print(f"Phrases with color {color}:")
            for phrase in data['all_phrases']:
                print(f"- {phrase}")
            print(f"\nThree longest phrases for color {color}:")
            for phrase in data['top_three_longest']:
                print(f"- {phrase}")
            print()
def process_audio_to_text(audio_path, inputlanguage="English", outputlanguage="English"):
    """Convert an audio file to text via the SeamlessM4T speech-to-text API.

    Args:
        audio_path: Path to the audio file to transcribe.
        inputlanguage: Language spoken in the audio.
        outputlanguage: Language for the returned transcription.

    Returns:
        The transcribed text (first element of the API result tuple).
    """
    # A fresh client per call; the module-level `audio_client` exists but the
    # original deliberately re-created one here — kept for identical behavior.
    client = Client(SEAMLESSM4T)
    result = client.predict(
        audio_path,
        inputlanguage,
        outputlanguage,
        api_name="/s2tt"
    )
    print("Audio Result: ", result)
    return result[0]
def process_text_to_audio(text, translatefrom="English", translateto="English"):
    """Convert text to speech via the SeamlessM4T text-to-speech API.

    Args:
        text: Text to synthesize.
        translatefrom: Language of the input text.
        translateto: Language of the spoken output.

    Returns:
        The audio result (first element of the API result tuple).
    """
    client = Client(SEAMLESSM4T)
    result = client.predict(
        text,
        translatefrom,
        translateto,
        api_name="/t2st"
    )
    return result[0]
class OCRProcessor:
    """Wraps surya detection/recognition model loading and OCR inference."""

    def __init__(self, lang_code="en"):
        """Load surya models for the given language(s).

        Args:
            lang_code: A single language code string or a list of codes.
                The original used a mutable default (``["en"]``) and then
                re-wrapped it in another list at call time, producing
                ``[["en"]]`` vs ``["en"]`` depending on caller — normalize
                to a flat list of codes here instead.
        """
        if isinstance(lang_code, str):
            lang_code = [lang_code]
        self.lang_code = list(lang_code)
        self.det_processor, self.det_model = load_det_processor(), load_det_model()
        self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()

    def process_image(self, image):
        """Run OCR on a PIL image and return the prediction for it."""
        # run_ocr takes one list of language codes per input document.
        predictions = run_ocr([image], [self.lang_code], self.det_model,
                              self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]

    def process_pdf(self, pdf_path):
        """Run OCR on a PDF file path and return the prediction for it."""
        predictions = run_ocr([pdf_path], [self.lang_code], self.det_model,
                              self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]
def _ocr_prediction_to_text(ocr_prediction):
    """Concatenate recognized line texts from a surya OCR prediction.

    Mirrors the index-walk the original duplicated three times; assumes
    ``list(prediction)[0][1]`` is the sequence of recognized lines and that
    ``list(line)[1][1]`` is the line's text — TODO confirm against the
    installed surya version.
    """
    text = ""
    lines = list(ocr_prediction)[0][1]
    for idx in range(len(lines)):
        text += " "
        text += list(lines[idx])[1][1]
    return text


def process_input(image=None, file=None, audio=None, text="", translateto="English", translatefrom="English"):
    """Gather text from all inputs, generate a learning aid, and synthesize audio.

    Args:
        image: Optional PIL image to OCR.
        file: Optional uploaded file (png/jpg/jpeg/pdf).
        audio: Optional path to a recorded audio file.
        text: Free-form text input.
        translateto: The language being learned (target).
        translatefrom: The learner's native language (source).

    Returns:
        tuple: (audio_output, processed_text, audio_samples, longest_phrases,
        translated_phrases), where audio_samples maps "target_language" /
        "native_language" to lists of audio results.
    """
    lang_code = get_language_code(translatefrom)
    ocr_processor = OCRProcessor(lang_code)
    final_text = text
    if image is not None:
        final_text += _ocr_prediction_to_text(ocr_processor.process_image(image))
    if file is not None:
        if file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            pil_image = Image.open(file)
            final_text += _ocr_prediction_to_text(ocr_processor.process_image(pil_image))
        elif file.name.lower().endswith('.pdf'):
            final_text += _ocr_prediction_to_text(ocr_processor.process_pdf(file.name))
        else:
            final_text += "\nUnsupported file type."
    print("OCR Text: ", final_text)
    if audio is not None:
        # Transcribe in the learner's native language — the original passed no
        # languages at all, silently defaulting to English.
        audio_text = process_audio_to_text(audio, translatefrom, translatefrom)
        final_text += "\n" + audio_text
    # Fill the {target_language} placeholder the original left unformatted.
    final_text_with_producetext = final_text + producetext.format(target_language=translateto)
    response = co.generate(
        model='c4ai-aya',
        prompt=final_text_with_producetext,
        max_tokens=1024,
        temperature=0.5
    )
    # TODO: graceful handling for errors (e.g. prompt overflow).
    generated_text = response.generations[0].text
    print("Generated Text: ", generated_text)
    generated_text_with_format = generated_text + "\n" + formatinputstring
    response = co.generate(
        model='command-nightly',
        prompt=generated_text_with_format,
        max_tokens=4000,
        temperature=0.5
    )
    processed_text = response.generations[0].text
    audio_output = process_text_to_audio(processed_text, translateto, translateto)
    extractor = TaggedPhraseExtractor(processed_text)
    # Register the color patterns so extraction actually finds tagged spans;
    # the original registered none and called a nonexistent method.
    for color, pattern in patterns.items():
        extractor.add_pattern(color, pattern)
    _, longest_phrases = extractor.extract_phrases()
    # Translate the longest phrases back into the native language.
    translated_phrases = [translate_text(phrase, translateto, translatefrom) for phrase in longest_phrases]
    # Convert the original and translated phrases to audio (the original
    # called the undefined name `text_to_audio` here).
    audio_samples = {
        "target_language": [process_text_to_audio(p, translateto, translateto) for p in longest_phrases],
        "native_language": [process_text_to_audio(p, translatefrom, translatefrom) for p in translated_phrases],
    }
    return audio_output, processed_text, audio_samples, longest_phrases, translated_phrases
def main():
    """Build and launch the Gradio interface for AyaTonic."""
    with gr.Blocks() as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        with gr.Row():
            input_language = gr.Dropdown(choices=choices, label="Your Native Language")
            target_language = gr.Dropdown(choices=choices, label="Language To Learn")
        with gr.Accordion("Talk To 🌟AyaTonic"):
            with gr.Tab("🤙🏻Audio & Text"):
                audio_input = gr.Audio(sources="microphone", type="filepath", label="Mic Input")
                text_input = gr.Textbox(lines=2, label="Text Input")
            with gr.Tab("📸Image & File"):
                image_input = gr.Image(type="pil", label="Camera Input")
                file_input = gr.File(label="File Upload")
        process_button = gr.Button("🌟AyaTonic")
        processed_text_output = RichTextbox(label="Processed Text")
        # Three identical "focus" slots: phrase text, its translation, and
        # audio in both languages (replaces twelve copy-pasted components).
        focus_rows = []
        for _ in range(3):
            focus_rows.append((
                gr.Textbox(label="Focus"),
                gr.Textbox(label="Translated Phrases"),
                gr.Audio(label="Audio Output (Native Language)"),
                gr.Audio(label="Audio Output (Target Language)"),
            ))

        def update_outputs(image, file, audio, text, input_language, target_language):
            # process_input returns FIVE values; the original unpacked four
            # (ValueError) and also passed the languages positionally, which
            # swapped native/target. Use keywords and unpack everything.
            _audio_output, processed_text, audio_samples, longest_phrases, translated_phrases = process_input(
                image, file, audio, text,
                translateto=target_language, translatefrom=input_language)

            def pick(seq, i, default=None):
                # Pad to three slots when fewer phrases were extracted.
                return seq[i] if i < len(seq) else default

            native_audio = audio_samples.get("native_language", [])
            target_audio = audio_samples.get("target_language", [])
            results = [processed_text]
            for i in range(3):
                results.extend([
                    pick(native_audio, i),
                    pick(target_audio, i),
                    pick(longest_phrases, i, ""),
                    pick(translated_phrases, i, ""),
                ])
            return tuple(results)

        # Output order per slot must match update_outputs: native audio,
        # target audio, phrase text, translated text.
        outputs = [processed_text_output]
        for phrase_box, translated_box, native_audio_box, target_audio_box in focus_rows:
            outputs.extend([native_audio_box, target_audio_box, phrase_box, translated_box])

        process_button.click(
            fn=update_outputs,
            inputs=[image_input, file_input, audio_input, text_input, input_language, target_language],
            outputs=outputs,
        )
    # The original built the Blocks but never launched them.
    demo.launch()


if __name__ == "__main__":
    main()