# El_profesor / app.py
import gradio as gr
from transformers import pipeline
import numpy as np
import pytesseract
import cv2
from PIL import Image
from evaluate import load
import librosa
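
# Load the Whisper ASR pipeline and the word-error-rate (WER) metric once at startup.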
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
wer = load("wer")
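
# OCR: return a dict {k: {"coordinates": (x, y, w, h), "text": word, "conf": confidence}}
# for every word Tesseract detects with a usable confidence, in reading order.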
def extract_text(image):
    result = pytesseract.image_to_data(image, output_type='dict')
    n_boxes = len(result['level'])
    data = {}
    k = 0
    for i in range(n_boxes):
        if result['conf'][i] >= 0.3 and result['text'][i] != '' and result['conf'][i] != -1:
            (x, y, w, h) = (result['left'][i], result['top'][i],
                            result['width'][i], result['height'][i])
            data[k] = {
                "coordinates": (x, y, w, h),
                # Index with i, not k: k lags behind i as soon as any box is skipped.
                "text": result['text'][i],
                "conf": result['conf'][i],
            }
            k += 1
    return data
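
# Draw one bounding box on the image; OpenCV works in BGR, so (0, 0, 255) is red.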
def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
    image_array = np.array(image)
    image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
    cv2.rectangle(image_array, (x, y), (x + w, y + h), color, thickness)
    return Image.fromarray(cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB))
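
# Transcribe audio given either as a file path or as a Gradio (sampling_rate, raw_audio) tuple.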
def transcribe(audio):
    if isinstance(audio, str):  # audio given as a file path
        y, sr = librosa.load(audio)
    elif isinstance(audio, tuple) and len(audio) == 2:  # Gradio numpy audio: (sampling_rate, raw_audio)
        sr, y = audio
        y = y.astype(np.float32)
        if y.ndim > 1:  # collapse stereo recordings to mono
            y = y.mean(axis=1)
    else:
        raise ValueError("Invalid input. Audio should be a file path or a tuple of (sampling_rate, raw_audio).")
    y /= np.max(np.abs(y))  # peak-normalize before feeding the ASR pipeline
    transcribed_text = asr({"sampling_rate": sr, "raw": y})["text"]
    return transcribed_text
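
# Collapse immediately repeated words in a transcription (helper; not called from run()).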
def clean_transcription(transcription):
    text = transcription.lower()
    words = text.split()
    if not words:
        return ""
    cleaned_words = [words[0]]
    for word in words[1:]:
        if word != cleaned_words[-1]:
            cleaned_words.append(word)
    return ' '.join(cleaned_words)
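
# Similarity between the reference text and the spoken text: 1 - WER, so 1.0 is a perfect match.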
def match(reference, spoken):
    wer_score = wer.compute(references=[reference], predictions=[spoken])
    score = 1 - wer_score
    return score
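
# Split the OCR text into word chunks the same length as the spoken answer; return the chunks,
# their starting word indices, and the chunk length l.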
def split_to_l(text, answer):
    l = len(answer.split(" "))
    text_words = text.split(" ")
    chunks = []
    indices = []
    for i in range(0, len(text_words), l):
        chunk = " ".join(text_words[i: i + l])
        chunks.append(chunk)
        indices.append(i)
    return chunks, indices, l
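
# Re-key l consecutive OCR word entries starting at `index` to 0..l-1 for process_image.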
def reindex_data(data, index, l):
    reindexed_data = {}
    for i in range(l):
        original_index = index + i
        if original_index in data:  # the last chunk may be shorter than l words
            reindexed_data[i] = data[original_index]
    return reindexed_data
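
# Copy the region spanned by the matched words onto a black canvas and draw red boxes around them.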
def process_image(im, data):
    im_array = np.array(im)
    hg, wg, _ = im_array.shape
    text_y = np.max([data[i]["coordinates"][1] for i in range(len(data))])
    text_x = np.max([data[i]["coordinates"][0] for i in range(len(data))])
    text_start_x = np.min([data[i]["coordinates"][0] for i in range(len(data))])
    text_start_y = np.min([data[i]["coordinates"][1] for i in range(len(data))])
    max_height = int(np.mean([data[i]["coordinates"][3] for i in range(len(data))]))
    max_width = int(np.mean([data[i]["coordinates"][2] for i in range(len(data))]))
    text = [data[i]["text"] for i in range(len(data))]
    wall = np.zeros((hg, wg, 3), np.uint8)
    wall[text_start_y:text_y + max_height, text_start_x:text_x + max_width] = \
        im_array[text_start_y:text_y + max_height,
                 text_start_x:text_x + max_width, :]
    for i in range(1, len(data)):
        x, y, w, h = data[i]["coordinates"]
        wall = draw_rectangle(wall, x, y, w, h)
    return wall
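
# Main Gradio callback: OCR the image, transcribe the audio, find the first OCR chunk whose
# similarity to the transcription exceeds 0.1, and highlight that chunk on the image.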
def run(stream, image):
    data = extract_text(image)
    im_text_ = [data[i]["text"] for i in range(len(data))]
    im_text = " ".join(im_text_)
    trns_text = transcribe(stream)
    chunks, index, l = split_to_l(im_text, trns_text)
    im_array = np.array(Image.open(image))
    data2 = None
    for i in range(len(chunks)):
        if match(chunks[i], trns_text) > 0.1:
            data2 = reindex_data(data, index[i], l)
            break
    if data2 is not None:
        return process_image(im_array, data2)
    else:
        return im_array
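
# UI: two tabs sharing the same callback, one for microphone input and one for uploaded audio files.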
demo = gr.Blocks()
demo1 = gr.Interface(
    run,
    [gr.Audio(sources=["microphone"], type="numpy"),
     gr.Image(type="filepath", label="Image")],
    gr.Image(type="pil", label="Output Image"),
)
demo2 = gr.Interface(
    run,
    [gr.Audio(sources=["upload"], type="numpy"),
     gr.Image(type="filepath", label="Image")],
    gr.Image(type="pil", label="Output Image"),
)
with demo:
    gr.TabbedInterface([demo1, demo2], ["Microphone", "Audio File"])
demo.launch()
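
# Leftover scratch/debug snippets, kept as a bare module-level string so they never execute.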
"""
data = extract_text(im)
im_text_ = [data[i]["text"] for i in range(len(data))]
im_text = " ".join(im_text_)
trns_text = transcribe_wav("tmpmucht0kh.wav")
chunks, index, l = split_to_l(im_text, trns_text)
im_array = np.array(Image.open(im))
for i in range(len(chunks)):
if match(chunks[i], trns_text) > 0.5:
print(chunks[i])
print(match(chunks[i], trns_text))
print(index[i])
print(l)
print(im_array.shape)
print(fuse_rectangles(im_array, data, index[i], l))
strem = "tmpq0eha4we.wav"
im = "the-king-and-three-sisters-around-the-world-stories-for-children.png"
text = "A KING AND THREE SISTERS"
che_text = "A KING AND THREE SISTERS"
print(match(text, che_text))
data = extract_text(im)
text_transcript = transcribe_wav(strem)
print(text_transcript)
im_text_ = [data[i]["text"] for i in range(len(data))]
im_text = " ".join(im_text_)
print(im_text)
wall = run(strem, im)
wall.show()"""