Spaces:

a-guy-from-burma
/

lazy-reader-paragraph-text-burmese

Runtime error

App Files Files Community

lazy-reader-paragraph-text-burmese / app.py

a-guy-from-burma

Create app.py

b2f3d99 verified about 1 year ago

raw

history blame contribute delete

1.81 kB

	import gradio as gr
	import re
	import json
	from pyidaungsu import pds

	# Function to split text into bite-sized paragraphs
	def split_text_into_paragraphs(text, max_chars=1000):
	    """Group Burmese sentences into paragraphs of bounded length.

	    The text is cut into sentences at the Burmese full stop ("။").
	    Whitespace around each sentence is dropped, and consecutive sentences
	    are concatenated until adding the next one would push the paragraph
	    past ``max_chars`` characters.

	    Args:
	        text: The input text. ``None`` or an empty string yields ``[]``.
	        max_chars: Upper bound on paragraph length, terminators included.
	            A single sentence longer than this is kept whole as its own
	            paragraph rather than being cut mid-sentence.

	    Returns:
	        A list of non-empty paragraph strings, in input order.
	    """
	    if not text:
	        return []

	    paragraphs = []
	    current = []  # sentences collected for the paragraph being built
	    current_len = 0

	    # Each match is a run of non-terminator characters, optionally followed
	    # by one terminator. Bare or repeated terminators match nothing, so
	    # they cannot produce empty sentences or doubled "။".
	    for match in re.finditer(r'[^။]+။?', text):
	        piece = match.group()
	        body = piece.rstrip("။").strip()
	        if not body:
	            # Whitespace-only stretch between two terminators.
	            continue
	        # Keep the terminator only if the input had one, so a trailing
	        # unterminated fragment is not given one it never had.
	        sentence = body + "။" if piece.endswith("။") else body

	        # Flush before overflowing, but never flush an empty paragraph
	        # (which happens when the very first sentence is over the limit).
	        if current and current_len + len(sentence) > max_chars:
	            paragraphs.append("".join(current))
	            current = []
	            current_len = 0

	        current.append(sentence)
	        current_len += len(sentence)

	    if current:
	        paragraphs.append("".join(current))

	    return paragraphs

	# Function to extract keywords using the pyidaungsu library
	def extract_keywords(paragraph):
	    """Return the word tokens of ``paragraph`` that are longer than one character.

	    The paragraph is tokenized with ``pds.tokenize(..., form="word")``.
	    Tokens of length 0 or 1 are discarded; the remaining tokens are
	    returned as a list in tokenizer order, duplicates included.
	    """
	    keywords = []
	    for token in pds.tokenize(paragraph, form="word"):
	        # Length filter is the only keyword criterion applied here.
	        if len(token) > 1:
	            keywords.append(token)
	    return keywords

	# Main processing function
	def process_text(text):
	    """Split ``text`` into paragraphs and serialize them with their keywords.

	    Returns a JSON string: an array of objects, each holding a
	    ``"paragraph"`` and the ``"keywords"`` extracted from it. Non-ASCII
	    characters are written as-is and the output is indented by two spaces.
	    """
	    entries = [
	        {"paragraph": chunk, "keywords": extract_keywords(chunk)}
	        for chunk in split_text_into_paragraphs(text)
	    ]
	    return json.dumps(entries, ensure_ascii=False, indent=2)

	# Create the Gradio interface
	# Text-in / text-out UI wired to process_text.
	iface = gr.Interface(
	    title="Lazy Reader",
	    description=(
	        "This app takes large Burmese text without line breaks as input "
	        "and outputs a JSON of each paragraph and keywords to fetch images "
	        "related to the paragraph."
	    ),
	    fn=process_text,
	    inputs="text",
	    outputs="text",
	)

	# Start serving the interface as soon as the module is run.
	iface.launch()