import json
import re

import gradio as gr
# NOTE(review): original had `from pyidaungsu import pds`, which raises
# ImportError — the library's documented usage is `import pyidaungsu as pds`.
import pyidaungsu as pds


def split_text_into_paragraphs(text):
    """Split Burmese *text* into bite-sized paragraphs.

    Sentences are delimited by the Burmese full stop ("။"); consecutive
    sentences are packed into one paragraph while its length stays under
    1000 characters.

    Returns a list of paragraph strings, each ending with "။".
    """
    # The original pattern r'။|။\s+' was broken: the first alternative always
    # matches first, so the `\s+` branch was unreachable and any whitespace
    # following "။" leaked into the start of the next sentence.
    sentences = re.split(r'။\s*', text)
    paragraphs = []
    paragraph = ""
    for sentence in sentences:
        # Skip empty fragments (e.g. the trailing '' produced when the text
        # ends with "။") so no spurious lone-"။" content is emitted.
        if not sentence.strip():
            continue
        if len(paragraph) + len(sentence) < 1000:  # size limit; adjust as needed
            paragraph += sentence + "။"
        else:
            paragraphs.append(paragraph.strip())
            paragraph = sentence + "။"
    if paragraph:
        paragraphs.append(paragraph.strip())
    return paragraphs


def extract_keywords(paragraph):
    """Extract candidate keywords from *paragraph*.

    Tokenizes with pyidaungsu word segmentation, then keeps every token
    longer than one character — a naive heuristic that could be improved
    with more advanced techniques.
    """
    words = pds.tokenize(paragraph, form="word")
    return [word for word in words if len(word) > 1]


def process_text(text):
    """Split *text* into paragraphs and pair each with its keywords.

    Returns a JSON string (non-ASCII preserved, 2-space indent) of a list
    of {"paragraph": ..., "keywords": [...]} objects.
    """
    result = [
        {"paragraph": para, "keywords": extract_keywords(para)}
        for para in split_text_into_paragraphs(text)
    ]
    return json.dumps(result, ensure_ascii=False, indent=2)


# Gradio UI: plain text in, JSON text out.
iface = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Lazy Reader",
    description="This app takes large Burmese text without line breaks as input and outputs a JSON of each paragraph and keywords to fetch images related to the paragraph."
)

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    iface.launch()