a-guy-from-burma's picture
Create app.py
b2f3d99 verified
import gradio as gr
import re
import json
from pyidaungsu import pds
# Function to split text into bite-sized paragraphs
def split_text_into_paragraphs(text: str, max_length: int = 1000) -> list:
    """Group the Burmese sentences of ``text`` into bite-sized paragraphs.

    The text is split on the Burmese sentence terminator "။" (together with
    any whitespace that follows it). Sentences are then packed greedily, in
    order, into paragraphs of at most ``max_length`` characters.

    Args:
        text: Input text, typically one long run without line breaks.
        max_length: Maximum paragraph length in characters, terminators
            included. A single sentence longer than this is kept whole as
            its own paragraph rather than being cut.

    Returns:
        A list of paragraph strings. Every sentence in a paragraph ends with
        exactly one "။" (one is added to trailing text that lacks it). Empty
        or whitespace-only input yields an empty list.
    """
    if not text:
        return []

    terminator = "။"
    paragraphs = []
    current = ""
    # Consume the whitespace after each terminator so it does not leak into
    # the start of the next sentence.
    for fragment in re.split(r"။\s*", text):
        sentence = fragment.strip()
        if not sentence:
            # Splitting text that ends with (or repeats) the terminator
            # produces empty fragments; they must not become a stray "။".
            continue
        sentence += terminator
        if current and len(current) + len(sentence) > max_length:
            paragraphs.append(current)
            current = sentence
        else:
            # Also taken when ``current`` is empty, so an oversized first
            # sentence never causes an empty paragraph to be emitted.
            current += sentence
    if current:
        paragraphs.append(current)
    return paragraphs
# Function to extract keywords using the pyidaungsu library
def extract_keywords(paragraph):
    """Return the word tokens of ``paragraph`` that are longer than one character.

    The paragraph is tokenized with ``pds.tokenize(..., form="word")``. Tokens
    are returned in the order produced by the tokenizer, and duplicates are
    kept. This is a deliberately simple heuristic for keyword extraction.
    """
    keywords = []
    for token in pds.tokenize(paragraph, form="word"):
        # Single-character tokens are dropped.
        if len(token) > 1:
            keywords.append(token)
    return keywords
# Main processing function
def process_text(text):
    """Split ``text`` into paragraphs and return them with keywords as JSON.

    Returns a JSON string (non-ASCII characters kept as-is, indented by two
    spaces) encoding a list of objects, one per paragraph, each with the
    keys "paragraph" and "keywords".
    """
    entries = [
        {"paragraph": chunk, "keywords": extract_keywords(chunk)}
        for chunk in split_text_into_paragraphs(text)
    ]
    return json.dumps(entries, ensure_ascii=False, indent=2)
# Create the Gradio interface
# Text-in / text-out Gradio UI wrapped around process_text.
iface = gr.Interface(
    title="Lazy Reader",
    description=(
        "This app takes large Burmese text without line breaks as input "
        "and outputs a JSON of each paragraph and keywords to fetch images "
        "related to the paragraph."
    ),
    fn=process_text,
    inputs="text",
    outputs="text",
)

# Start serving the app as soon as this module is executed.
iface.launch()