Spaces:
Runtime error
Runtime error
File size: 1,813 Bytes
b2f3d99 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
import gradio as gr
import re
import json
from pyidaungsu import pds
# Function to split text into bite-sized paragraphs
def split_text_into_paragraphs(text, max_chars=1000):
    """Split Burmese text into paragraphs made of whole sentences.

    The text is cut at every Burmese full stop (``။``). Whitespace around
    each sentence is dropped and the full stop is re-attached. Sentences are
    then packed greedily: a sentence joins the current paragraph while the
    paragraph's length plus the sentence's length stays below ``max_chars``.

    Args:
        text: Input text, possibly without any line breaks.
        max_chars: Soft size limit for a paragraph, in characters. A single
            sentence longer than the limit is kept whole as its own
            paragraph rather than being cut.

    Returns:
        A list of non-empty paragraph strings, each ending in ``။``. Empty
        or whitespace-only input yields an empty list.
    """
    # ``။\s*`` (not ``။|။\s+``): with alternation the bare ``။`` branch
    # always wins, so whitespace after the full stop was never consumed.
    sentences = (piece.strip() for piece in re.split(r'။\s*', text))

    paragraphs = []
    current = ""
    for sentence in sentences:
        # Splitting text that ends in ``။`` leaves an empty trailing piece;
        # skipping it avoids emitting a stray, doubled full stop.
        if not sentence:
            continue
        # ``not current`` keeps an over-long first sentence from flushing an
        # empty paragraph into the result.
        if not current or len(current) + len(sentence) < max_chars:
            current += sentence + "။"
        else:
            paragraphs.append(current)
            current = sentence + "။"
    if current:
        paragraphs.append(current)
    return paragraphs
# Function to extract keywords using the pyidaungsu library
def extract_keywords(paragraph):
    """Return the word tokens of ``paragraph`` that are longer than one character.

    Tokenization is delegated to ``pds.tokenize`` with ``form="word"``; the
    length filter is the only keyword heuristic applied, so repeated words
    are returned as many times as the tokenizer yields them.
    """
    # NOTE(review): relies on the module-level name `pds`. pyidaungsu's README
    # shows `import pyidaungsu as pds` -- confirm the file's import form works.
    tokens = pds.tokenize(paragraph, form="word")
    return [token for token in tokens if len(token) >= 2]
# Main processing function
def process_text(text):
    """Turn raw text into a JSON string of paragraphs with their keywords.

    The text is split with ``split_text_into_paragraphs`` and each paragraph
    is paired with the output of ``extract_keywords``.

    Returns:
        A JSON array (2-space indent, non-ASCII characters kept as-is) of
        objects with the keys ``"paragraph"`` and ``"keywords"``.
    """
    entries = [
        {"paragraph": chunk, "keywords": extract_keywords(chunk)}
        for chunk in split_text_into_paragraphs(text)
    ]
    return json.dumps(entries, ensure_ascii=False, indent=2)
# Build the web UI: one text input, one text output fed by process_text.
iface = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs="text",
    title="Lazy Reader",
    # Adjacent literals are joined by the parser into a single string.
    description=(
        "This app takes large Burmese text without line breaks as input "
        "and outputs a JSON of each paragraph and keywords to fetch images "
        "related to the paragraph."
    ),
)

# Launch the app when this module is executed.
iface.launch()
|