Spaces:

TeacherPuffy
/

CreateBookPackage

Sleeping

App Files Community

TeacherPuffy committed on Jan 21

Commit

f108df6

verified ·

1 Parent(s): 73d12a0

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -13

app.py CHANGED Viewed

@@ -2,26 +2,44 @@ import gradio as gr
 from datasets import load_dataset
 import tempfile
 import re
-from langdetect import detect
-def is_english(text):
-    """Check if the text is in English."""
-    try:
-        return detect(text) == 'en'
-    except:
-        return False
 def clean_text(text):
-    """Remove non-English text and ** from the text."""
     # Remove **
     text = re.sub(r'\*\*', '', text)
-    # Split text into sentences and filter out non-English sentences
     sentences = re.split(r'(?<=[.!?])\s+', text)
-    cleaned_sentences = [s for s in sentences if is_english(s)]
     return ' '.join(cleaned_sentences)
 def combine_dataset_texts(dataset_name, split, text_column):
     try:
         # Load the dataset from Hugging Face Hub
@@ -34,11 +52,11 @@ def combine_dataset_texts(dataset_name, split, text_column):
         # Combine all texts into a single string without separating datapoints
         combined_text = " ".join([example[text_column] for example in dataset])
-        # Clean the text: remove non-English and **
         cleaned_text = clean_text(combined_text)
-        # Insert a newline after each period (.) except for ."
-        processed_text = re.sub(r'\.(?!")', '.\n', cleaned_text)
         # Create a temporary file
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:

 from datasets import load_dataset
 import tempfile
 import re
+# List of common titles that end with a period
+TITLES = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."}
# Matches any character outside the ranges accepted as "Latin text":
#   U+0000-U+007F  ASCII (letters, digits, punctuation, whitespace)
#   U+00A0-U+024F  Latin-1 Supplement and Latin Extended-A/B (e.g. é, ñ, ø)
#   U+1E00-U+1EFF  Latin Extended Additional (e.g. ẽ, ỳ)
#   U+2000-U+206F  General Punctuation (curly quotes, dashes, ellipsis)
_NON_LATIN_RE = re.compile(
    r'[^\x00-\x7F\u00A0-\u024F\u1E00-\u1EFF\u2000-\u206F]'
)


def is_latin(text):
    """Check if the text contains only Latin-script characters.

    A pure ASCII test would also reject accented Latin letters and
    typographic punctuation, so the accepted set covers the common Latin
    Unicode blocks and General Punctuation in addition to ASCII.

    Args:
        text: The string to inspect.

    Returns:
        True if every character of ``text`` falls inside the accepted
        ranges (an empty string therefore yields True), False otherwise.
    """
    return _NON_LATIN_RE.search(text) is None
 def clean_text(text):
+    """Remove non-Latin text and ** from the text."""
     # Remove **
     text = re.sub(r'\*\*', '', text)
+    # Split text into sentences and filter out non-Latin sentences
     sentences = re.split(r'(?<=[.!?])\s+', text)
+    cleaned_sentences = [s for s in sentences if is_latin(s)]
     return ' '.join(cleaned_sentences)
def process_text(text, titles=None):
    """Insert a newline after each word that ends with a period.

    No newline is inserted after a title (e.g. ``Mr.``) or after a word
    whose period is followed by a closing quote (``."``), because such a
    word does not end with ``.``.

    Args:
        text: The text to process. It is split on any whitespace, so
            existing runs of whitespace (including newlines) collapse.
        titles: Optional container of title abbreviations that must not
            trigger a line break. Defaults to the module-level ``TITLES``.

    Returns:
        The words of ``text`` joined by single spaces, with the separator
        after each sentence-ending word replaced by a newline, so that no
        line starts with a stray space.
    """
    if titles is None:
        titles = TITLES

    pieces = []
    for word in text.split():
        # Ignore opening quotes/brackets so that '"Dr.' still counts as a title.
        bare_word = word.lstrip('"\'([')
        if word.endswith('.') and bare_word not in titles:
            # The newline takes the place of the separating space.
            pieces.append(word + '\n')
        else:
            pieces.append(word + ' ')
    # Drop the separator left after the final word; a trailing newline stays.
    return ''.join(pieces).rstrip(' ')
 def combine_dataset_texts(dataset_name, split, text_column):
     try:
         # Load the dataset from Hugging Face Hub
         # Combine all texts into a single string without separating datapoints
         combined_text = " ".join([example[text_column] for example in dataset])
+        # Clean the text: remove non-Latin and **
         cleaned_text = clean_text(combined_text)
+        # Process the text: insert newlines after periods, except for titles and ."
+        processed_text = process_text(cleaned_text)
         # Create a temporary file
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: