Update app.py
app.py CHANGED
@@ -2,26 +2,44 @@ import gradio as gr
 from datasets import load_dataset
 import tempfile
 import re
-from langdetect import detect
 
-
-
-
-
-
-
+# List of common titles that end with a period
+TITLES = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."}
+
+def is_latin(text):
+    """Check if the text contains only Latin characters."""
+    # Regex to match non-Latin characters
+    return not re.search(r'[^\x00-\x7F]', text)
 
 def clean_text(text):
-    """Remove non-
+    """Remove non-Latin text and ** from the text."""
     # Remove **
     text = re.sub(r'\*\*', '', text)
 
-    # Split text into sentences and filter out non-
+    # Split text into sentences and filter out non-Latin sentences
     sentences = re.split(r'(?<=[.!?])\s+', text)
-    cleaned_sentences = [s for s in sentences if
+    cleaned_sentences = [s for s in sentences if is_latin(s)]
 
     return ' '.join(cleaned_sentences)
 
+def process_text(text):
+    """Insert a newline after periods, except for titles and ." """
+    # Split text into words
+    words = text.split()
+    processed_words = []
+
+    for i, word in enumerate(words):
+        # Check if the word is a title (e.g., Mr., Mrs.)
+        if word in TITLES:
+            processed_words.append(word)
+        # Check if the word ends with a period and is not followed by a quote
+        elif word.endswith('.') and not word.endswith('."'):
+            processed_words.append(word + '\n')
+        else:
+            processed_words.append(word)
+
+    return ' '.join(processed_words)
+
 def combine_dataset_texts(dataset_name, split, text_column):
     try:
         # Load the dataset from Hugging Face Hub
@@ -34,11 +52,11 @@ def combine_dataset_texts(dataset_name, split, text_column):
         # Combine all texts into a single string without separating datapoints
         combined_text = " ".join([example[text_column] for example in dataset])
 
-        # Clean the text: remove non-
+        # Clean the text: remove non-Latin and **
         cleaned_text = clean_text(combined_text)
 
-        #
-        processed_text =
+        # Process the text: insert newlines after periods, except for titles and ."
+        processed_text = process_text(cleaned_text)
 
         # Create a temporary file
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
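
For reference, a minimal usage sketch (not part of the commit) showing how the new helpers behave. It assumes the updated app.py can be imported without side effects (e.g., without launching the Gradio app), and the sample string is made up for illustration:

from app import clean_text, process_text

sample = '**Dr. Smith met Ms. Jones.** She saw Привет мир. Then she left.'

cleaned = clean_text(sample)
# -> 'Dr. Smith met Ms. Jones. Then she left.'
# The ** markers are stripped and the sentence containing non-ASCII text is dropped.

print(process_text(cleaned))
# A newline is appended after 'Jones.' and 'left.', but not after the titles
# 'Dr.' and 'Ms.', which are listed in TITLES. Because the words are re-joined
# with ' ', each inserted newline is followed by a single space.

Note that is_latin treats any non-ASCII character as non-Latin, so sentences containing accented Latin letters such as 'é' are dropped as well.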