TeacherPuffy committed on
Commit
f108df6
·
verified ·
1 Parent(s): 73d12a0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -13
app.py CHANGED
@@ -2,26 +2,44 @@ import gradio as gr
2
  from datasets import load_dataset
3
  import tempfile
4
  import re
5
- from langdetect import detect
6
 
7
- def is_english(text):
8
- """Check if the text is in English."""
9
- try:
10
- return detect(text) == 'en'
11
- except:
12
- return False
 
13
 
14
  def clean_text(text):
15
- """Remove non-English text and ** from the text."""
16
  # Remove **
17
  text = re.sub(r'\*\*', '', text)
18
 
19
- # Split text into sentences and filter out non-English sentences
20
  sentences = re.split(r'(?<=[.!?])\s+', text)
21
- cleaned_sentences = [s for s in sentences if is_english(s)]
22
 
23
  return ' '.join(cleaned_sentences)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def combine_dataset_texts(dataset_name, split, text_column):
26
  try:
27
  # Load the dataset from Hugging Face Hub
@@ -34,11 +52,11 @@ def combine_dataset_texts(dataset_name, split, text_column):
34
  # Combine all texts into a single string without separating datapoints
35
  combined_text = " ".join([example[text_column] for example in dataset])
36
 
37
- # Clean the text: remove non-English and **
38
  cleaned_text = clean_text(combined_text)
39
 
40
- # Insert a newline after each period (.) except for ."
41
- processed_text = re.sub(r'\.(?!")', '.\n', cleaned_text)
42
 
43
  # Create a temporary file
44
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
 
2
  from datasets import load_dataset
3
  import tempfile
4
  import re
 
5
 
6
# Honorifics that end with a period. process_text looks words up in this set
# so that e.g. "Mr." is not mistaken for the end of a sentence. Kept as a
# frozenset because it is a shared, read-only lookup table.
TITLES = frozenset({"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."})
8
+
9
def is_latin(text):
    """Return True when *text* contains only ASCII characters.

    Despite the name, this is an ASCII-only check (code points U+0000 to
    U+007F): accented Latin letters such as "é" and typographic quotes make
    it return False. An empty string returns True.

    Args:
        text: The string to inspect.

    Returns:
        True if every character of *text* is ASCII, otherwise False.
    """
    # str.isascii() is the built-in equivalent of "no match for [^\x00-\x7F]".
    return text.isascii()
13
 
14
  def clean_text(text):
15
+ """Remove non-Latin text and ** from the text."""
16
  # Remove **
17
  text = re.sub(r'\*\*', '', text)
18
 
19
+ # Split text into sentences and filter out non-Latin sentences
20
  sentences = re.split(r'(?<=[.!?])\s+', text)
21
+ cleaned_sentences = [s for s in sentences if is_latin(s)]
22
 
23
  return ' '.join(cleaned_sentences)
24
 
25
def process_text(text):
    """Break the text into lines after words that end with a period.

    The text is split on whitespace and reassembled word by word. A word
    ending in "." is followed by a newline instead of a space, unless the
    word is one of the honorifics in TITLES (e.g. "Mr."). A period that is
    followed by a closing double quote (``."``) never triggers a break,
    because such a word ends with the quote rather than the period.

    Runs of whitespace in the input are collapsed, and lines do not start
    with a leading space.

    Args:
        text: The text to process.

    Returns:
        The processed text. It ends with a newline when the last word ends a
        sentence, and is empty when *text* contains no words.
    """
    pieces = []
    for word in text.split():
        pieces.append(word)
        ends_sentence = word.endswith('.') and word not in TITLES
        # Emit the separator explicitly so a newline replaces the space
        # rather than being followed by one.
        pieces.append('\n' if ends_sentence else ' ')

    # Drop the separator after the final word unless it is a sentence break.
    if pieces and pieces[-1] == ' ':
        pieces.pop()

    return ''.join(pieces)
42
+
43
  def combine_dataset_texts(dataset_name, split, text_column):
44
  try:
45
  # Load the dataset from Hugging Face Hub
 
52
  # Combine all texts into a single string without separating datapoints
53
  combined_text = " ".join([example[text_column] for example in dataset])
54
 
55
+ # Clean the text: remove non-Latin and **
56
  cleaned_text = clean_text(combined_text)
57
 
58
+ # Process the text: insert newlines after periods, except for titles and ."
59
+ processed_text = process_text(cleaned_text)
60
 
61
  # Create a temporary file
62
  with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f: