cnmoro committed on
Commit 3c78773
1 Parent(s): dc21de1

Update app.py

Files changed (1)
  1. app.py +60 -2
app.py CHANGED
@@ -1,8 +1,8 @@
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
+import tiktoken, nltk, numpy as np, fasttext, pickle, re
 from minivectordb.embedding_model import EmbeddingModel
 from sklearn.metrics.pairwise import cosine_similarity
-import tiktoken, nltk, numpy as np, fasttext, pickle
 from nltk.tokenize import sent_tokenize
 import gradio as gr
 
@@ -22,6 +22,62 @@ def detect_language(text):
     detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
     return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
 
+def clean_and_standardize_text(text):
+    # 1. Standardize spacing around punctuation
+    text = re.sub(r'\s([.,;:!?])\s', r'\1 ', text)
+
+    # 2. Remove extra spaces
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    # 3. Capitalize sentences
+    sentences = sent_tokenize(text)
+    text = '. '.join(sentence.capitalize() for sentence in sentences)
+
+    # 4. Standardize number formatting
+    text = re.sub(r'(\d+)\s+(\d+)', r'\1.\2', text)
+
+    # 5. Ensure proper spacing after closing parentheses
+    text = re.sub(r'\)\s*([a-zA-Z])', r') \1', text)
+
+    # 6. Preserve bullet points
+    text = re.sub(r'•\s*', '• ', text)
+
+    # 7. Preserve numbered lists
+    text = re.sub(r'(\d+)\.\s*', r'\1. ', text)
+
+    # 8. Standardize date formatting
+    text = re.sub(r'(\d{2})\s+(\d{2})\s+(\d{4})', r'\1/\2/\3', text)
+
+    # 9. Remove extra periods
+    text = re.sub(r'\.\s+\.', '. ', text)
+
+    # 10. Remove spacing around parentheses
+    text = re.sub(r'\(\s*', '(', text)
+    text = re.sub(r'\s*\)', ')', text)
+
+    # 11. Remove extra numbers without meaning
+    text = re.sub(r'\b(\d+)\b', '', text)
+
+    # 12. Improve spacing around punctuation
+    while ' .' in text:
+        text = text.replace(' .', '.')
+
+    while '..' in text:
+        text = text.replace('..', '.')
+
+    while '  ' in text:
+        text = text.replace('  ', ' ')
+
+    text = text.replace(' :', ':')
+    text = text.replace('- -', '-')
+    text = text.replace('. -', '.')
+
+    # 13. Detect two punctuation marks in a row, keeping the last
+    text = re.sub(r'([.,]){2,}', r'\1', text)
+    text = re.sub(r'(?<=[:.])[:.]+', '', text)
+
+    return text
+
 def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
     def calculate_similarity(embed1, embed2):
         return cosine_similarity([embed1], [embed2])[0][0]
@@ -94,7 +150,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
     # Reorder sentences to maintain original flow
     compressed_text.sort(key=lambda x: sentences.index(x))
 
-    return ' '.join(compressed_text)
+    joined_compressed_text = ' '.join(compressed_text)
+    joined_compressed_text_cleaned = clean_and_standardize_text(joined_compressed_text)
+    return joined_compressed_text_cleaned
 
 async def predict(text, word_reduction_factor):
     if len(text.split()) > 5000:
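
For reference, here is a minimal standalone sketch (not part of the commit) of how the first few cleanup steps behave. The regex patterns are copied from clean_and_standardize_text above; the demo_clean wrapper, the sample string, and the expected output are illustrative assumptions.

import re
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt', quiet=True)  # sent_tokenize needs the punkt model

def demo_clean(text):
    # Step 1: pull punctuation back onto the preceding word ("word ." -> "word.")
    text = re.sub(r'\s([.,;:!?])\s', r'\1 ', text)
    # Step 2: collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Step 3: capitalize each detected sentence, rejoining with '. '
    text = '. '.join(s.capitalize() for s in sent_tokenize(text))
    # Step 12 (partial): rejoining doubles a trailing period; squash it
    while '..' in text:
        text = text.replace('..', '.')
    return text

print(demo_clean("hello , world .  this is a test"))
# -> Hello, world. This is a test

One caveat worth noting: step 11 (re.sub(r'\b(\d+)\b', '', text)) deletes every standalone run of digits, which also strips the numbers that steps 4, 7, and 8 just reformatted, so numbered lists and dates do not survive the full pipeline intact.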