Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
from sklearn.feature_extraction.text import CountVectorizer
|
2 |
from sklearn.decomposition import LatentDirichletAllocation
|
|
|
3 |
from minivectordb.embedding_model import EmbeddingModel
|
4 |
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
-
import tiktoken, nltk, numpy as np, fasttext, pickle
|
6 |
from nltk.tokenize import sent_tokenize
|
7 |
import gradio as gr
|
8 |
|
@@ -22,6 +22,62 @@ def detect_language(text):
|
|
22 |
detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
|
23 |
return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
26 |
def calculate_similarity(embed1, embed2):
|
27 |
return cosine_similarity([embed1], [embed2])[0][0]
|
@@ -94,7 +150,9 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
|
94 |
# Reorder sentences to maintain original flow
|
95 |
compressed_text.sort(key=lambda x: sentences.index(x))
|
96 |
|
97 |
-
|
|
|
|
|
98 |
|
99 |
async def predict(text, word_reduction_factor):
|
100 |
if len(text.split()) > 5000:
|
|
|
1 |
from sklearn.feature_extraction.text import CountVectorizer
|
2 |
from sklearn.decomposition import LatentDirichletAllocation
|
3 |
+
import tiktoken, nltk, numpy as np, fasttext, pickle, re
|
4 |
from minivectordb.embedding_model import EmbeddingModel
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
6 |
from nltk.tokenize import sent_tokenize
|
7 |
import gradio as gr
|
8 |
|
|
|
22 |
detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
|
23 |
return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
|
24 |
|
25 |
+
def clean_and_standardize_text(text):
    """Clean and normalize whitespace, punctuation and numeric formatting.

    Runs a fixed pipeline of regex/string passes over *text*: punctuation
    spacing, whitespace collapsing, sentence capitalization, date/number
    formatting, parenthesis spacing, stray-digit removal, and de-duplication
    of repeated punctuation.  Returns the cleaned text as a new string; the
    input is not mutated.

    Relies on ``re`` and ``nltk.tokenize.sent_tokenize`` from the module
    imports.
    """
    # 1. Standardize spacing around punctuation (" , " -> ", ")
    text = re.sub(r'\s([.,;:!?])\s', r'\1 ', text)

    # 2. Collapse all runs of whitespace to single spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # 3. Capitalize sentences.
    # FIX: join with ' ' rather than '. ' — sent_tokenize keeps each
    # sentence's terminal punctuation, so a '. ' join doubled periods and
    # produced '?.' / '!.' artifacts the later cleanup passes missed.
    # NOTE(review): str.capitalize() lowercases the rest of the sentence,
    # so acronyms/proper nouns lose their casing — confirm this is intended.
    sentences = sent_tokenize(text)
    text = ' '.join(sentence.capitalize() for sentence in sentences)

    # 4. Standardize date formatting ("12 05 2023" -> "12/05/2023").
    # FIX: this must run BEFORE the generic number pass below, which would
    # otherwise consume the spaces first ("12 05 2023" -> "12.05.2023") and
    # leave this rule dead.
    text = re.sub(r'(\d{2})\s+(\d{2})\s+(\d{4})', r'\1/\2/\3', text)

    # 5. Standardize number formatting ("12 345" -> "12.345")
    text = re.sub(r'(\d+)\s+(\d+)', r'\1.\2', text)

    # 6. Ensure a space between a closing parenthesis and a following letter
    text = re.sub(r'\)\s*([a-zA-Z])', r') \1', text)

    # 7. Preserve bullet points (exactly one space after the bullet)
    text = re.sub(r'•\s*', '• ', text)

    # 8. Preserve numbered lists ("1.item" -> "1. item")
    text = re.sub(r'(\d+)\.\s*', r'\1. ', text)

    # 9. Remove extra periods separated by whitespace
    text = re.sub(r'\.\s+\.', '. ', text)

    # 10. Remove spacing just inside parentheses
    text = re.sub(r'\(\s*', '(', text)
    text = re.sub(r'\s*\)', ')', text)

    # 11. Remove stray standalone numbers.
    # FIX: the original r'\b(\d+)\b' also stripped the digits of the list
    # numbers (step 8) and dates (step 4) standardized above, leaving husks
    # like "//".  The lookarounds skip digits adjacent to '.', '/' or other
    # digit groups so those survive.
    text = re.sub(r'(?<![./\d])\b\d+\b(?![./\d])', '', text)

    # 12. Tidy residual spacing around punctuation left by the removals above
    while ' .' in text:
        text = text.replace(' .', '.')

    while '..' in text:
        text = text.replace('..', '.')

    # Collapse double spaces introduced by the digit removal
    while '  ' in text:
        text = text.replace('  ', ' ')

    text = text.replace(' :', ':')
    text = text.replace('- -', '-')
    text = text.replace('. -', '.')

    # 13. Collapse two punctuation marks in a row, keeping a single mark
    text = re.sub(r'([.,]){2,}', r'\1', text)
    text = re.sub(r'(?<=[:.])[:.]+', '', text)

    return text
|
80 |
+
|
81 |
def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
|
82 |
def calculate_similarity(embed1, embed2):
|
83 |
return cosine_similarity([embed1], [embed2])[0][0]
|
|
|
150 |
# Reorder sentences to maintain original flow
|
151 |
compressed_text.sort(key=lambda x: sentences.index(x))
|
152 |
|
153 |
+
joined_compressed_text = ' '.join(compressed_text)
|
154 |
+
joined_compressed_text_cleaned = clean_and_standardize_text(joined_compressed_text)
|
155 |
+
return joined_compressed_text_cleaned
|
156 |
|
157 |
async def predict(text, word_reduction_factor):
|
158 |
if len(text.split()) > 5000:
|