Update text_similarity.py
Browse files - text_similarity.py +5 -0
text_similarity.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1 |
import re
|
|
|
2 |
from difflib import SequenceMatcher
|
3 |
from collections import defaultdict
|
4 |
|
|
|
|
|
5 |
def extract_special_characters(text):
|
6 |
"""Extracts all unique special characters from the given text."""
|
7 |
characters = re.findall(r'[^\w\s]', text) # Finds characters that are neither word characters (letters, digits, underscore) nor whitespace
|
@@ -22,6 +25,8 @@ def detect_fragments(text, key_texts, threshold=0.7):
|
|
22 |
characters_to_not_clean = extract_special_characters(key_text)
|
23 |
words = clean_text(text, characters_to_not_clean).split()
|
24 |
|
|
|
|
|
25 |
key_words = key_text.split()
|
26 |
|
27 |
# If the text is too short, we can't make an effective sliding window
|
|
|
1 |
import re
|
2 |
+
import logging
|
3 |
from difflib import SequenceMatcher
|
4 |
from collections import defaultdict
|
5 |
|
6 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
7 |
+
|
8 |
def extract_special_characters(text):
|
9 |
"""Extracts all unique special characters from the given text."""
|
10 |
characters = re.findall(r'[^\w\s]', text) # Finds characters that are neither word characters (letters, digits, underscore) nor whitespace
|
|
|
25 |
characters_to_not_clean = extract_special_characters(key_text)
|
26 |
words = clean_text(text, characters_to_not_clean).split()
|
27 |
|
28 |
+
logging.info(f"Words detected: {words}")
|
29 |
+
|
30 |
key_words = key_text.split()
|
31 |
|
32 |
# If the text is too short, we can't make an effective sliding window
|