MarioPrzBasto committed on
Commit
506344e
·
verified ·
1 Parent(s): 3381080

Update text_similarity.py

Browse files
Files changed (1) hide show
  1. text_similarity.py +5 -0
text_similarity.py CHANGED
@@ -1,7 +1,10 @@
1
  import re
 
2
  from difflib import SequenceMatcher
3
  from collections import defaultdict
4
 
 
 
5
  def extract_special_characters(text):
6
  """Extracts all unique special characters from a list of texts."""
7
  characters = re.findall(r'[^\w\s]', text) # Finds non-alphanumeric and non-space characters
@@ -22,6 +25,8 @@ def detect_fragments(text, key_texts, threshold=0.7):
22
  characters_to_not_clean = extract_special_characters(key_text)
23
  words = clean_text(text, characters_to_not_clean).split()
24
 
 
 
25
  key_words = key_text.split()
26
 
27
  # If the text is too short, we can't make an effective sliding window
 
1
  import re
2
+ import logging
3
  from difflib import SequenceMatcher
4
  from collections import defaultdict
5
 
6
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
7
+
8
  def extract_special_characters(text):
9
  """Extracts all unique special characters from a list of texts."""
10
  characters = re.findall(r'[^\w\s]', text) # Finds non-alphanumeric and non-space characters
 
25
  characters_to_not_clean = extract_special_characters(key_text)
26
  words = clean_text(text, characters_to_not_clean).split()
27
 
28
+ logging.info(f"Words detected: {words}")
29
+
30
  key_words = key_text.split()
31
 
32
  # If the text is too short, we can't make an effective sliding window