import re
from difflib import SequenceMatcher
from collections import defaultdict


def extract_special_characters(text):
    """Extracts all unique special characters from a text."""
    characters = re.findall(r'[^\w\s]', text)  # Finds non-alphanumeric, non-whitespace characters
    return ''.join(set(characters))  # Deduplicate; order is irrelevant inside a regex character class


def clean_text(text, keep=""):
    """Removes special characters except those specified in 'keep', and converts to lowercase."""
    pattern = rf'[^\w\s{re.escape(keep)}]'
    return re.sub(pattern, '', text.lower())


def text_similarity(text, key_text):
    """Calculates the similarity ratio between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text, key_text).ratio()


def detect_fragments(text, key_texts, threshold=0.7):
    """Checks if a text contains fragments of key texts.

    Returns a tuple (found, matched_key_text, similarity).
    """
    for key_text in key_texts:
        # Preserve the key text's own special characters so they survive cleaning
        characters_to_not_clean = extract_special_characters(key_text)
        words = clean_text(text, characters_to_not_clean).split()
        key_words = key_text.split()

        # If the text is too short, we can't make an effective sliding window
        if len(words) < len(key_words):
            similarity = text_similarity(text, key_text)
            if similarity >= threshold:
                return True, key_text, similarity
            continue

        # Sliding window to compare word sequences of the key text's length
        for i in range(len(words) - len(key_words) + 1):
            fragment = " ".join(words[i:i + len(key_words)])
            # clean_text lowercases the fragment, so compare against the lowercased key
            similarity = text_similarity(fragment, key_text.lower())
            if similarity >= threshold:
                return True, key_text, similarity

    return False, None, 0


def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
    """
    Analyzes the similarity between a list of texts and key texts.

    Returns a detailed report on the similarities found.
    """
    results = {
        "similar_texts": [],
        "fragments_detected": [],
        "combined": [],
        "statistics": defaultdict(int)
    }

    processed_texts = set()

    # Check direct similarity
    for i, text in enumerate(text_list):
        if not text.strip():
            continue
        for key_text in key_texts:
            if not key_text.strip():
                continue
            similarity = text_similarity(text, key_text)
            if similarity >= similarity_threshold:
                results["similar_texts"].append({
                    "index": i,
                    "text": text,
                    "key_text": key_text,
                    "similarity": similarity
                })
                results["statistics"]["direct_similarity"] += 1
                processed_texts.add(i)

    # Check fragments
    for i, text in enumerate(text_list):
        if i in processed_texts or not text.strip():
            continue
        has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
        if has_fragment:
            results["fragments_detected"].append({
                "index": i,
                "text": text,
                "key_text": key_text,
                "similarity": similarity
            })
            results["statistics"]["fragments"] += 1
            processed_texts.add(i)

    # Check texts that can be combined
    for i in range(len(text_list)):
        if i in processed_texts or not text_list[i].strip():
            continue
        for j in range(i + 1, len(text_list)):
            if j in processed_texts or not text_list[j].strip():
                continue
            combined_text = text_list[i] + " " + text_list[j]
            for key_text in key_texts:
                if not key_text.strip():
                    continue
                similarity = text_similarity(combined_text, key_text)
                if similarity >= similarity_threshold:
                    results["combined"].append({
                        "indices": [i, j],
                        "texts": [text_list[i], text_list[j]],
                        "combined_text": combined_text,
                        "key_text": key_text,
                        "similarity": similarity
                    })
                    results["statistics"]["combined"] += 1
                    processed_texts.add(i)
                    processed_texts.add(j)
                    break
            if i in processed_texts:
                break  # text i was consumed by a combination; move on to the next i

    # Calculate overall statistics
    valid_texts = sum(1 for text in text_list if text.strip())
    results["statistics"]["total_analyzed"] = valid_texts
    results["statistics"]["total_processed"] = len(processed_texts)

    return results
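

# Usage sketch (illustrative only): the sample texts and thresholds below are
# assumptions, not part of the original module. They demonstrate one case for
# each detection pass (direct match, fragment, combination) and the shape of
# the returned report.
if __name__ == "__main__":
    key_texts = [
        "the quick brown fox jumps over the lazy dog",
    ]
    text_list = [
        "The quick brown fox jumps over the lazy dog",  # near-duplicate -> direct similarity
        "as the witness recalled later that evening, he said: "
        "the quick brown fox jumps over the lazy dog, and then he left the room",  # embedded fragment
        "the quick brown fox",        # first half, only matches when combined
        "jumps over the lazy dog",    # second half, only matches when combined
        "completely unrelated sentence",
        "   ",                        # blank entry, skipped by every pass
    ]

    report = analyze_similarity(text_list, key_texts,
                                similarity_threshold=0.7,
                                fragment_threshold=0.7)

    # Each hit records the originating index (or index pair) and its similarity score
    for category in ("similar_texts", "fragments_detected", "combined"):
        for hit in report[category]:
            location = hit.get("index", hit.get("indices"))
            print(f"{category}: text(s) {location} ~ {hit['similarity']:.2f}")
    print(dict(report["statistics"]))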