import re
from difflib import SequenceMatcher
from collections import defaultdict

def extract_special_characters(text):
    """Extracts the unique special characters found in a text."""
    characters = re.findall(r'[^\w\s]', text)  # Non-alphanumeric, non-whitespace characters
    return ''.join(dict.fromkeys(characters))  # Deduplicate while preserving order

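# Example (illustrative):
# extract_special_characters("Hello, world!") -> ",!"
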
def clean_text(text, keep=""): | |
"""Removes special characters except those specified in 'keep', and converts to lowercase.""" | |
pattern = rf'[^\w\s{re.escape(keep)}]' | |
return re.sub(pattern, '', text.lower()) | |
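# Example (illustrative): the comma is stripped, the kept "!" survives:
# clean_text("Hello, World!", keep="!") -> "hello world!"
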
def text_similarity(text, key_text):
    """Calculates the similarity ratio between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text, key_text).ratio()

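# Example (illustrative): 3 matching characters out of 8 total gives
# text_similarity("abcd", "abce") -> 0.75  (ratio = 2 * 3 / 8)
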
def detect_fragments(text, key_texts, threshold=0.7):
    """Checks whether a text contains fragments of any of the key texts."""
    for key_text in key_texts:
        characters_to_not_clean = extract_special_characters(key_text)
        words = clean_text(text, characters_to_not_clean).split()
        key_words = key_text.split()
        # If the text is too short, we can't build an effective sliding window,
        # so fall back to comparing the whole text against the key text
        if len(words) < len(key_words):
            similarity = text_similarity(text, key_text)
            if similarity >= threshold:
                return True, key_text, similarity
            continue
        # Sliding window comparing word sequences of the same length as the key text
        for i in range(len(words) - len(key_words) + 1):
            fragment = " ".join(words[i:i + len(key_words)])
            # clean_text lowercases the fragment, so lowercase the key text too
            similarity = text_similarity(fragment, key_text.lower())
            if similarity >= threshold:
                return True, key_text, similarity
    return False, None, 0

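# Example (illustrative): the first three-word window matches the key exactly:
# detect_fragments("quick brown fox jumps high", ["quick brown fox"])
#   -> (True, "quick brown fox", 1.0)
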
def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
    """
    Analyzes the similarity between a list of texts and a list of key texts.
    Returns a detailed report on the similarities found.
    """
    results = {
        "similar_texts": [],
        "fragments_detected": [],
        "combined": [],
        "statistics": defaultdict(int)
    }
    processed_texts = set()

    # Check direct similarity
    for i, text in enumerate(text_list):
        if not text.strip():
            continue
        for key_text in key_texts:
            if not key_text.strip():
                continue
            similarity = text_similarity(text, key_text)
            if similarity >= similarity_threshold:
                results["similar_texts"].append({
                    "index": i,
                    "text": text,
                    "key_text": key_text,
                    "similarity": similarity
                })
                results["statistics"]["direct_similarity"] += 1
                processed_texts.add(i)

    # Check fragments (disabled: while this block is commented out,
    # results["fragments_detected"] stays empty and fragment_threshold is unused)
    # for i, text in enumerate(text_list):
    #     if i in processed_texts or not text.strip():
    #         continue
    #     has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
    #     if has_fragment:
    #         results["fragments_detected"].append({
    #             "index": i,
    #             "text": text,
    #             "key_text": key_text,
    #             "similarity": similarity
    #         })
    #         results["statistics"]["fragments"] += 1
    #         processed_texts.add(i)

    # Check texts that can be combined to match a key text
    for i in range(len(text_list)):
        if i in processed_texts or not text_list[i].strip():
            continue
        for j in range(i + 1, len(text_list)):
            if j in processed_texts or not text_list[j].strip():
                continue
            combined_text = text_list[i] + " " + text_list[j]
            for key_text in key_texts:
                if not key_text.strip():
                    continue
                similarity = text_similarity(combined_text, key_text)
                if similarity >= similarity_threshold:
                    results["combined"].append({
                        "indices": [i, j],
                        "texts": [text_list[i], text_list[j]],
                        "combined_text": combined_text,
                        "key_text": key_text,
                        "similarity": similarity
                    })
                    results["statistics"]["combined"] += 1
                    processed_texts.add(i)
                    processed_texts.add(j)
                    break
            if i in processed_texts:
                break  # i has been paired; move on to the next i

    # Calculate overall statistics
    valid_texts = sum(1 for text in text_list if text.strip())
    results["statistics"]["total_analyzed"] = valid_texts
    results["statistics"]["total_processed"] = len(processed_texts)

    return results
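
# Minimal usage sketch (illustrative only; the sample texts below are made up
# for demonstration, not data shipped with this module):
if __name__ == "__main__":
    texts = [
        "The quick brown fox",
        "jumps over the lazy dog",
        "An unrelated sentence",
    ]
    keys = ["The quick brown fox jumps over the lazy dog"]
    report = analyze_similarity(texts, keys, similarity_threshold=0.7)
    print(dict(report["statistics"]))
    # Texts 0 and 1 only match the key text once concatenated,
    # so they are reported under "combined"
    for match in report["combined"]:
        print(match["indices"], "->", match["key_text"], f"{match['similarity']:.2f}")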