import re
from difflib import SequenceMatcher
from collections import defaultdict


def extract_special_characters(text):
    """Extracts all unique special characters from a text."""
    characters = re.findall(r'[^\w\s]', text)
    return ''.join(dict.fromkeys(characters))  # deduplicate, preserving order


def clean_text(text, keep=""):
    """Removes special characters except those specified in 'keep', and converts to lowercase."""
    pattern = rf'[^\w\s{re.escape(keep)}]'
    return re.sub(pattern, '', text.lower())


def text_similarity(text, key_text):
    """Calculates the similarity ratio between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text, key_text).ratio()


def detect_fragments(text, key_texts, threshold=0.7):
    """Checks whether a text contains fragments similar to any of the key texts."""
    for key_text in key_texts:
        # Keep the key text's own special characters so both sides are cleaned alike,
        # then lowercase the key text to match the cleaned fragments.
        characters_to_not_clean = extract_special_characters(key_text)
        clean_key = clean_text(key_text, characters_to_not_clean)
        words = clean_text(text, characters_to_not_clean).split()
        key_words = clean_key.split()

        # Text shorter than the key text: compare the whole cleaned text directly.
        if len(words) < len(key_words):
            similarity = text_similarity(" ".join(words), clean_key)
            if similarity >= threshold:
                return True, key_text, similarity
            continue

        # Slide a window of len(key_words) words and compare each fragment.
        for i in range(len(words) - len(key_words) + 1):
            fragment = " ".join(words[i:i + len(key_words)])
            similarity = text_similarity(fragment, clean_key)
            if similarity >= threshold:
                return True, key_text, similarity
    return False, None, 0


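# For instance (hypothetical strings): detect_fragments("he said the quick
# brown fox jumps", ["quick brown fox"]) slides a 3-word window over the
# cleaned text and returns (True, "quick brown fox", 1.0) at the matching window.
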
def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
    """
    Analyzes the similarity between a list of texts and key texts.
    Returns a detailed report on the similarities found.
    """
    results = {
        "similar_texts": [],
        "fragments_detected": [],
        "combined": [],
        "statistics": defaultdict(int)
    }

    processed_texts = set()

    # First pass: direct similarity between each text and each key text.
    for i, text in enumerate(text_list):
        if not text.strip():
            continue

        for key_text in key_texts:
            if not key_text.strip():
                continue

            similarity = text_similarity(text, key_text)
            if similarity >= similarity_threshold:
                results["similar_texts"].append({
                    "index": i,
                    "text": text,
                    "key_text": key_text,
                    "similarity": similarity
                })
                results["statistics"]["direct_similarity"] += 1
                processed_texts.add(i)

    # Second pass: fragment detection for texts that had no direct match.
    for i, text in enumerate(text_list):
        if i in processed_texts or not text.strip():
            continue

        found, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
        if found:
            results["fragments_detected"].append({
                "index": i,
                "text": text,
                "key_text": key_text,
                "similarity": similarity
            })
            results["statistics"]["fragments"] += 1
            processed_texts.add(i)

    # Third pass: check whether two leftover texts combined match a key text.
    for i in range(len(text_list)):
        if i in processed_texts or not text_list[i].strip():
            continue

        for j in range(i + 1, len(text_list)):
            if j in processed_texts or not text_list[j].strip():
                continue

            combined_text = text_list[i] + " " + text_list[j]
            for key_text in key_texts:
                if not key_text.strip():
                    continue

                similarity = text_similarity(combined_text, key_text)
                if similarity >= similarity_threshold:
                    results["combined"].append({
                        "indices": [i, j],
                        "texts": [text_list[i], text_list[j]],
                        "combined_text": combined_text,
                        "key_text": key_text,
                        "similarity": similarity
                    })
                    results["statistics"]["combined"] += 1
                    processed_texts.add(i)
                    processed_texts.add(j)
                    break

    # Summary statistics.
    valid_texts = sum(1 for text in text_list if text.strip())
    results["statistics"]["total_analyzed"] = valid_texts
    results["statistics"]["total_processed"] = len(processed_texts)

    return results
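

# A minimal usage sketch (illustrative only; the sample sentences and key text
# below are hypothetical test data, not from any real dataset). Each pass of
# analyze_similarity is exercised: a near-identical text, a text embedding the
# key as a fragment, and two texts that only match when combined.
if __name__ == "__main__":
    texts = [
        "The quick brown fox jumps over the lazy dog!",
        "Witnesses said that yesterday the quick brown fox jumps over the lazy dog near the old barn.",
        "A completely unrelated sentence.",
        "the quick brown",
        "fox jumps over the lazy dog",
    ]
    keys = ["The quick brown fox jumps over the lazy dog."]

    report = analyze_similarity(texts, keys)
    print("Direct matches:", len(report["similar_texts"]))
    print("Fragments detected:", len(report["fragments_detected"]))
    print("Combined matches:", len(report["combined"]))
    print("Statistics:", dict(report["statistics"]))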