import re
from difflib import SequenceMatcher
from collections import defaultdict


def extract_special_characters(text):
    """Extracts all unique special characters from a text."""
    characters = re.findall(r'[^\w\s]', text)  # non-alphanumeric, non-space characters
    return ''.join(dict.fromkeys(characters))  # deduplicated, order preserved
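
# Example (illustrative): extract the punctuation of a key phrase so it can
# later be passed to clean_text() as the characters to keep:
#   extract_special_characters("Hello, world!")  ->  ",!"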


def clean_text(text, keep=""):
    """Removes special characters except those specified in 'keep', and converts to lowercase."""
    pattern = rf'[^\w\s{re.escape(keep)}]'
    return re.sub(pattern, '', text.lower())
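
# Example (illustrative): strip punctuation except '!' and lowercase:
#   clean_text("Hello, World!", keep="!")  ->  "hello world!"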


def text_similarity(text, key_text):
    """Calculates the similarity between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text, key_text).ratio()
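
# Example (illustrative): SequenceMatcher's ratio is 2*M/T, where M is the
# number of matching characters and T the combined length of both strings:
#   text_similarity("abcd", "abce")  ->  0.75  # M=3, T=8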


def detect_fragments(text, key_texts, threshold=0.7):
    """Checks whether a text contains fragments of any of the key texts."""
    for key_text in key_texts:
        characters_to_keep = extract_special_characters(key_text)
        words = clean_text(text, characters_to_keep).split()
        # Clean the key text the same way, so case and punctuation match.
        clean_key = clean_text(key_text, characters_to_keep)
        key_words = clean_key.split()
        # If the text is shorter than the key, a sliding window is impossible;
        # fall back to comparing the whole strings.
        if len(words) < len(key_words):
            similarity = text_similarity(text, key_text)
            if similarity >= threshold:
                return True, key_text, similarity
            continue
        # Slide a window of len(key_words) words across the text and compare
        # each candidate fragment against the cleaned key text.
        for i in range(len(words) - len(key_words) + 1):
            fragment = " ".join(words[i:i + len(key_words)])
            similarity = text_similarity(fragment, clean_key)
            if similarity >= threshold:
                return True, key_text, similarity
    return False, None, 0
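
# Example (illustrative): a key text buried inside a longer sentence is found
# by the sliding window even though the whole strings differ:
#   detect_fragments("we teach machine learning here", ["machine learning"])
#   ->  (True, 'machine learning', 1.0)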


def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
    """
    Analyzes the similarity between a list of texts and key texts.
    Returns a detailed report on the similarities found.
    """
    results = {
        "similar_texts": [],
        "fragments_detected": [],
        "combined": [],
        "statistics": defaultdict(int)
    }
    processed_texts = set()

    # Check direct similarity
    for i, text in enumerate(text_list):
        if not text.strip():
            continue
        for key_text in key_texts:
            if not key_text.strip():
                continue
            similarity = text_similarity(text, key_text)
            if similarity >= similarity_threshold:
                results["similar_texts"].append({
                    "index": i,
                    "text": text,
                    "key_text": key_text,
                    "similarity": similarity
                })
                results["statistics"]["direct_similarity"] += 1
                processed_texts.add(i)

    # Check fragments (skip texts already matched directly)
    for i, text in enumerate(text_list):
        if i in processed_texts or not text.strip():
            continue
        has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
        if has_fragment:
            results["fragments_detected"].append({
                "index": i,
                "text": text,
                "key_text": key_text,
                "similarity": similarity
            })
            results["statistics"]["fragments"] += 1
            processed_texts.add(i)

    # Check pairs of texts that match a key text when combined
    for i in range(len(text_list)):
        if i in processed_texts or not text_list[i].strip():
            continue
        for j in range(i + 1, len(text_list)):
            if j in processed_texts or not text_list[j].strip():
                continue
            combined_text = text_list[i] + " " + text_list[j]
            for key_text in key_texts:
                if not key_text.strip():
                    continue
                similarity = text_similarity(combined_text, key_text)
                if similarity >= similarity_threshold:
                    results["combined"].append({
                        "indices": [i, j],
                        "texts": [text_list[i], text_list[j]],
                        "combined_text": combined_text,
                        "key_text": key_text,
                        "similarity": similarity
                    })
                    results["statistics"]["combined"] += 1
                    processed_texts.add(i)
                    processed_texts.add(j)
                    break

    # Calculate overall statistics
    valid_texts = sum(1 for text in text_list if text.strip())
    results["statistics"]["total_analyzed"] = valid_texts
    results["statistics"]["total_processed"] = len(processed_texts)
    return results
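

if __name__ == "__main__":
    # Minimal usage sketch; the sample texts below are hypothetical.
    key_texts = ["machine learning", "natural language processing"]
    texts = [
        "Machine learning",               # near-exact match (case differs)
        "I love machine learning a lot",  # contains a key text as a fragment
        "natural lang",                   # first half of a key text...
        "uage processing",                # ...second half: combined, they match
    ]
    report = analyze_similarity(texts, key_texts,
                                similarity_threshold=0.8,
                                fragment_threshold=0.8)
    for category in ("similar_texts", "fragments_detected", "combined"):
        for hit in report[category]:
            print(category, hit)
    print(dict(report["statistics"]))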