import re
import logging
from difflib import SequenceMatcher
from collections import defaultdict
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def extract_special_characters(text):
    """Extracts all special characters from a text string."""
    characters = re.findall(r'[^\w\s]', text)  # Finds non-alphanumeric and non-space characters
    return ''.join(characters)


def clean_text(text, keep=""):
    """Removes special characters except those specified in 'keep', and converts to lowercase."""
    pattern = rf'[^\w\s{re.escape(keep)}]'
    return re.sub(pattern, '', text.lower())


def text_similarity(text, key_text):
    """Calculates the similarity between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text, key_text).ratio()


def detect_fragments(text, key_texts, threshold=0.7):
    """Checks if a text contains fragments of key texts."""
    for key_text in key_texts:
        characters_to_not_clean = extract_special_characters(key_text)
        words = clean_text(text, characters_to_not_clean).split()
        logging.info(f"Words detected: {words}")
        key_words = key_text.split()
        # If the text is too short, we can't make an effective sliding window
        if len(words) < len(key_words):
            similarity = text_similarity(text, key_text)
            if similarity >= threshold:
                return True, key_text, similarity
            continue
        # Sliding window to compare word sequences
        for i in range(len(words) - len(key_words) + 1):
            fragment = " ".join(words[i:i + len(key_words)])
            similarity = text_similarity(fragment, key_text)
            if similarity >= threshold:
                return True, key_text, similarity
    return False, None, 0


def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
    """
    Analyzes the similarity between a list of texts and key texts.
    Returns a detailed report on the similarities found.
    """
    results = {
        "similar_texts": [],
        "fragments_detected": [],
        "combined": [],
        "statistics": defaultdict(int)
    }
    processed_texts = set()

    # Check direct similarity
    for i, text in enumerate(text_list):
        if not text.strip():
            continue
        for key_text in key_texts:
            if not key_text.strip():
                continue
            similarity = text_similarity(text, key_text)
            if similarity >= similarity_threshold:
                results["similar_texts"].append({
                    "index": i,
                    "text": text,
                    "key_text": key_text,
                    "similarity": similarity
                })
                results["statistics"]["direct_similarity"] += 1
                processed_texts.add(i)

    # Check fragments (this pass is currently commented out, so detect_fragments
    # and fragment_threshold are unused until it is re-enabled)
    # for i, text in enumerate(text_list):
    #     if i in processed_texts or not text.strip():
    #         continue
    #     has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
    #     if has_fragment:
    #         results["fragments_detected"].append({
    #             "index": i,
    #             "text": text,
    #             "key_text": key_text,
    #             "similarity": similarity
    #         })
    #         results["statistics"]["fragments"] += 1
    #         processed_texts.add(i)

    # Check texts that can be combined
    for i in range(len(text_list)):
        if i in processed_texts or not text_list[i].strip():
            continue
        for j in range(i + 1, len(text_list)):
            if j in processed_texts or not text_list[j].strip():
                continue
            combined_text = text_list[i] + " " + text_list[j]
            for key_text in key_texts:
                if not key_text.strip():
                    continue
                similarity = text_similarity(combined_text, key_text)
                if similarity >= similarity_threshold:
                    results["combined"].append({
                        "indices": [i, j],
                        "texts": [text_list[i], text_list[j]],
                        "combined_text": combined_text,
                        "key_text": key_text,
                        "similarity": similarity
                    })
                    results["statistics"]["combined"] += 1
                    processed_texts.add(i)
                    processed_texts.add(j)
                    break

    # Calculate overall statistics
    valid_texts = sum(1 for text in text_list if text.strip())
    results["statistics"]["total_analyzed"] = valid_texts
    results["statistics"]["total_processed"] = len(processed_texts)
    return results
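

# --- Usage sketch (illustrative) ---
# A minimal, hypothetical example of how analyze_similarity might be invoked;
# the sample texts, key texts, and threshold below are assumptions for
# demonstration and are not part of the module itself.
if __name__ == "__main__":
    key_texts = ["machine learning", "natural language processing"]
    texts = [
        "Machine learning",   # close to a key text
        "natural language",   # a fragment of a key text
        "processing",         # another fragment that may only match when combined
        "",                   # empty entries are skipped
    ]
    report = analyze_similarity(texts, key_texts, similarity_threshold=0.7)
    logging.info(f"Similar texts: {report['similar_texts']}")
    logging.info(f"Combined matches: {report['combined']}")
    logging.info(f"Statistics: {dict(report['statistics'])}")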