import re
from difflib import SequenceMatcher
from collections import defaultdict

def extract_special_characters(text):
    """Extracts all unique special characters from a list of texts."""
    characters = re.findall(r'[^\w\s]', text)  # Finds non-alphanumeric and non-space characters
    return ''.join(characters)

def clean_text(text, keep=""):
    """Removes special characters except those specified in 'keep', and converts to lowercase."""
    pattern = rf'[^\w\s{re.escape(keep)}]'
    return re.sub(pattern, '', text.lower())

def text_similarity(text, key_text):
    """Calculates the similarity between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text, key_text).ratio()

def detect_fragments(text, key_texts, threshold=0.7):
    """Checks if a text contains fragments of key texts."""
    for key_text in key_texts:
        characters_to_not_clean = extract_special_characters(key_text)
        words = clean_text(text, characters_to_not_clean).split()

        # Normalize the key text the same way as the text, so that case and
        # punctuation differences do not drag the similarity score down.
        cleaned_key_text = clean_text(key_text, characters_to_not_clean)
        key_words = cleaned_key_text.split()

        # If the text is shorter than the key text, a sliding window is not
        # possible; compare the whole cleaned text directly instead.
        if len(words) < len(key_words):
            similarity = text_similarity(" ".join(words), cleaned_key_text)
            if similarity >= threshold:
                return True, key_text, similarity
            continue

        # Slide a window of len(key_words) words across the text and compare
        # each word sequence against the key text.
        for i in range(len(words) - len(key_words) + 1):
            fragment = " ".join(words[i:i + len(key_words)])
            similarity = text_similarity(fragment, cleaned_key_text)
            if similarity >= threshold:
                return True, key_text, similarity
    return False, None, 0

def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
    """
    Analyzes the similarity between a list of texts and key texts.
    Returns a detailed report on the similarities found.
    """
    results = {
        "similar_texts": [],
        "fragments_detected": [],
        "combined": [],
        "statistics": defaultdict(int)
    }
    
    processed_texts = set()
    
    # Check direct similarity
    for i, text in enumerate(text_list):
        if not text.strip():
            continue
            
        for key_text in key_texts:
            if not key_text.strip():
                continue
                
            similarity = text_similarity(text, key_text)
            if similarity >= similarity_threshold:
                results["similar_texts"].append({
                    "index": i,
                    "text": text,
                    "key_text": key_text,
                    "similarity": similarity
                })
                results["statistics"]["direct_similarity"] += 1
                processed_texts.add(i)
    
    # Check fragments (skipping texts already matched directly)
    for i, text in enumerate(text_list):
        if i in processed_texts or not text.strip():
            continue

        has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
        if has_fragment:
            results["fragments_detected"].append({
                "index": i,
                "text": text,
                "key_text": key_text,
                "similarity": similarity
            })
            results["statistics"]["fragments"] += 1
            processed_texts.add(i)
    
    # Check texts that can be combined
    for i in range(len(text_list)):
        if i in processed_texts or not text_list[i].strip():
            continue
            
        for j in range(i+1, len(text_list)):
            if j in processed_texts or not text_list[j].strip():
                continue
                
            combined_text = text_list[i] + " " + text_list[j]
            for key_text in key_texts:
                if not key_text.strip():
                    continue
                    
                similarity = text_similarity(combined_text, key_text)
                if similarity >= similarity_threshold:
                    results["combined"].append({
                        "indices": [i, j],
                        "texts": [text_list[i], text_list[j]],
                        "combined_text": combined_text,
                        "key_text": key_text,
                        "similarity": similarity
                    })
                    results["statistics"]["combined"] += 1
                    processed_texts.add(i)
                    processed_texts.add(j)
                    break
    
    # Calculate overall statistics
    valid_texts = sum(1 for text in text_list if text.strip())
    results["statistics"]["total_analyzed"] = valid_texts
    results["statistics"]["total_processed"] = len(processed_texts)
    
    return results
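

# Minimal usage sketch. The sample strings below are illustrative only (they
# are not part of the original module) and are chosen so that each section of
# the report can receive a hit: a near-exact match, a text containing a key
# fragment, and two texts that only match a key text when combined.
if __name__ == "__main__":
    sample_texts = [
        "The quick brown fox jumps over the lazy dog",  # near-exact match
        "He said that machine learning is fun today",   # contains a key fragment
        "The quick brown",                              # first half of a key text
        "fox jumps over the lazy dog",                  # second half of a key text
    ]
    sample_keys = [
        "The quick brown fox jumps over the lazy dog!",
        "machine learning",
    ]

    report = analyze_similarity(sample_texts, sample_keys,
                                similarity_threshold=0.8,
                                fragment_threshold=0.8)

    print("Direct matches:    ", len(report["similar_texts"]))
    print("Fragments detected:", len(report["fragments_detected"]))
    print("Combinable pairs:  ", len(report["combined"]))
    print("Statistics:        ", dict(report["statistics"]))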