File size: 6,328 Bytes
990f77e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import pandas as pd
import networkx as nx
import spacy
import pickle
from datetime import datetime
import os

# Load the spaCy "en_core_web_sm" pipeline once at import time; it is the
# named-entity recognizer used by KnowledgeGraphBuilder.extract_entities.
nlp = spacy.load("en_core_web_sm")

class KnowledgeGraphBuilder:
    """Build a directed entity co-occurrence graph from labelled texts.

    Nodes are the entity strings returned by spaCy. Each node stores the
    entity label seen when it was first added under ``type`` and the number
    of mentions found in real and fake texts under ``real_count`` and
    ``fake_count``. An edge ``a -> b`` is added when ``a`` is mentioned
    before ``b`` in the same text; its ``weight`` counts how many such
    ordered pairs have been seen.
    """

    def __init__(self, model_dir="models"):
        """Create an empty graph.

        Args:
            model_dir: Directory used for the default output path of
                ``save_knowledge_graph``.
        """
        self.model_dir = model_dir
        self.knowledge_graph = nx.DiGraph()

    def extract_entities(self, text):
        """Extract named entities from text using spaCy.

        Args:
            text: Value to analyse. Non-string values are converted with
                ``str``; ``None``, NaN and blank strings yield no entities.

        Returns:
            A list of ``(entity_text, entity_label)`` tuples in the order
            spaCy reports them, or an empty list when there is nothing to
            analyse or processing fails.
        """
        try:
            # Missing values (None / NaN coming from pandas) carry no text.
            if text is None or pd.isna(text):
                return []

            # str() also covers numeric cells, so no separate float/int
            # conversion is needed.
            text = str(text).strip()
            if not text:
                return []

            doc = nlp(text)
            return [(ent.text, ent.label_) for ent in doc.ents]
        except Exception as e:
            # Best-effort: one bad record must not abort a long build.
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
            return []

    def update_knowledge_graph(self, text, is_real):
        """Update knowledge graph with entities and their relationships.

        Args:
            text: Text whose entities are added to the graph.
            is_real: Truthy when the text is labelled real; selects which
                of ``real_count`` / ``fake_count`` is incremented.

        Errors are printed and swallowed so that a single failing text does
        not stop the caller's loop.
        """
        try:
            entities = self.extract_entities(text)

            # Skip if no entities were found
            if not entities:
                return

            graph = self.knowledge_graph
            count_key = 'real_count' if is_real else 'fake_count'

            # One increment per mention; the node keeps the entity type it
            # was first seen with.
            for entity, entity_type in entities:
                if not graph.has_node(entity):
                    graph.add_node(
                        entity,
                        type=entity_type,
                        real_count=0,
                        fake_count=0
                    )
                graph.nodes[entity][count_key] += 1

            # Link every mention to every later mention in the same text.
            for i, (source, _) in enumerate(entities):
                for target, _ in entities[i + 1:]:
                    if source == target:
                        # A repeated mention of the same entity is not a
                        # relationship; without this guard it would create
                        # a self-loop edge.
                        continue
                    if graph.has_edge(source, target):
                        graph[source][target]['weight'] += 1
                    else:
                        # NOTE: is_real records only the label of the text
                        # that first created the edge; it is not updated
                        # by later texts.
                        graph.add_edge(
                            source,
                            target,
                            weight=1,
                            is_real=is_real
                        )
        except Exception as e:
            print(f"Error updating knowledge graph: {str(e)}")

    def save_knowledge_graph(self, filename=None):
        """Save the knowledge graph to a pickle file.

        The graph is stored as a plain dict with two keys: ``'nodes'``
        (node -> attribute dict) and ``'edges'`` (source -> target ->
        attribute dict).

        Args:
            filename: Output path. When ``None``, a timestamped
                ``knowledge_graph_<YYYYmmdd_HHMMSS>.pkl`` inside
                ``self.model_dir`` is used.

        Returns:
            The path written, or ``None`` if writing failed.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = os.path.join(self.model_dir, f"knowledge_graph_{timestamp}.pkl")

        # Create the directory that will actually hold the file. An explicit
        # filename may live outside model_dir, and a bare file name has no
        # directory component at all.
        target_dir = os.path.dirname(filename) or "."
        os.makedirs(target_dir, exist_ok=True)

        # Convert the graph to a dictionary format for better serialization
        graph_data = {
            'nodes': dict(self.knowledge_graph.nodes(data=True)),
            'edges': {}
        }

        # Group edge attributes by source node, then by target node.
        for u, v, data in self.knowledge_graph.edges(data=True):
            graph_data['edges'].setdefault(u, {})[v] = data

        try:
            with open(filename, 'wb') as f:
                pickle.dump(graph_data, f)
            print(f"Knowledge graph saved to {filename}")
            print(f"Total nodes: {len(graph_data['nodes'])}")
            print(f"Total edges: {sum(len(edges) for edges in graph_data['edges'].values())}")
            return filename
        except Exception as e:
            print(f"Error saving knowledge graph: {str(e)}")
            return None

    def get_graph_statistics(self):
        """Get basic statistics about the knowledge graph.

        Returns:
            A dict with ``total_nodes``, ``total_edges``, ``entity_types``
            (entity label -> number of nodes) and ``reliability_scores``
            (node -> ``real_count / (real_count + fake_count)`` for nodes
            with at least one counted mention).
        """
        stats = {
            'total_nodes': self.knowledge_graph.number_of_nodes(),
            'total_edges': self.knowledge_graph.number_of_edges(),
            'entity_types': {},
            'reliability_scores': {}
        }
        type_counts = stats['entity_types']
        reliability_scores = stats['reliability_scores']

        for node, attrs in self.knowledge_graph.nodes(data=True):
            # Count entity types
            entity_type = attrs.get('type', 'UNKNOWN')
            type_counts[entity_type] = type_counts.get(entity_type, 0) + 1

            # Share of this entity's mentions that came from real texts;
            # nodes without any counted mention get no score.
            real_count = attrs.get('real_count', 0)
            total = real_count + attrs.get('fake_count', 0)
            if total > 0:
                reliability_scores[node] = real_count / total

        return stats

def main(csv_path='./combined.csv'):
    """Build the knowledge graph from a labelled CSV file, save it and print stats.

    Args:
        csv_path: CSV file with a ``text`` column and a ``label`` column.
            Rows whose label equals ``'REAL'`` are counted as real; all
            other rows are counted as fake.
    """
    # Initialize the knowledge graph builder
    builder = KnowledgeGraphBuilder()

    # Load your dataset
    df = pd.read_csv(csv_path)  # Replace with your actual data file

    # Create knowledge graph
    print("Building knowledge graph...")
    total_rows = len(df)
    # Track progress by row position rather than by index label, so the
    # counter stays correct even if the frame's index is not 0..n-1.
    for processed, (idx, row) in enumerate(df.iterrows(), start=1):
        try:
            builder.update_knowledge_graph(row['text'], row['label'] == 'REAL')
            if processed % 100 == 0:
                print(f"Processed {processed}/{total_rows} entries ({processed/total_rows*100:.1f}%)...")
        except Exception as e:
            print(f"Error processing row {idx}: {str(e)}")
            continue

    # Save the knowledge graph (the method reports the path or the error itself)
    builder.save_knowledge_graph()

    # Print statistics
    stats = builder.get_graph_statistics()
    print("\nKnowledge Graph Statistics:")
    print(f"Total nodes: {stats['total_nodes']}")
    print(f"Total edges: {stats['total_edges']}")
    print("\nEntity types distribution:")
    for entity_type, count in stats['entity_types'].items():
        print(f"{entity_type}: {count}")

# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()