"""Build an entity co-occurrence knowledge graph from labelled news texts.

Named entities are extracted from each text with spaCy and stored as nodes of
a directed NetworkX graph. Entities that occur in the same text are connected
by weighted edges, and every node tracks how many of its mentions came from
texts labelled real versus fake.
"""

import os
import pickle
from datetime import datetime
from itertools import combinations

import networkx as nx
import pandas as pd
import spacy

# Load spaCy for NER
nlp = spacy.load("en_core_web_sm")


class KnowledgeGraphBuilder:
    """Incrementally build, summarise and persist the entity graph.

    Attributes:
        model_dir: Directory used for the default output file of
            ``save_knowledge_graph``.
        knowledge_graph: ``networkx.DiGraph`` whose nodes are entity strings
            carrying ``type``, ``real_count`` and ``fake_count`` attributes,
            and whose edges carry ``weight`` and ``is_real`` attributes.
    """

    def __init__(self, model_dir="models"):
        self.model_dir = model_dir
        self.knowledge_graph = nx.DiGraph()

    def extract_entities(self, text):
        """Extract named entities from text using spaCy.

        Args:
            text: Value to analyse. Anything that is not already a string is
                converted with ``str()``; ``None`` and NaN yield no entities.

        Returns:
            A list of ``(entity_text, entity_label)`` tuples in document
            order. The list is empty for missing or blank input, and also
            when processing fails (the error is printed, not raised).
        """
        try:
            # Missing values (None, NaN) carry no entities.
            if text is None or pd.isna(text):
                return []

            # str() also covers numeric cells read from a CSV.
            text = str(text).strip()
            if not text:
                return []

            return [(ent.text, ent.label_) for ent in nlp(text).ents]
        except Exception as e:
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
            return []

    def update_knowledge_graph(self, text, is_real):
        """Update knowledge graph with entities and their relationships.

        Every entity mention increments the node's ``real_count`` or
        ``fake_count`` (the node is created on first sight with the entity
        label as ``type``). Each pair of distinct entities found in the text
        gets a directed edge from the earlier mention to the later one; an
        existing edge has its ``weight`` incremented, while ``is_real`` keeps
        the value it was given when the edge was first created.

        Args:
            text: Text to extract entities from.
            is_real: Truthy if the text is labelled as real news.

        Returns:
            None. Errors are printed rather than raised so that one bad text
            does not abort a bulk build.
        """
        try:
            entities = self.extract_entities(text)

            # Skip if no entities were found
            if not entities:
                return

            graph = self.knowledge_graph
            count_key = 'real_count' if is_real else 'fake_count'

            for entity, entity_type in entities:
                if not graph.has_node(entity):
                    graph.add_node(
                        entity,
                        type=entity_type,
                        real_count=0,
                        fake_count=0
                    )
                graph.nodes[entity][count_key] += 1

            # Add edges between entities in the same text
            for (entity1, _), (entity2, _) in combinations(entities, 2):
                # A repeated mention of the same entity would otherwise
                # create a meaningless self-loop.
                if entity1 == entity2:
                    continue
                if graph.has_edge(entity1, entity2):
                    graph[entity1][entity2]['weight'] += 1
                else:
                    graph.add_edge(
                        entity1,
                        entity2,
                        weight=1,
                        is_real=is_real
                    )

        except Exception as e:
            print(f"Error updating knowledge graph: {str(e)}")

    def save_knowledge_graph(self, filename=None):
        """Save the knowledge graph to a pickle file.

        The graph is stored as a plain dictionary with two keys: ``'nodes'``
        maps each node to its attribute dict, and ``'edges'`` maps each
        source node to a dict of ``{target: edge_attributes}``.

        Args:
            filename: Destination path. Defaults to a timestamped
                ``knowledge_graph_<YYYYmmdd_HHMMSS>.pkl`` inside
                ``self.model_dir``.

        Returns:
            The path written to, or ``None`` if saving failed (the error is
            printed).
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = os.path.join(self.model_dir, f"knowledge_graph_{timestamp}.pkl")

        os.makedirs(self.model_dir, exist_ok=True)

        # Convert the graph to a dictionary format for better serialization
        graph_data = {
            'nodes': dict(self.knowledge_graph.nodes(data=True)),
            'edges': {}
        }

        # Properly format edges with their data
        for u, v, data in self.knowledge_graph.edges(data=True):
            graph_data['edges'].setdefault(u, {})[v] = data

        try:
            # A caller-supplied filename may live outside model_dir; make
            # sure its directory exists before opening the file.
            parent_dir = os.path.dirname(filename)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            with open(filename, 'wb') as f:
                pickle.dump(graph_data, f)
            print(f"Knowledge graph saved to {filename}")
            print(f"Total nodes: {len(graph_data['nodes'])}")
            print(f"Total edges: {sum(len(edges) for edges in graph_data['edges'].values())}")
            return filename
        except Exception as e:
            print(f"Error saving knowledge graph: {str(e)}")
            return None

    def get_graph_statistics(self):
        """Get basic statistics about the knowledge graph.

        Returns:
            A dict with:
              * ``'total_nodes'`` / ``'total_edges'``: graph sizes.
              * ``'entity_types'``: number of nodes per ``type`` attribute
                (``'UNKNOWN'`` when a node has none).
              * ``'reliability_scores'``: for every node with at least one
                counted mention, ``real_count / (real_count + fake_count)``.
        """
        stats = {
            'total_nodes': self.knowledge_graph.number_of_nodes(),
            'total_edges': self.knowledge_graph.number_of_edges(),
            'entity_types': {},
            'reliability_scores': {}
        }

        entity_types = stats['entity_types']
        for node, attrs in self.knowledge_graph.nodes(data=True):
            # Count entity types
            entity_type = attrs.get('type', 'UNKNOWN')
            entity_types[entity_type] = entity_types.get(entity_type, 0) + 1

            # Calculate reliability score
            real_count = attrs.get('real_count', 0)
            total = real_count + attrs.get('fake_count', 0)
            if total > 0:
                stats['reliability_scores'][node] = real_count / total

        return stats


def main():
    """Build the graph from ``./combined.csv``, save it and print statistics.

    The CSV is read for its ``text`` and ``label`` columns; a row counts as
    real when its label equals ``'REAL'``.
    """
    # Initialize the knowledge graph builder
    builder = KnowledgeGraphBuilder()

    # Load your dataset
    df = pd.read_csv('./combined.csv')  # Replace with your actual data file

    # Create knowledge graph
    print("Building knowledge graph...")
    total_rows = len(df)

    # Progress is tracked with a positional counter: the DataFrame index
    # label is not guaranteed to be a 0-based integer.
    for processed, (idx, row) in enumerate(df.iterrows(), start=1):
        try:
            builder.update_knowledge_graph(row['text'], row['label'] == 'REAL')
            if processed % 100 == 0:
                print(f"Processed {processed}/{total_rows} entries ({processed/total_rows*100:.1f}%)...")
        except Exception as e:
            print(f"Error processing row {idx}: {str(e)}")
            continue

    # Save the knowledge graph
    builder.save_knowledge_graph()

    # Print statistics
    stats = builder.get_graph_statistics()
    print("\nKnowledge Graph Statistics:")
    print(f"Total nodes: {stats['total_nodes']}")
    print(f"Total edges: {stats['total_edges']}")
    print("\nEntity types distribution:")
    for entity_type, count in stats['entity_types'].items():
        print(f"{entity_type}: {count}")


if __name__ == "__main__":
    main()