# File size: 6,328 Bytes
# 990f77e
import pandas as pd
import networkx as nx
import spacy
import pickle
from datetime import datetime
import os
# Load spaCy's "en_core_web_sm" pipeline once at import time; the shared
# `nlp` object is what extract_entities() calls to run NER on each text.
nlp = spacy.load("en_core_web_sm")
class KnowledgeGraphBuilder:
    """Build a directed entity co-occurrence graph from labelled texts.

    Nodes are the entity strings returned by the module-level spaCy pipeline
    ``nlp``.  Every node stores the entity label under ``type`` together with
    ``real_count`` / ``fake_count`` mention counters.  For every pair of
    distinct entities found in the same text an edge is added from the entity
    listed first to the one listed later; its ``weight`` counts how often that
    ordered pair has been seen.
    """

    def __init__(self, model_dir="models"):
        """Create a builder with an empty graph.

        Args:
            model_dir: Directory used to build the default save path in
                ``save_knowledge_graph``.
        """
        self.model_dir = model_dir
        self.knowledge_graph = nx.DiGraph()

    def extract_entities(self, text):
        """Extract named entities from text using spaCy.

        Args:
            text: Value to analyse.  ``None``, NaN (anything ``pd.isna``
                flags) and blank strings produce no entities; every other
                value is converted with ``str()`` and stripped first.

        Returns:
            A list of ``(entity_text, entity_label)`` tuples.  The list is
            empty when there is nothing to analyse or when extraction fails;
            failures are printed rather than raised.
        """
        try:
            if text is None or pd.isna(text):
                return []
            # str() already covers floats/ints, so no separate numeric branch.
            text = str(text).strip()
            if not text:
                return []
            return [(ent.text, ent.label_) for ent in nlp(text).ents]
        except Exception as e:
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
            return []

    def update_knowledge_graph(self, text, is_real):
        """Update knowledge graph with entities and their relationships.

        Args:
            text: Raw text whose entities are added to the graph.
            is_real: Truthy when the text is labelled real; selects which
                node counter is incremented.

        Errors are printed and swallowed so one bad text does not abort a
        bulk build.
        """
        try:
            entities = self.extract_entities(text)
            # Nothing to add when no entities were found.
            if not entities:
                return

            graph = self.knowledge_graph
            counter_key = 'real_count' if is_real else 'fake_count'

            # One counter increment per mention of the entity.
            for entity, entity_type in entities:
                if graph.has_node(entity):
                    graph.nodes[entity][counter_key] += 1
                else:
                    graph.add_node(
                        entity,
                        type=entity_type,
                        real_count=1 if is_real else 0,
                        fake_count=0 if is_real else 1,
                    )

            # Link each entity to every entity listed after it in this text.
            for i, (source, _) in enumerate(entities):
                for target, _ in entities[i + 1:]:
                    # A repeated mention would otherwise link an entity to
                    # itself, which says nothing about co-occurrence.
                    if source == target:
                        continue
                    if graph.has_edge(source, target):
                        graph[source][target]['weight'] += 1
                    else:
                        # NOTE: is_real records the label of the text that
                        # first produced this edge; later texts only bump
                        # the weight.
                        graph.add_edge(
                            source,
                            target,
                            weight=1,
                            is_real=is_real,
                        )
        except Exception as e:
            print(f"Error updating knowledge graph: {str(e)}")

    def save_knowledge_graph(self, filename=None):
        """Save the knowledge graph to a file.

        The graph is pickled as a plain dict
        ``{'nodes': {node: attrs}, 'edges': {u: {v: attrs}}}``.

        Args:
            filename: Destination path.  Defaults to a timestamped
                ``knowledge_graph_<YYYYmmdd_HHMMSS>.pkl`` inside
                ``self.model_dir``.

        Returns:
            The path written, or ``None`` if the directory could not be
            created or the file could not be written (the error is printed).
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = os.path.join(self.model_dir, f"knowledge_graph_{timestamp}.pkl")

        # Convert the graph to a dictionary format for better serialization.
        graph_data = {
            'nodes': dict(self.knowledge_graph.nodes(data=True)),
            'edges': {},
        }
        for u, v, data in self.knowledge_graph.edges(data=True):
            graph_data['edges'].setdefault(u, {})[v] = data

        try:
            # Create the directory the file actually lives in; an explicit
            # filename may point outside self.model_dir.
            target_dir = os.path.dirname(filename)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            with open(filename, 'wb') as f:
                pickle.dump(graph_data, f)
            print(f"Knowledge graph saved to {filename}")
            print(f"Total nodes: {len(graph_data['nodes'])}")
            print(f"Total edges: {sum(len(edges) for edges in graph_data['edges'].values())}")
            return filename
        except Exception as e:
            print(f"Error saving knowledge graph: {str(e)}")
            return None

    def get_graph_statistics(self):
        """Get basic statistics about the knowledge graph.

        Returns:
            A dict with ``total_nodes``, ``total_edges``, ``entity_types``
            (node count per ``type`` attribute, ``'UNKNOWN'`` when missing)
            and ``reliability_scores`` (``real_count / (real_count +
            fake_count)`` for every node whose counters sum to more than 0).
        """
        graph = self.knowledge_graph
        stats = {
            'total_nodes': graph.number_of_nodes(),
            'total_edges': graph.number_of_edges(),
            'entity_types': {},
            'reliability_scores': {},
        }
        type_counts = stats['entity_types']
        scores = stats['reliability_scores']

        for node, attrs in graph.nodes(data=True):
            entity_type = attrs.get('type', 'UNKNOWN')
            type_counts[entity_type] = type_counts.get(entity_type, 0) + 1

            real_count = attrs.get('real_count', 0)
            total = real_count + attrs.get('fake_count', 0)
            # Nodes without any counted mention get no score (avoids 0/0).
            if total > 0:
                scores[node] = real_count / total
        return stats
def main():
    """Build a knowledge graph from ``./combined.csv``, save it, print stats.

    Each row's ``text`` column is passed to the builder, and the row is
    treated as real when its ``label`` column equals ``'REAL'``.  A row that
    raises is reported and skipped.
    """
    kg_builder = KnowledgeGraphBuilder()

    # Source dataset; replace with your actual data file.
    frame = pd.read_csv('./combined.csv')

    print("Building knowledge graph...")
    total_rows = len(frame)
    for idx, record in frame.iterrows():
        try:
            body = record['text']
            labelled_real = record['label'] == 'REAL'
            kg_builder.update_knowledge_graph(body, labelled_real)

            # Progress report every 100 rows.
            done = idx + 1
            if done % 100 == 0:
                percent = done / total_rows * 100
                print(f"Processed {done}/{total_rows} entries ({percent:.1f}%)...")
        except Exception as e:
            print(f"Error processing row {idx}: {str(e)}")

    # Persist the graph; the builder prints the path or the error itself.
    kg_builder.save_knowledge_graph()

    summary = kg_builder.get_graph_statistics()
    print("\nKnowledge Graph Statistics:")
    print(f"Total nodes: {summary['total_nodes']}")
    print(f"Total edges: {summary['total_edges']}")
    print("\nEntity types distribution:")
    for entity_type, count in summary['entity_types'].items():
        print(f"{entity_type}: {count}")
# Run the build only when this file is executed as a script.
if __name__ == "__main__":
    main()