|
import pandas as pd |
|
import networkx as nx |
|
import spacy |
|
import pickle |
|
from datetime import datetime |
|
import os |
|
|
|
|
|
# Name of the spaCy pipeline used for named-entity recognition.
_SPACY_MODEL_NAME = "en_core_web_sm"

# Loaded once at import time; KnowledgeGraphBuilder.extract_entities reads
# this module-level pipeline for every text it processes.
nlp = spacy.load(_SPACY_MODEL_NAME)
|
|
|
class KnowledgeGraphBuilder:
    """Accumulate named entities from labelled texts into a directed graph.

    Nodes are entity strings. Each node stores the entity label seen when
    the node was first created (``type``) plus two counters, ``real_count``
    and ``fake_count``. Edges connect entities mentioned in the same text
    and store a ``weight`` and an ``is_real`` flag.
    """

    def __init__(self, model_dir: str = "models"):
        """Create a builder with an empty graph.

        Args:
            model_dir: Directory used for graph files saved without an
                explicit filename.
        """
        self.model_dir = model_dir
        self.knowledge_graph = nx.DiGraph()

    def extract_entities(self, text) -> list:
        """Extract named entities from text using spaCy.

        Args:
            text: Value to analyse. ``None``, NaN and blank values yield no
                entities; any other value is converted with ``str()``.

        Returns:
            A list of ``(entity_text, entity_label)`` tuples. The list is
            empty when there is nothing to analyse or when processing
            fails (the failure is printed, not raised).
        """
        try:
            # pd.isna() also covers float NaN and pandas' own missing markers.
            if text is None or pd.isna(text):
                return []

            # str() handles numeric cells as well as ordinary strings.
            text = str(text).strip()
            if not text:
                return []

            return [(ent.text, ent.label_) for ent in nlp(text).ents]
        except Exception as e:
            # Best effort: one unprocessable text must not stop a whole run.
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
            return []

    def update_knowledge_graph(self, text, is_real: bool) -> None:
        """Update knowledge graph with entities and their relationships.

        Every entity mention adds one to the ``real_count`` (when
        ``is_real`` is true) or ``fake_count`` (otherwise) of its node.
        For every pair of mentions of two *different* entities a directed
        edge from the earlier to the later mention is created with
        ``weight=1`` or, if it already exists, has its ``weight`` increased
        by one. An edge's ``is_real`` attribute is set when the edge is
        created and is not changed afterwards.

        Errors are printed and swallowed so that a single bad text does not
        abort a batch run.

        Args:
            text: Raw text passed on to ``extract_entities``.
            is_real: Whether the text is labelled as real.
        """
        try:
            entities = self.extract_entities(text)
            if not entities:
                return

            graph = self.knowledge_graph
            counter_key = 'real_count' if is_real else 'fake_count'

            for entity, entity_type in entities:
                if entity not in graph:
                    # The label seen first becomes the node's permanent type.
                    graph.add_node(
                        entity,
                        type=entity_type,
                        real_count=0,
                        fake_count=0,
                    )
                graph.nodes[entity][counter_key] += 1

            for i, (entity1, _) in enumerate(entities):
                for entity2, _ in entities[i + 1:]:
                    if entity1 == entity2:
                        # A repeated mention of the same entity is not a
                        # relationship; without this guard it would add a
                        # self-loop edge to the graph.
                        continue
                    if graph.has_edge(entity1, entity2):
                        graph[entity1][entity2]['weight'] += 1
                    else:
                        graph.add_edge(
                            entity1,
                            entity2,
                            weight=1,
                            is_real=is_real,
                        )
        except Exception as e:
            print(f"Error updating knowledge graph: {str(e)}")

    def save_knowledge_graph(self, filename=None):
        """Save the knowledge graph to a file.

        The graph is pickled as a plain dict with two keys: ``'nodes'``
        maps each node to its attribute dict, and ``'edges'`` maps each
        source node to a dict of ``{target: edge attribute dict}``.

        Args:
            filename: Destination path. When ``None``, a file named
                ``knowledge_graph_<YYYYmmdd_HHMMSS>.pkl`` inside
                ``model_dir`` is used.

        Returns:
            The path that was written, or ``None`` when writing failed
            (the error is printed).
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = os.path.join(self.model_dir, f"knowledge_graph_{timestamp}.pkl")

        os.makedirs(self.model_dir, exist_ok=True)

        graph_data = {
            'nodes': dict(self.knowledge_graph.nodes(data=True)),
            'edges': {},
        }
        for u, v, data in self.knowledge_graph.edges(data=True):
            graph_data['edges'].setdefault(u, {})[v] = data

        try:
            # An explicit filename may point outside model_dir; make sure
            # its directory exists too instead of failing on open().
            parent_dir = os.path.dirname(filename)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(filename, 'wb') as f:
                pickle.dump(graph_data, f)
        except Exception as e:
            print(f"Error saving knowledge graph: {str(e)}")
            return None

        print(f"Knowledge graph saved to {filename}")
        print(f"Total nodes: {len(graph_data['nodes'])}")
        print(f"Total edges: {sum(len(edges) for edges in graph_data['edges'].values())}")
        return filename

    def get_graph_statistics(self) -> dict:
        """Get basic statistics about the knowledge graph.

        Returns:
            A dict with:
            ``'total_nodes'`` / ``'total_edges'``: sizes of the graph;
            ``'entity_types'``: number of nodes per ``type`` attribute
            (``'UNKNOWN'`` for nodes without one);
            ``'reliability_scores'``: for every node with at least one
            counted mention, ``real_count / (real_count + fake_count)``.
        """
        stats = {
            'total_nodes': self.knowledge_graph.number_of_nodes(),
            'total_edges': self.knowledge_graph.number_of_edges(),
            'entity_types': {},
            'reliability_scores': {},
        }

        for node, attrs in self.knowledge_graph.nodes(data=True):
            entity_type = attrs.get('type', 'UNKNOWN')
            stats['entity_types'][entity_type] = stats['entity_types'].get(entity_type, 0) + 1

            real_count = attrs.get('real_count', 0)
            fake_count = attrs.get('fake_count', 0)
            total = real_count + fake_count
            # Nodes without any counted mention get no score (avoids 0/0).
            if total > 0:
                stats['reliability_scores'][node] = real_count / total

        return stats
|
|
|
def main(csv_path='./combined.csv'):
    """Build a knowledge graph from a labelled CSV, save it and print stats.

    The CSV is read with pandas. Each row's ``text`` column is fed to the
    builder, and the row is treated as real when its ``label`` column
    equals the string ``'REAL'`` and as fake otherwise. Rows that raise
    are reported and skipped.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            ``./combined.csv``, the location that was previously
            hard-coded.
    """
    builder = KnowledgeGraphBuilder()

    df = pd.read_csv(csv_path)

    print("Building knowledge graph...")
    total_rows = len(df)
    # iterrows() yields index *labels*, which are not guaranteed to be
    # consecutive integers; progress is tracked with a positional counter.
    for processed, (idx, row) in enumerate(df.iterrows(), start=1):
        try:
            builder.update_knowledge_graph(row['text'], row['label'] == 'REAL')
        except Exception as e:
            # Best effort: report the offending row and keep going.
            print(f"Error processing row {idx}: {str(e)}")
            continue
        if processed % 100 == 0:
            print(f"Processed {processed}/{total_rows} entries ({processed/total_rows*100:.1f}%)...")

    # save_knowledge_graph() prints the saved path or the error itself.
    builder.save_knowledge_graph()

    stats = builder.get_graph_statistics()
    print("\nKnowledge Graph Statistics:")
    print(f"Total nodes: {stats['total_nodes']}")
    print(f"Total edges: {stats['total_edges']}")
    print("\nEntity types distribution:")
    for entity_type, count in stats['entity_types'].items():
        print(f"{entity_type}: {count}")


if __name__ == "__main__":
    main()