import gradio as gr                # Web interface
import os                          # Filesystem helpers (output directory creation)
import fitz                        # PyMuPDF, for PDF text extraction
import spacy                       # Natural language processing
import spacy.cli                   # Programmatic model download
import pandas as pd                # Data manipulation
from spacy.matcher import Matcher  # Rule-based pattern matching over tokens
import networkx as nx              # Graph construction
from tqdm import tqdm              # Progress bars
try:
    nlp = spacy.load('en_core_web_sm')  # Try to load the small English model
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")  # Download the model on first run
    nlp = spacy.load('en_core_web_sm')
def read_pdf(file):
    """Extract non-empty lines of text from a PDF into a one-column DataFrame."""
    # gradio may hand over a file path or a tempfile-like object, depending on version
    path = file.name if hasattr(file, "name") else file
    doc = fitz.open(path)
    text = []
    for page in doc:
        for sentence in page.get_text("text").split('\n'):
            if len(sentence) > 0:  # Drop blank lines
                text.append(sentence)
    return pd.DataFrame(text, columns=["sentence"])
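# Illustrative usage (hypothetical file name): read_pdf("paper.pdf") yields a
# one-column DataFrame where each row is one non-empty line of extracted text,
# which the pipeline below treats as a candidate "sentence".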
def read_csv(file):
    """Alternative loader: read pre-split sentences from a CSV file."""
    candidate_sentences = pd.read_csv(file)  # Read the uploaded file rather than a hard-coded path
    return candidate_sentences
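# read_csv is an alternative loader for a CSV that already contains one
# sentence per row (e.g. a "sentence" column); it is not wired into the
# gradio interface below, which only accepts PDF uploads.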
def get_entities(sent):
    """Extract a (subject, object) pair from a sentence using dependency tags."""
    ent1 = ""          # First entity (subject side)
    ent2 = ""          # Second entity (object side)
    prv_tok_dep = ""   # Dependency tag of the previous token
    prv_tok_text = ""  # Text of the previous token
    prefix = ""        # Compound words preceding an entity
    modifier = ""      # Modifier words preceding an entity

    for tok in nlp(sent):
        if tok.dep_ != "punct":  # Ignore punctuation
            if tok.dep_ == "compound":  # Part of a compound noun
                prefix = tok.text
                if prv_tok_dep == "compound":  # Chain consecutive compounds
                    prefix = prv_tok_text + " " + tok.text
            if tok.dep_.endswith("mod"):  # Modifier (amod, nmod, ...)
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text
            if "subj" in tok.dep_:  # Subject token: build the first entity
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""
            if "obj" in tok.dep_:  # Object token: build the second entity
                ent2 = modifier + " " + prefix + " " + tok.text

            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]
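# Illustrative behaviour (the exact output depends on the spaCy model version,
# so treat the expected pair as indicative rather than exact):
#   get_entities("the film had 200 patents")  ->  ["film", "200 patents"]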
def get_relation(sent):
    """Extract the main relation (predicate) of a sentence via a ROOT-based pattern."""
    doc = nlp(sent)
    matcher = Matcher(nlp.vocab)
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]  # ROOT verb, optionally followed by prep/agent/adjective
    matcher.add("matching_1", [pattern])
    matches = matcher(doc)
    if matches:
        k = len(matches) - 1  # Use the last (longest) match
        span = doc[matches[k][1]:matches[k][2]]
        return span.text
    return ""  # No match found
def render_graph(G):
    """Render a networkx graph as interactive pyvis HTML, embedded in an iframe."""
    from pyvis.network import Network

    graph_output_directory = "./docs/index.html"
    os.makedirs(os.path.dirname(graph_output_directory), exist_ok=True)  # Ensure ./docs exists

    net = Network(
        notebook=False,
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        filter_menu=False,
    )
    net.from_nx(G)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)  # Force-directed layout
    net.show_buttons(filter_=["physics"])
    net.save_graph(graph_output_directory)  # Write the HTML without opening a browser

    with open(graph_output_directory, 'r') as file:
        html_content = file.read()
    html_content = html_content.replace("'", "&#39;")  # Escape single quotes so srcdoc='...' stays intact

    iframe = f"""<iframe style="width: 100%; height: 480px" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{html_content}'></iframe>"""
    return iframe
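# Design note: returning the pyvis page through the iframe's srcdoc attribute
# keeps the visualisation self-contained inside the gradio output, so
# ./docs/index.html never needs to be served separately; the sandbox attributes
# grant only the capabilities the embedded scripts need.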
def execute_process(file, edge):
    """Build (and cache) the knowledge graph for a PDF, then render it."""
    candidate_sentences = read_pdf(file)

    # Recompute only when there is no cached result or a different file was uploaded
    if 'kg_df' not in globals() or 'kg_file' not in globals() or file != globals()['kg_file']:
        entity_pairs = []
        for i in tqdm(candidate_sentences["sentence"]):  # Subject/object pair per sentence
            entity_pairs.append(get_entities(i))
        relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # Predicate per sentence
        source = [i[0] for i in entity_pairs]  # Subjects
        target = [i[1] for i in entity_pairs]  # Objects
        globals()['kg_df'] = pd.DataFrame({'source': source, 'target': target, 'edge': relations})
        globals()['kg_file'] = file  # Remember which file the cache belongs to

    kg_df = globals()['kg_df']
    edge_counts = kg_df['edge'].value_counts()  # Frequency of each relation
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

    if not edge:  # No filter: graph every extracted triple
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    else:  # Graph only triples whose relation matches the requested edge
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph())
    iframe = render_graph(G)
    return iframe, unique_edges_df  # Embedded graph HTML plus the edge-frequency table
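# The returned unique_edges_df doubles as a legend: it lists every extracted
# relation with its frequency, which is how a user discovers valid values for
# the "Graph a particular edge" textbox.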
inputs = [
    gr.File(label="Upload PDF"),  # PDF upload
    gr.Textbox(label="Graph a particular edge", type="text")  # Optional relation filter
]
outputs = [
    gr.HTML(label="Generated graph"),  # Embedded interactive graph
    gr.Dataframe(label="Unique edges", type="pandas")  # Relation frequency table
]
description = ('This Python script generates a knowledge graph from a PDF document. It uses several libraries, '
               'including gradio for the web interface, spacy for natural language processing, networkx and pyvis '
               'for graph generation, and fitz (PyMuPDF) for PDF processing.')
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs,
                     title="PDF - NLP Knowledge graph - Interactive", description=description)
iface.launch()
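# For demos, gradio can expose a temporary public URL with iface.launch(share=True);
# launch() also accepts server_name/server_port arguments for self-hosting.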