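"""Generate an interactive knowledge graph from a PDF document.

Sentences are extracted with PyMuPDF, subject-object pairs and their
connecting relations are pulled out with spaCy, and the resulting graph
is built with networkx and rendered with pyvis inside a Gradio interface.
"""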
import gradio as gr # Gradio for the web interface
import os # os for filesystem tasks, e.g. creating the output directory
import fitz # PyMuPDF (fitz) for PDF processing
import spacy # spaCy for natural language processing
import spacy.cli # spaCy's command line interface, used to download models
import pandas as pd # pandas for data manipulation
import networkx as nx # networkx for building and manipulating graphs
from spacy.matcher import Matcher # Matcher for rule-based pattern matching
from pyvis.network import Network # pyvis for interactive graph rendering
from tqdm import tqdm # tqdm for progress bars
try:
nlp = spacy.load('en_core_web_sm') # Trying to load the English model
except OSError:
print("Model not found. Downloading...") # If model not found, print a message
spacy.cli.download("en_core_web_sm") # Download the English model
nlp = spacy.load('en_core_web_sm') # Load the English model
def read_pdf(file): # Define a function to read a PDF file
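    """Read a PDF and return its non-empty text lines as a one-column DataFrame of candidate sentences."""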
doc = fitz.open(file) # Open the PDF file
text = [] # Initialize an empty list to store the text
for page in doc: # For each page in the document
        for sentence in page.get_text("text").split('\n'): # Treat each extracted line as a candidate sentence
            if len(sentence) > 0: # Skip empty lines
                text.append(sentence) # Append the line to the list
return pd.DataFrame(text, columns=["sentence"]) # Return a DataFrame of the sentences
def read_csv(file): # Define a function to read a CSV file
    candidate_sentences = pd.read_csv(file) # Read the uploaded CSV rather than a hard-coded local path
return candidate_sentences # Return the DataFrame
def get_entities(sent): # Define a function to get entities from a sentence
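    """Extract a [subject, object] entity pair from a sentence using its dependency parse.

    Compound tokens and modifiers seen before the subject or object are
    folded into the entity text so multi-word entities survive extraction.
    """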
ent1 = "" # Initialize an empty string for the first entity
ent2 = "" # Initialize an empty string for the second entity
prv_tok_dep = "" # Initialize an empty string for the dependency tag of the previous token
prv_tok_text = "" # Initialize an empty string for the previous token
prefix = "" # Initialize an empty string for the prefix
modifier = "" # Initialize an empty string for the modifier
for tok in nlp(sent): # For each token in the sentence
if tok.dep_ != "punct": # If the token is not a punctuation mark
if tok.dep_ == "compound": # If the token is a compound word
prefix = tok.text # Set the prefix to the token text
                if prv_tok_dep == "compound": # If the previous token was also a compound word
                    prefix = prv_tok_text + " " + tok.text # Prepend the previous word to the prefix
            if tok.dep_.endswith("mod"): # If the token is a modifier
                modifier = tok.text # Set the modifier to the token text
                if prv_tok_dep == "compound": # If the previous token was also a compound word
                    modifier = prv_tok_text + " " + tok.text # Prepend the previous word to the modifier
            if "subj" in tok.dep_: # If the token is a subject (the original find() == True check only matched a substring index of 1)
                ent1 = modifier + " " + prefix + " " + tok.text # Build the first entity from modifier, prefix, and token text
prefix = "" # Reset the prefix
modifier = "" # Reset the modifier
prv_tok_dep = "" # Reset the dependency tag of the previous token
prv_tok_text = "" # Reset the previous token
if tok.dep_.find("obj") == True: # If the token is an object
ent2 = modifier +" "+ prefix +" "+ tok.text # Set the second entity to the modifier, prefix, and token text
prv_tok_dep = tok.dep_ # Update the dependency tag of the previous token
prv_tok_text = tok.text # Update the previous token
return [ent1.strip(), ent2.strip()] # Return the entities
def get_relation(sent): # Define a function to get the relation from a sentence
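    """Extract a sentence's relation: the ROOT token plus an optional preposition, agent, or adjective."""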
doc = nlp(sent) # Process the sentence
matcher = Matcher(nlp.vocab) # Initialize a Matcher with the vocabulary
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}] # Match the root token, optionally followed by a preposition, agent, or adjective
matcher.add("matching_1", [pattern]) # Add the pattern to the matcher
matches = matcher(doc) # Match the pattern in the document
if matches: # If there are matches
k = len(matches) - 1 # Get the index of the last match
span = doc[matches[k][1]:matches[k][2]] # Get the span of the match
return span.text # Return the text of the span
else:
return "" # If there are no matches, return an empty string
def render_graph(G):
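    """Render a networkx graph as an interactive pyvis network and return it as embeddable iframe HTML."""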
    graph_output_directory = "./docs/index.html" # Where pyvis writes the rendered HTML
    os.makedirs("./docs", exist_ok=True) # Make sure the output directory exists
    net = Network(
        notebook=False,
        # bgcolor="#1a1a1a",
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        # font_color="#cccccc",
        filter_menu=False,
    )
    net.from_nx(G) # Load the networkx graph into pyvis
    # net.repulsion(node_distance=150, spring_length=400)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31) # Lay the graph out with the ForceAtlas2 physics model
    # net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
    net.show_buttons(filter_=["physics"]) # Expose the physics controls in the rendered page
    net.show(graph_output_directory) # Write the interactive graph to disk
    with open(graph_output_directory, 'r') as file:
        html_content = file.read() # Read the generated HTML back in
    html_content = html_content.replace("'", "\"") # Swap quote styles so the HTML survives the single-quoted srcdoc attribute below
iframe = f"""<iframe style="width: 100%; height: 480px" name="result" allow="midi; geolocation; microphone; camera;
display-capture; encrypted-media;" sandbox="allow-modals allow-forms
allow-scripts allow-same-origin allow-popups
allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
allowpaymentrequest="" frameborder="0" srcdoc='{html_content}'></iframe>"""
return iframe
def execute_process(file, edge): # Define a function to execute the process
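    """Extract entity pairs and relations from the uploaded PDF (cached across calls) and return the rendered graph plus a table of edge counts."""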
candidate_sentences = read_pdf(file) # Read the PDF file
    if 'kg_df' not in globals() or 'file' not in globals() or file != globals()['file']: # Recompute only when nothing is cached or a different file was uploaded
entity_pairs = [] # Initialize an empty list for the entity pairs
for i in tqdm(candidate_sentences["sentence"]): # For each sentence in the DataFrame
entity_pairs.append(get_entities(i)) # Append the entities to the list
relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])] # Get the relations for each sentence
source = [i[0] for i in entity_pairs] # Extract the subjects
target = [i[1] for i in entity_pairs] # Extract the objects
globals()['kg_df'] = pd.DataFrame({'source':source, 'target':target, 'edge':relations}) # Create a DataFrame of the sources, targets, and edges
globals()['file'] = file # Persist the file into a global variable
    edge_counts = kg_df['edge'].value_counts() # Count how often each relation appears
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values}) # Create a DataFrame of the unique edges and their counts
    if len(edge) == 0: # If no edge is specified, graph every relation
        G = nx.from_pandas_edgelist(kg_df, "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph()) # Create a graph from the full DataFrame
        iframe = render_graph(G) # Render the interactive graph
        return iframe, unique_edges_df # Return the embedded graph and the DataFrame of unique edges
    else: # If an edge is specified, graph only that relation
        G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
                                    edge_attr=True, create_using=nx.MultiDiGraph()) # Create a graph filtered to the specified edge
        iframe = render_graph(G) # Render the interactive graph
        return iframe, unique_edges_df # Return the embedded graph and the DataFrame of unique edges
inputs = [
gr.File(label="Upload PDF"), # Create a file input for uploading a PDF
gr.Textbox(label="Graph a particular edge", type="text") # Create a textbox input for specifying an edge
]
outputs = [
    gr.HTML(label="Generated graph"), # Create an HTML output for the interactive graph
gr.Dataframe(label="Unique edges", type="pandas") # Create a DataFrame output for the unique edges
]
description = 'This app generates a knowledge graph from a PDF document. It uses gradio for the web interface, spacy for natural language processing, networkx and pyvis for graph construction and rendering, and fitz (PyMuPDF) for PDF processing.'
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF - NLP Knowledge graph - Interactive", description=description) # Create an interface
iface.launch() # Launch the interface