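"""Gradio app that builds a knowledge graph from a PDF.

Pipeline: extract text from the PDF, pull a (subject, object) entity pair and a
relation out of each sentence with spaCy's dependency parse, then render the
resulting triples as a directed graph with networkx/matplotlib.
"""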
import gradio as gr  # Web interface
import fitz  # PyMuPDF, for PDF text extraction
import spacy  # Natural language processing
import spacy.cli  # For downloading spaCy models programmatically
import pandas as pd  # Data manipulation
from spacy.matcher import Matcher  # Rule-based matching over parsed tokens
import networkx as nx  # Graph construction and manipulation
import matplotlib.pyplot as plt  # Graph rendering
from tqdm import tqdm  # Progress bars
try:
    nlp = spacy.load('en_core_web_sm')  # Try to load the small English pipeline
except OSError:
    print("Model not found. Downloading...")  # Fall back to downloading it first
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')
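# The first run may fetch en_core_web_sm over the network; later runs load the
# locally installed model directly.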
def read_pdf(file):  # Collect the non-empty lines of text from a PDF
    doc = fitz.open(file)  # Open the PDF file
    text = []  # Accumulate one entry per line of text
    for page in doc:  # Iterate over pages
        for line in page.get_text("text").split('\n'):  # get_text returns the page as one string; split into lines
            if len(line) > 0:  # Skip empty lines
                text.append(line)
    return pd.DataFrame(text, columns=["sentence"])  # One candidate sentence per row
def read_csv(file):  # Read candidate sentences from a CSV file
    candidate_sentences = pd.read_csv(file)  # Use the given file rather than a hardcoded local path
    return candidate_sentences  # Return the DataFrame
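# Note: read_csv is not wired into the interface below; if used, the CSV is
# assumed to contain a "sentence" column, matching the frame read_pdf returns.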
def get_entities(sent):  # Extract a (subject, object) pair from a sentence via dependency tags
    ent1 = ""  # First entity (subject side)
    ent2 = ""  # Second entity (object side)
    prv_tok_dep = ""  # Dependency tag of the previous token
    prv_tok_text = ""  # Text of the previous token
    prefix = ""  # Compound words collected ahead of an entity
    modifier = ""  # Modifier words collected ahead of an entity
    for tok in nlp(sent):  # For each token in the parsed sentence
        if tok.dep_ != "punct":  # Skip punctuation
            if tok.dep_ == "compound":  # Compound word: keep it as a prefix
                prefix = tok.text
                if prv_tok_dep == "compound":  # Chain consecutive compounds
                    prefix = prv_tok_text + " " + tok.text
            if tok.dep_.endswith("mod"):  # Modifier (e.g. amod, nmod)
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text
            if "subj" in tok.dep_:  # Subject token: assemble the first entity
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""  # Reset the accumulators for the object side
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""
            if "obj" in tok.dep_:  # Object token: assemble the second entity
                ent2 = modifier + " " + prefix + " " + tok.text
            prv_tok_dep = tok.dep_  # Remember this token for the next iteration
            prv_tok_text = tok.text
    return [ent1.strip(), ent2.strip()]  # Return the entity pair
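# Illustrative example (the exact output depends on the model's parse):
#   get_entities("the film had 200 patents")  ->  ['film', '200 patents']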
def get_relation(sent):  # Extract the main relation (predicate) of a sentence
    doc = nlp(sent)  # Parse the sentence
    matcher = Matcher(nlp.vocab)  # Rule-based matcher over the parsed tokens
    pattern = [{'DEP': 'ROOT'},  # Root verb of the sentence
               {'DEP': 'prep', 'OP': "?"},  # Optional preposition
               {'DEP': 'agent', 'OP': "?"},  # Optional agent (e.g. "by" in passives)
               {'POS': 'ADJ', 'OP': "?"}]  # Optional adjective
    matcher.add("matching_1", [pattern])  # Register the pattern
    matches = matcher(doc)  # Run the matcher over the document
    if matches:
        k = len(matches) - 1  # Take the last (longest) match
        span = doc[matches[k][1]:matches[k][2]]  # Slice the matched span
        return span.text
    else:
        return ""  # No match found
def execute_process(file, edge):  # Build and render the knowledge graph from an uploaded PDF
    candidate_sentences = read_pdf(file)  # Extract the candidate sentences
    entity_pairs = []  # Collect a (subject, object) pair per sentence
    for i in tqdm(candidate_sentences["sentence"]):
        entity_pairs.append(get_entities(i))
    relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # One relation per sentence
    source = [i[0] for i in entity_pairs]  # Subjects
    target = [i[1] for i in entity_pairs]  # Objects
    kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})  # Triples as a DataFrame
    edge_counts = kg_df['edge'].value_counts()  # Frequency of each relation
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})  # Relations and their counts
    if len(edge) == 0:  # No filter: graph every triple
        graph_df = kg_df
    else:  # Graph only the triples with the requested relation
        graph_df = kg_df[kg_df['edge'] == edge]
    G = nx.from_pandas_edgelist(graph_df, "source", "target",
                                edge_attr=True, create_using=nx.MultiDiGraph())  # Directed multigraph of the triples
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G)  # Force-directed node layout
    nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)
    plt.savefig("graph.png")  # Save the rendered graph as a PNG
    plt.close()  # Free the figure so repeated runs don't accumulate
    return "graph.png", unique_edges_df  # Return the PNG path and the edge-count table
inputs = [
    gr.File(label="Upload PDF"),  # PDF upload
    gr.Textbox(label="Graph a particular edge", type="text")  # Optional relation filter
]
outputs = [
    gr.Image(label="Generated graph"),  # Rendered knowledge graph
    gr.Dataframe(label="Unique edges", type="pandas")  # Relation counts table
]
description = 'This app generates a knowledge graph from a PDF document. It uses gradio for the web interface, spaCy for natural language processing, networkx and matplotlib for graph generation, and PyMuPDF (fitz) for PDF parsing. Optionally, filter the graph to a single relation ("edge") using the textbox.'
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description) # Create an interface
iface.launch() # Launch the interface