import gradio as gr  # Web interface
import fitz  # PyMuPDF, for PDF text extraction
import spacy  # Natural language processing
import spacy.cli  # For downloading spaCy models
import pandas as pd  # Data manipulation
from spacy.matcher import Matcher  # Rule-based pattern matching over tokens
import networkx as nx  # Graph construction
import matplotlib.pyplot as plt  # Graph rendering
from tqdm import tqdm  # Progress bars

try:
    nlp = spacy.load('en_core_web_sm')  # Load the small English pipeline
except OSError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_sm")  # Fetch the model, then load it
    nlp = spacy.load('en_core_web_sm')

def read_pdf(file):  # Extract text from a PDF, one line per row
    doc = fitz.open(file)  # Open the PDF file
    text = []
    for page in doc:
        # Splitting the page text on newlines yields lines rather than true sentences
        for sentence in page.get_text("text").split('\n'):
            if len(sentence) > 0:  # Skip empty lines
                text.append(sentence)
    return pd.DataFrame(text, columns=["sentence"])
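
# A rough usage sketch (assuming a hypothetical local file "document.pdf"):
#   df = read_pdf("document.pdf")
#   df.head()  # One extracted line of text per row, in a "sentence" column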

def read_csv(file):  # Alternative input path: read pre-extracted sentences from a CSV
    candidate_sentences = pd.read_csv(file)  # Read the uploaded file, not a hardcoded local path
    return candidate_sentences  # Note: not wired into the Gradio interface below

def get_entities(sent):  # Extract a (subject, object) pair from a sentence
    ent1 = ""  # First entity (subject)
    ent2 = ""  # Second entity (object)

    prv_tok_dep = ""  # Dependency tag of the previous token
    prv_tok_text = ""  # Text of the previous token

    prefix = ""  # Accumulates compound words
    modifier = ""  # Accumulates modifiers

    for tok in nlp(sent):  # For each token in the parsed sentence
        if tok.dep_ != "punct":  # Skip punctuation
            if tok.dep_ == "compound":
                prefix = tok.text
                if prv_tok_dep == "compound":  # Chain consecutive compounds
                    prefix = prv_tok_text + " " + tok.text

            if tok.dep_.endswith("mod"):  # Modifier (e.g. amod, nummod)
                modifier = tok.text
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            if "subj" in tok.dep_:  # Subject (e.g. nsubj, nsubjpass)
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""  # Reset the accumulators for the next entity
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

            if "obj" in tok.dep_:  # Object (e.g. dobj, pobj)
                ent2 = modifier + " " + prefix + " " + tok.text

            prv_tok_dep = tok.dep_  # Remember this token for the next iteration
            prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]  # Strip the padding introduced above
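
# A rough usage sketch (exact output depends on the spaCy model version):
#   get_entities("John completed the task")
#   => ['John', 'task']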

def get_relation(sent):  # Define a function to get the relation from a sentence
    doc = nlp(sent)  # Process the sentence
    matcher = Matcher(nlp.vocab)  # Initialize a Matcher with the vocabulary
    # Match the root verb, optionally followed by a preposition, an agent, and an adjective
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]
    matcher.add("matching_1", [pattern])  # Register the pattern
    matches = matcher(doc)  # Match the pattern in the document
    if matches:  # If there are matches
        k = len(matches) - 1  # Get the index of the last match
        span = doc[matches[k][1]:matches[k][2]]  # Get the span of the match
        return span.text  # Return the text of the span
    else:
        return ""  # If there are no matches, return an empty string

def execute_process(file, edge):  # Build the knowledge graph and render it to PNG
    candidate_sentences = read_pdf(file)  # One line of text per row

    entity_pairs = []
    for i in tqdm(candidate_sentences["sentence"]):  # Subject/object pair per sentence
        entity_pairs.append(get_entities(i))
    relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]  # Relation per sentence

    source = [i[0] for i in entity_pairs]  # Subjects
    target = [i[1] for i in entity_pairs]  # Objects
    kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})

    edge_counts = kg_df['edge'].value_counts()  # Frequency of each relation
    unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

    # If an edge is specified, graph only that relation; otherwise graph everything
    plot_df = kg_df if len(edge) == 0 else kg_df[kg_df['edge'] == edge]
    G = nx.from_pandas_edgelist(plot_df, "source", "target",
                                edge_attr=True, create_using=nx.MultiDiGraph())
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G)  # Force-directed node layout
    nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500,
            edge_cmap=plt.cm.Blues, pos=pos)
    plt.savefig("graph.png")
    plt.close()  # Release the figure so repeated calls don't accumulate memory
    return "graph.png", unique_edges_df
    
inputs = [
    gr.File(label="Upload PDF"),  # Create a file input for uploading a PDF
    gr.Textbox(label="Graph a particular edge", type="text")  # Create a textbox input for specifying an edge
]

outputs = [
    gr.Image(label="Generated graph"),  # Create an image output for the generated graph
    gr.Dataframe(label="Unique edges", type="pandas")  # Create a DataFrame output for the unique edges
]

description = ('Generates a knowledge graph from a PDF document. Uses gradio for the web interface, '
               'spacy for natural language processing, networkx and matplotlib for graph generation, '
               'and fitz (PyMuPDF) for PDF processing.')
iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)  # Create an interface
iface.launch()  # Launch the interface