Chris Finlayson committed on
Commit
26f7059
·
1 Parent(s): ee9fa1c

Functional app

Files changed (2)
  1. app.py +121 -155
  2. graph.png +0 -0
app.py CHANGED
@@ -1,172 +1,138 @@
- import gradio as gr
- import os
- import fitz
- import re
- import spacy
- import spacy.cli
- import re
- import pandas as pd
- import bs4
- import requests
- import spacy
- from spacy import displacy
- nlp = spacy.load('en_core_web_sm')
- from spacy.matcher import Matcher
- from spacy.tokens import Span
- import networkx as nx
- import matplotlib.pyplot as plt
- from tqdm import tqdm

  try:
-     nlp = spacy.load('en_core_web_sm')
  except OSError:
-     print("Model not found. Downloading...")
-     spacy.cli.download("en_core_web_sm")
-     nlp = spacy.load('en_core_web_sm')
-
-
- # def read_pdf(file):
- #     doc = fitz.open(file)
- #     text = ""
- #     for page in doc:
- #         text += page.get_text("text").split('\n')
- #     return text
-
- def read_csv(file):
-     candidate_sentences = pd.read_csv("/Users/christopherfinlayson/wiki_sentences_v2.csv")
-     return candidate_sentences.shape
-
- def get_entities(sent):
-     ## chunk 1
-     ent1 = ""
-     ent2 = ""
-
-     prv_tok_dep = ""   # dependency tag of previous token in the sentence
-     prv_tok_text = ""  # previous token in the sentence
-
-     prefix = ""
-     modifier = ""
-
-     #############################################################
-
-     for tok in nlp(sent):
-         ## chunk 2
-         # if token is a punctuation mark then move on to the next token
-         if tok.dep_ != "punct":
-             # check: token is a compound word or not
-             if tok.dep_ == "compound":
-                 prefix = tok.text
-                 # if the previous word was also a 'compound' then add the current word to it
-                 if prv_tok_dep == "compound":
-                     prefix = prv_tok_text + " "+ tok.text

-             # check: token is a modifier or not
-             if tok.dep_.endswith("mod") == True:
-                 modifier = tok.text
-                 # if the previous word was also a 'compound' then add the current word to it
-                 if prv_tok_dep == "compound":
-                     modifier = prv_tok_text + " "+ tok.text

-             ## chunk 3
-             if tok.dep_.find("subj") == True:
-                 ent1 = modifier +" "+ prefix + " "+ tok.text
-                 prefix = ""
-                 modifier = ""
-                 prv_tok_dep = ""
-                 prv_tok_text = ""
-
-             ## chunk 4
-             if tok.dep_.find("obj") == True:
-                 ent2 = modifier +" "+ prefix +" "+ tok.text

-             ## chunk 5
-             # update variables
-             prv_tok_dep = tok.dep_
-             prv_tok_text = tok.text
-     #############################################################
-
-     return [ent1.strip(), ent2.strip()]
-
- def get_relation(sent):
-
-     doc = nlp(sent)
-
-     # Matcher class object
-     matcher = Matcher(nlp.vocab)
-
-     #define the pattern
-     pattern = [{'DEP':'ROOT'},
-                {'DEP':'prep','OP':"?"},
-                {'DEP':'agent','OP':"?"},
-                {'POS':'ADJ','OP':"?"}]
-
-     matcher.add("matching_1", [pattern])
-
-     matches = matcher(doc)
-     k = len(matches) - 1
-
-     span = doc[matches[k][1]:matches[k][2]]
-
-     return(span.text)
-
- def ulify(elements):
-     string = "<ul>\n"
-     string += "\n".join(["<li>" + str(s) + "</li>" for s in elements])
-     string += "\n</ul>"
-     return string
-
- def execute_process(file, edge):
-     # candidate_sentences = pd.DataFrame(read_pdf(file), columns=['Sentences'])
-     candidate_sentences = pd.read_csv(file)
-
-     entity_pairs = []
-     for i in tqdm(candidate_sentences["sentence"]):
-         entity_pairs.append(get_entities(i))
-     relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]
-     # extract subject
-     source = [i[0] for i in entity_pairs]
-
-     # extract object
-     target = [i[1] for i in entity_pairs]
-     kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

-     # create a variable of all unique edges
-     unique_edges = kg_df['edge'].unique() if kg_df['edge'].nunique() != 0 else None
-     # create a dataframe of all unique edges and their counts
-     edge_counts = kg_df['edge'].value_counts()
-     unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})

-     G=nx.from_pandas_edgelist(kg_df, "source", "target",
-                               edge_attr=True, create_using=nx.MultiDiGraph())

-     if edge is not None:
          G=nx.from_pandas_edgelist(kg_df[kg_df['edge']==edge], "source", "target",
-                                   edge_attr=True, create_using=nx.MultiDiGraph())
-         plt.figure(figsize=(12,12))
-         pos = nx.spring_layout(G)
-         nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
-         plt.savefig("graph.png")
-         # return "graph.png", "\n".join(unique_edges)
-         return "graph.png", unique_edges_df
-
-     else:
-         plt.figure(figsize=(12,12))
-         pos = nx.spring_layout(G, k = 0.5) # k regulates the distance between nodes
-         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
-         plt.savefig("graph.png")
-         # return "graph.png", "\n".join(unique_edges)
-         return "graph.png", unique_edges_df

  inputs = [
-     gr.File(label="Upload PDF"),
-     gr.Textbox(label="Graph a particular edge", type="text")
  ]

  outputs = [
-     gr.Image(label="Generated graph"),
-     gr.Dataframe(label="Unique edges", type="pandas")
  ]

- description = 'This app reads all text from a PDF document, and allows the user to generate a knowledge which illustrates concepts and relationships within'
- iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)
- iface.launch()
+ import gradio as gr # Importing gradio for creating web interface
+ import os # Importing os for operating system related tasks
+ import fitz # Importing fitz for PDF processing
+ import re # Importing re for regular expressions
+ import spacy # Importing spacy for natural language processing
+ import spacy.cli # Importing spacy's command line interface
+ import pandas as pd # Importing pandas for data manipulation
+ import bs4 # Importing bs4 for web scraping
+ import requests # Importing requests for making HTTP requests
+ from spacy import displacy # Importing displacy from spacy for visualizing NLP results
+ from spacy.matcher import Matcher # Importing Matcher from spacy for matching patterns
+ from spacy.tokens import Span # Importing Span from spacy for handling spans of tokens
+ import networkx as nx # Importing networkx for creating, manipulating, and studying graphs
+ import matplotlib.pyplot as plt # Importing matplotlib.pyplot for data visualization
+ from tqdm import tqdm # Importing tqdm for progress bars
+
+ nlp = spacy.load('en_core_web_sm') # Loading the English model

  try:
+     nlp = spacy.load('en_core_web_sm') # Trying to load the English model
  except OSError:
+     print("Model not found. Downloading...") # If model not found, print a message
+     spacy.cli.download("en_core_web_sm") # Download the English model
+     nlp = spacy.load('en_core_web_sm') # Load the English model
+
+ def read_pdf(file): # Define a function to read a PDF file
+     doc = fitz.open(file) # Open the PDF file
+     text = [] # Initialize an empty list to store the text
+     for page in doc: # For each page in the document
+         for sentence in page.get_text("text").split('\n'): # For each sentence in the page
+             if len(sentence) > 0: # If the sentence is not empty
+                 text.append(sentence) # Append the sentence to the list
+     return pd.DataFrame(text, columns=["sentence"]) # Return a DataFrame of the sentences
+
+ def read_csv(file): # Define a function to read a CSV file
+     candidate_sentences = pd.read_csv("/Users/christopherfinlayson/wiki_sentences_v2.csv") # Read the CSV file
+     return candidate_sentences # Return the DataFrame
+
+ def get_entities(sent): # Define a function to get entities from a sentence
+     ent1 = "" # Initialize an empty string for the first entity
+     ent2 = "" # Initialize an empty string for the second entity
+
+     prv_tok_dep = "" # Initialize an empty string for the dependency tag of the previous token
+     prv_tok_text = "" # Initialize an empty string for the previous token
+
+     prefix = "" # Initialize an empty string for the prefix
+     modifier = "" # Initialize an empty string for the modifier
+
+     for tok in nlp(sent): # For each token in the sentence
+         if tok.dep_ != "punct": # If the token is not a punctuation mark
+             if tok.dep_ == "compound": # If the token is a compound word
+                 prefix = tok.text # Set the prefix to the token text
+                 if prv_tok_dep == "compound": # If the previous token was also a compound word
+                     prefix = prv_tok_text + " "+ tok.text # Add the current word to the prefix

+             if tok.dep_.endswith("mod") == True: # If the token is a modifier
+                 modifier = tok.text # Set the modifier to the token text
+                 if prv_tok_dep == "compound": # If the previous token was also a compound word
+                     modifier = prv_tok_text + " "+ tok.text # Add the current word to the modifier

+             if tok.dep_.find("subj") == True: # If the token is a subject
+                 ent1 = modifier +" "+ prefix + " "+ tok.text # Set the first entity to the modifier, prefix, and token text
+                 prefix = "" # Reset the prefix
+                 modifier = "" # Reset the modifier
+                 prv_tok_dep = "" # Reset the dependency tag of the previous token
+                 prv_tok_text = "" # Reset the previous token
+
+             if tok.dep_.find("obj") == True: # If the token is an object
+                 ent2 = modifier +" "+ prefix +" "+ tok.text # Set the second entity to the modifier, prefix, and token text

+             prv_tok_dep = tok.dep_ # Update the dependency tag of the previous token
+             prv_tok_text = tok.text # Update the previous token
+
+     return [ent1.strip(), ent2.strip()] # Return the entities
+
+ def get_relation(sent): # Define a function to get the relation from a sentence
+     doc = nlp(sent) # Process the sentence
+     matcher = Matcher(nlp.vocab) # Initialize a Matcher with the vocabulary
+     pattern = [{'DEP':'ROOT'},
+                {'DEP':'prep','OP':"?"},
+                {'DEP':'agent','OP':"?"},
+                {'POS':'ADJ','OP':"?"}] # Define a pattern
+     matcher.add("matching_1", [pattern]) # Add the pattern to the matcher
+     matches = matcher(doc) # Match the pattern in the document
+     if matches: # If there are matches
+         k = len(matches) - 1 # Get the index of the last match
+         span = doc[matches[k][1]:matches[k][2]] # Get the span of the match
+         return span.text # Return the text of the span
+     else:
+         return "" # If there are no matches, return an empty string

+ def execute_process(file, edge): # Define a function to execute the process
+     candidate_sentences = read_pdf(file) # Read the PDF file

+     entity_pairs = [] # Initialize an empty list for the entity pairs
+     for i in tqdm(candidate_sentences["sentence"]): # For each sentence in the DataFrame
+         entity_pairs.append(get_entities(i)) # Append the entities to the list
+     relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])] # Get the relations for each sentence
+
+     source = [i[0] for i in entity_pairs] # Extract the subjects
+     target = [i[1] for i in entity_pairs] # Extract the objects
+     kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations}) # Create a DataFrame of the sources, targets, and edges
+
+     unique_edges = kg_df['edge'].unique() if kg_df['edge'].nunique() != 0 else None # Get the unique edges
+     edge_counts = kg_df['edge'].value_counts() # Get the counts of the edges
+     unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values}) # Create a DataFrame of the unique edges and their counts

+     if len(edge)==0: # If no edge is specified
+         G=nx.from_pandas_edgelist(kg_df, "source", "target",
+                                   edge_attr=True, create_using=nx.MultiDiGraph()) # Create a graph from the DataFrame
+         plt.figure(figsize=(12,12)) # Create a figure
+         pos = nx.spring_layout(G) # Get the positions of the nodes
+         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos) # Draw the graph
+         plt.savefig("graph.png") # Save the graph as a PNG
+         return "graph.png", unique_edges_df # Return the path to the PNG and the DataFrame of unique edges
+
+     else: # If an edge is specified
          G=nx.from_pandas_edgelist(kg_df[kg_df['edge']==edge], "source", "target",
+                                   edge_attr=True, create_using=nx.MultiDiGraph()) # Create a graph from the DataFrame with the specified edge
+         plt.figure(figsize=(12,12)) # Create a figure
+         pos = nx.spring_layout(G) # Get the positions of the nodes
+         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos) # Draw the graph
+         plt.savefig("graph.png") # Save the graph as a PNG
+         return "graph.png", unique_edges_df # Return the path to the PNG and the DataFrame of unique edges

  inputs = [
+     gr.File(label="Upload PDF"), # Create a file input for uploading a PDF
+     gr.Textbox(label="Graph a particular edge", type="text") # Create a textbox input for specifying an edge
  ]

  outputs = [
+     gr.Image(label="Generated graph"), # Create an image output for the generated graph
+     gr.Dataframe(label="Unique edges", type="pandas") # Create a DataFrame output for the unique edges
  ]

+ description = 'This Python script generates a knowledge graph from a PDF document. It uses several libraries including gradio for the web interface, spacy for natural language processing, networkx and matplotlib for graph generation, and fitz for PDF processing.'
+ iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description) # Create an interface
+ iface.launch() # Launch the interface
graph.png CHANGED
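
For reference only, and not part of the commit: a minimal sketch of how the committed get_entities and get_relation helpers could be exercised outside the Gradio interface. It assumes those two functions are already defined in the session exactly as in app.py above (importing app.py directly would also launch the interface) and that the en_core_web_sm model is installed; the sample sentences and the example_graph.png file name are illustrative assumptions.

# Sketch: run the commit's extraction helpers on a couple of made-up sentences
# and build the same source/target/edge DataFrame that execute_process draws.
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

sentences = [
    "the film was directed by john smith",       # hypothetical sample sentence
    "the company was founded by two engineers",  # hypothetical sample sentence
]

pairs = [get_entities(s) for s in sentences]   # [subject, object] guess per sentence
edges = [get_relation(s) for s in sentences]   # ROOT-based relation per sentence

kg_df = pd.DataFrame({
    "source": [p[0] for p in pairs],
    "target": [p[1] for p in pairs],
    "edge": edges,
})
print(kg_df)

# Same drawing calls as execute_process, saved under a different file name.
G = nx.from_pandas_edgelist(kg_df, "source", "target",
                            edge_attr=True, create_using=nx.MultiDiGraph())
nx.draw(G, with_labels=True, node_color='skyblue', pos=nx.spring_layout(G))
plt.savefig("example_graph.png")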