johaunh committed on
Commit
3273f67
·
1 Parent(s): 34ecdc2

Add streamlit app

Browse files
config/validation.yml CHANGED
@@ -6,19 +6,19 @@ instruction: >
6
  an object entity o (format: [s, r, o]). We emphasize that the order of these
7
  components is significant; the subject s relates to the object o via the
8
  relation r. That is, the relation points from the subject to the object. Our
9
- AI agent extracts a collection of triplets from each passage provided. The
10
- evaluation task has 5 subtasks:
11
  questions:
12
  Q1:
13
  title: Specificity (subject entity).
14
- text: Does the subject entity represent a specific term/concept referenced in the passage?
15
  additional:
16
  options:
17
  - 1 = Specific
18
  - 0 = Not specific
19
  Q2:
20
  title: Specificity (object entity).
21
- text: Does the object entity represent a specific term/concept referenced in the passage?
22
  additional:
23
  options:
24
  - 1 = Specific
@@ -54,8 +54,8 @@ example:
54
  system).
55
  triplet: "['organ', 'such as', 'heart']"
56
  answers:
57
- Q1: 1 - The entity 'organ' is specific and is mentioned in the passage.
58
- Q2: 1 - The entity 'heart' is specific and is mentioned in the passage.
59
- Q3: 1 - The relation 'such as' is unclear. A better relation would be 'superclass of'.
60
  Q4: 1 - The triplet is relatively important as it is used as a parenthetical example in the passage.
61
- Q5: 0 - The provided triplet is unclear as is. "Organ such as heart" doesn't make sense.
 
6
  an object entity o (format: [s, r, o]). We emphasize that the order of these
7
  components is significant; the subject s relates to the object o via the
8
  relation r. That is, the relation points from the subject to the object. Our
9
+ AI agent extracts a collection of triplets from each passage provided. Please
10
+ answer the following questions for each triplet provided.
11
  questions:
12
  Q1:
13
  title: Specificity (subject entity).
14
+ text: Is the subject entity a suitable glossary term for a textbook?
15
  additional:
16
  options:
17
  - 1 = Specific
18
  - 0 = Not specific
19
  Q2:
20
  title: Specificity (object entity).
21
+ text: Is the object entity a suitable glossary term for a textbook?
22
  additional:
23
  options:
24
  - 1 = Specific
 
54
  system).
55
  triplet: "['organ', 'such as', 'heart']"
56
  answers:
57
+ Q1: 1 - The entity 'organ' is specific and would likely appear in a textbook glossary.
58
+ Q2: 1 - The entity 'heart' is specific and would likely appear in a textbook glossary.
59
+ Q3: 0 - The relation 'such as' is unclear. A better relation would be 'superclass of'.
60
  Q4: 1 - The triplet is relatively important as it is used as a parenthetical example in the passage.
61
+ Q5: 0 - The provided triplet is unclear as is. "Organ such as heart" does not make sense.
src/words2wisdom/cli.py CHANGED
@@ -16,21 +16,25 @@ default_config_path = os.path.join(CONFIG_DIR, "default_config.ini")
16
  def main():
17
  parser = argparse.ArgumentParser(
18
  prog="words2wisdom",
19
- #description="Generate a knowledge graph from a given text using OpenAI LLMs"
20
  )
21
  subparsers = parser.add_subparsers(dest="command",
22
  help="Available commands")
23
 
24
  # init
25
  parser_init = subparsers.add_parser("init",
26
- help="Return the default config.ini file",
27
- description="Return the default config.ini file")
 
28
  parser_init.set_defaults(func=get_default_config)
29
 
30
  # gui
31
  parser_gui = subparsers.add_parser("gui",
32
- help="run Words2Wisdom using Gradio interface",
33
- description="run Words2Wisdom using Gradio interface")
 
 
 
34
  parser_gui.set_defaults(func=gui)
35
 
36
  # run
@@ -39,10 +43,11 @@ def main():
39
  description="Generate a knowledge graph from a given text using OpenAI LLMs")
40
  parser_run.add_argument("text",
41
  help="Path to text file")
42
- parser_run.add_argument("--config",
43
  help="Path to config.ini file",
44
  default=default_config_path)
45
- parser_run.add_argument("--output-dir",
 
46
  help="Path to save outputs to",
47
  default=OUTPUT_DIR)
48
  parser_run.set_defaults(func=run)
@@ -68,8 +73,13 @@ def get_default_config(args):
68
 
69
 
70
  def gui(args):
71
- """Run Gradio interface"""
72
- subprocess.run(["python", "-m", "text2kg.gui"])
 
 
 
 
 
73
 
74
 
75
  def run(args):
 
16
  def main():
17
  parser = argparse.ArgumentParser(
18
  prog="words2wisdom",
19
+ description="Knowledge graph generation utilities using OpenAI LLMs"
20
  )
21
  subparsers = parser.add_subparsers(dest="command",
22
  help="Available commands")
23
 
24
  # init
25
  parser_init = subparsers.add_parser("init",
26
+ usage="words2wisdom init [> PATH/TO/WRITE/CONFIG.INI]",
27
+ help="Initialize a template config.ini file",
28
+ description="Initialize a template config.ini file. Redirect to a new file using the '>' symbol.")
29
  parser_init.set_defaults(func=get_default_config)
30
 
31
  # gui
32
  parser_gui = subparsers.add_parser("gui",
33
+ help="Use Words2Wisdom via Gradio interface",
34
+ description="use Words2Wisdom using Gradio interface")
35
+ parser_gui.add_argument("-s", "--streamlit",
36
+ action="store_true",
37
+ help="Use Streamlit GUI instead of Gradio GUI")
38
  parser_gui.set_defaults(func=gui)
39
 
40
  # run
 
43
  description="Generate a knowledge graph from a given text using OpenAI LLMs")
44
  parser_run.add_argument("text",
45
  help="Path to text file")
46
+ parser_run.add_argument("-c", "--config",
47
  help="Path to config.ini file",
48
  default=default_config_path)
49
+ parser_run.add_argument("-o", "--output-dir",
50
+ metavar="OUTPUT_PATH",
51
  help="Path to save outputs to",
52
  default=OUTPUT_DIR)
53
  parser_run.set_defaults(func=run)
 
73
 
74
 
75
  def gui(args):
76
+ """Run interface"""
77
+ if args.streamlit:
78
+ cmd = "streamlit run words2wisdom/gui_streamlit.py".split()
79
+ else:
80
+ cmd = "python -m words2wisdom.gui".split()
81
+
82
+ subprocess.run(cmd)
83
 
84
 
85
  def run(args):
src/words2wisdom/gui.py CHANGED
@@ -1,18 +1,21 @@
1
  import os
 
 
 
2
 
3
  import gradio as gr
4
 
5
- from . import CONFIG_DIR, ROOT
6
- from .config import Config
7
- from .pipeline import Pipeline
8
- from .utils import dump_all
9
 
10
 
11
- example_file = (os.path.join(ROOT, "demo", "prokaryotes.txt"))
12
  example_text = "The quick brown fox jumps over the lazy dog. The cat sits on the mat."
13
 
14
 
15
- def text2kg_from_string(openai_api_key: str, input_text: str):
16
 
17
  config = Config.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
18
  config.llm["openai_api_key"] = openai_api_key
@@ -25,15 +28,15 @@ def text2kg_from_string(openai_api_key: str, input_text: str):
25
  return knowledge_graph, zip_path
26
 
27
 
28
- def text2kg_from_file(openai_api_key: str, input_file):
29
  with open(input_file.name) as f:
30
  input_text = f.read()
31
 
32
- return text2kg_from_string(openai_api_key, input_text)
33
 
34
 
35
- with gr.Blocks(title="Text2KG") as demo:
36
- gr.Markdown("# πŸ§žπŸ“– Text2KG")
37
 
38
  with gr.Column(variant="panel"):
39
  openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="sk-...", type="password")
@@ -64,7 +67,7 @@ with gr.Blocks(title="Text2KG") as demo:
64
  examples=[[None, example_text]],
65
  inputs=[openai_api_key, text_string],
66
  outputs=[output_graph, output_zip],
67
- fn=text2kg_from_string,
68
  preprocess=False,
69
  postprocess=False
70
  )
@@ -74,13 +77,13 @@ with gr.Blocks(title="Text2KG") as demo:
74
  examples=[[None, example_file]],
75
  inputs=[openai_api_key, text_file],
76
  outputs=[output_graph, output_zip],
77
- fn=text2kg_from_file,
78
  preprocess=False,
79
  postprocess=False
80
  )
81
 
82
- submit_str.click(fn=text2kg_from_string, inputs=[openai_api_key, text_string], outputs=[output_graph, output_zip])
83
- submit_file.click(fn=text2kg_from_file, inputs=[openai_api_key, text_file], outputs=[output_graph, output_zip])
84
 
85
 
86
  demo.launch(inbrowser=True, width="75%")
 
1
  import os
2
+ import sys
3
+
4
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
5
 
6
  import gradio as gr
7
 
8
+ from words2wisdom import CONFIG_DIR, ROOT
9
+ from words2wisdom.config import Config
10
+ from words2wisdom.pipeline import Pipeline
11
+ from words2wisdom.utils import dump_all
12
 
13
 
14
+ example_file = (os.path.join(ROOT, "demo", "example.txt"))
15
  example_text = "The quick brown fox jumps over the lazy dog. The cat sits on the mat."
16
 
17
 
18
+ def w2w_from_string(openai_api_key: str, input_text: str):
19
 
20
  config = Config.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
21
  config.llm["openai_api_key"] = openai_api_key
 
28
  return knowledge_graph, zip_path
29
 
30
 
31
def w2w_from_file(openai_api_key: str, input_file):
    """Read an uploaded text file and run it through `w2w_from_string`.

    Args:
        openai_api_key: Key forwarded unchanged to `w2w_from_string`.
        input_file: Upload object whose ``name`` attribute is the path of the
            file on disk.

    Returns:
        Whatever `w2w_from_string` returns for the file's text.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which differs between operating systems.
    with open(input_file.name, encoding="utf-8") as f:
        input_text = f.read()

    return w2w_from_string(openai_api_key, input_text)
36
 
37
 
38
+ with gr.Blocks(title="Words2Wisdom") as demo:
39
+ gr.Markdown("# πŸ§žπŸ“– Words2Wisdom")
40
 
41
  with gr.Column(variant="panel"):
42
  openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="sk-...", type="password")
 
67
  examples=[[None, example_text]],
68
  inputs=[openai_api_key, text_string],
69
  outputs=[output_graph, output_zip],
70
+ fn=w2w_from_string,
71
  preprocess=False,
72
  postprocess=False
73
  )
 
77
  examples=[[None, example_file]],
78
  inputs=[openai_api_key, text_file],
79
  outputs=[output_graph, output_zip],
80
+ fn=w2w_from_file,
81
  preprocess=False,
82
  postprocess=False
83
  )
84
 
85
+ submit_str.click(fn=w2w_from_string, inputs=[openai_api_key, text_string], outputs=[output_graph, output_zip])
86
+ submit_file.click(fn=w2w_from_file, inputs=[openai_api_key, text_file], outputs=[output_graph, output_zip])
87
 
88
 
89
  demo.launch(inbrowser=True, width="75%")
src/words2wisdom/gui_streamlit.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import sys
4
+ from zipfile import ZipFile
5
+
6
+ sys.path.append(os.path.dirname(os.path.dirname(__file__)))
7
+
8
+ import pandas as pd
9
+ import streamlit as st
10
+ import streamlit.components.v1 as st_components
11
+ from pandas import DataFrame
12
+ from pyvis.network import Network
13
+
14
+ from words2wisdom import CONFIG_DIR
15
+ from words2wisdom.config import Config as W2WConfig
16
+ from words2wisdom.pipeline import Pipeline
17
+
18
+
19
def create_graph(df: DataFrame):
    """Render the triplets in *df* as an interactive directed graph on the page.

    Args:
        df: Table with ``subject``, ``relation`` and ``object`` columns; each
            row is one edge pointing from subject to object.

    Returns:
        The value returned by ``st_components.html`` for the embedded graph.
    """
    import tempfile  # local import keeps this block self-contained

    graph = Network(directed=True)

    # Every distinct subject/object becomes a node labelled with its own text.
    entities = pd.concat([df.subject, df.object]).unique()
    graph.add_nodes(entities, label=entities, title=entities)

    # One edge per distinct (subject, relation, object) triplet.
    triplets = df.drop_duplicates(subset=["subject", "relation", "object"])
    for _, row in triplets.iterrows():
        graph.add_edge(row.subject, row.object, label=row.relation)

    # Write the HTML into a private temporary directory rather than a fixed
    # /tmp path: this works on every platform, cannot collide with another
    # session, and the file handle and the file itself are always cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "graph.html")
        graph.save_graph(html_path)
        # Read back with the default encoding, as the original did.
        with open(html_path) as html_file:
            html = html_file.read()

    return st_components.html(html, height=625)
40
+
41
+
42
@st.cache_data
def create_zip_bytes(file_contents):
    """Pack a ``{filename: content}`` mapping into an in-memory ZIP archive.

    Args:
        file_contents: Mapping from archive member name to that member's content.

    Returns:
        The bytes of the finished ZIP archive.
    """
    archive_buffer = io.BytesIO()
    with ZipFile(archive_buffer, mode="w") as archive:
        for member_name in file_contents:
            archive.writestr(member_name, file_contents[member_name])
    # The archive is finalized when the ``with`` block exits, so the buffer is complete here.
    return archive_buffer.getvalue()
49
+
50
+
51
# page header
st.set_page_config(page_icon="πŸ“–", page_title="Words2Wisdom")
st.title("πŸ“– Words2Wisdom")
st.write("Generate knowledge graphs from unstructured text using GPT.")


def formatter(x):
    """Turn an option identifier (e.g. "clause_deconstruction") into a display label."""
    return x.replace("_", " ").title()


# parameters
with st.sidebar:
    st.title("Parameters")
    st.write("The API Key is required. Feel free to customize the other parameters, if you'd like!")

    # the key is typed into a masked field
    openai_api_key = st.text_input(
        type="password",
        label="πŸ” **OpenAI API Key**",
        help="Learn how to get your own [here](https://platform.openai.com/docs/api-reference/authentication).",
    )
    st.divider()

    # pipeline settings
    with st.expander("🚰 **Pipeline parameters**"):
        words_per_batch = st.number_input(
            label="Words per Batch",
            value=150,
            min_value=0,
            max_value=200,
            help="Batch text into paragraphs containing at least N words, if possible.",
        )

        preprocess = st.selectbox(
            label="Preprocess",
            options=("None", "clause_deconstruction"),
            index=1,
            help="Method for text simplification.",
            format_func=formatter,
        )

        extraction = st.selectbox(
            label="Generation",
            options=("triplet_extraction",),
            index=0,
            help="Method for KG generation.",
            format_func=formatter,
        )

    # language-model settings
    with st.expander("πŸ€– **LLM parameters**"):
        model = st.selectbox(
            label="Model",
            options=("gpt-3.5-turbo",),
            index=0,
            help="ID of the model to use.",
        )

        temperature = st.slider(
            label="Temperature",
            value=1.0,
            min_value=0.0,
            max_value=2.0,
            step=0.1,
            format="%.1f",
            help=(
                "What sampling temperature to use."
                " Higher values will make the output more random;"
                " lower values will make it more focused/deterministic."
            ),
        )


# API Key warning: shown until the field holds something starting with "sk-"
key_looks_valid = openai_api_key.startswith("sk-")
if not key_looks_valid:
    st.warning("Please enter your OpenAI API key.", icon="⚠️")


# tabs
tab1, tab2 = st.tabs(["Input Text", "File Upload"])


# text input tab
with tab1:
    text1 = st.text_area(label="Enter text:")
    submitted1 = tab1.button(use_container_width=True, label="Generate!")


# file upload tab (explicit key because both tabs have a button with the same label)
with tab2:
    file2 = tab2.file_uploader(type="txt", label="Upload text file:")
    submitted2 = tab2.button(use_container_width=True, label="Generate!", key="filebtn")
139
+
140
+
141
# w2w config
# Start from the packaged defaults, then overlay the values chosen in the sidebar.
w2w_config = W2WConfig.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
w2w_config.pipeline = {
    "words_per_batch": words_per_batch,
    # the "None" option means "run no preprocessing step"
    "preprocess": [] if preprocess == "None" else [preprocess],
    "extraction": extraction,
}
w2w_config.llm["openai_api_key"] = openai_api_key
# The sidebar's model and temperature widgets were previously never written
# into the config, so changing them had no effect on the run.
# NOTE(review): assumes the llm section uses the keys "model" and
# "temperature" - confirm against default_config.ini and Pipeline.initialize.
w2w_config.llm["model"] = model
w2w_config.llm["temperature"] = temperature
149
+
150
+
151
# main logic
if (submitted1 or submitted2) and openai_api_key.startswith("sk-"):
    # Resolve the input text first, so a missing upload or an empty text box is
    # reported to the user instead of crashing on `None` or running the
    # pipeline on nothing.
    text = ""
    if submitted1:
        text = text1
    elif file2 is not None:
        try:
            # getvalue() returns the whole upload regardless of the read position
            text = file2.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            st.error("The uploaded file could not be decoded as UTF-8 text.", icon="🚨")

    if not text.strip():
        st.warning("Please enter some text or upload a UTF-8 text file before generating.", icon="⚠️")
    else:
        with st.status("Generating knowledge graph..."):
            st.write("Initializing pipeline...")
            pipe = Pipeline(w2w_config)
            st.write("Executing pipeline...")

            text_batches, knowledge_graph = pipe.run(text)
            st.write("Complete.")

        st.divider()

        # interactive rendering of the extracted triplets
        kg_viz = create_graph(knowledge_graph)

        st.error("**Warning:** The page will refresh when you download the data!", icon="🚨")

        # bundle the batches, the triplets and the effective config into one archive
        download = st.download_button(
            label="Download data",
            data=create_zip_bytes({
                "text_batches.csv": (
                    DataFrame(text_batches, columns=["text"])
                    .to_csv(index_label="batch_id")
                ),
                "kg.csv": knowledge_graph.to_csv(index=False),
                "config.ini": pipe.serialize()
            }),
            file_name="output.zip",
            use_container_width=True,
            type="primary"
        )
186
+
src/words2wisdom/pipeline.py CHANGED
@@ -23,7 +23,7 @@ PARSERS = {
23
 
24
 
25
  class Module:
26
- """Text2KG module class."""
27
  def __init__(self, name: str) -> None:
28
  self.name = name
29
  self.parser = self.get_parser()
@@ -44,7 +44,7 @@ class Module:
44
 
45
 
46
  class Pipeline:
47
- """Text2KG pipeline class."""
48
 
49
  def __init__(self, config: Config):
50
 
@@ -57,7 +57,7 @@ class Pipeline:
57
 
58
 
59
  def __repr__(self) -> str:
60
- return f"Text2KG(\n\tconfig.pipeline={self.config.pipeline}\n\tconfig.llm={self.config.llm}\n)"
61
 
62
 
63
  def __str__(self) -> str:
@@ -72,7 +72,7 @@ class Pipeline:
72
 
73
 
74
  def initialize(self, config: Config):
75
- """Initialize Text2KG pipeline from config."""
76
 
77
  # validate preprocess
78
  preprocess_modules = [Module(name) for name in config.pipeline["preprocess"]]
@@ -110,12 +110,12 @@ class Pipeline:
110
  self.pipeline = {"text": self.pipeline} | chains[i]
111
 
112
  # print pipeline
113
- print("Initialized Text2KG pipeline:")
114
  print(str(self))
115
 
116
 
117
  def run(self, text: str, clean=True) -> tuple[List[str], pd.DataFrame]:
118
- """Run Text2KG pipeline on passed text.
119
 
120
  Args:
121
  *texts (str): The text inputs
@@ -126,7 +126,7 @@ class Pipeline:
126
  knowledge_graph (DataFrame): A dataframe containing the extracted KG triplets,
127
  indexed by batch
128
  """
129
- print("Running Text2KG pipeline:")
130
  # split text into batches
131
  text_batches = list(partition_sentences(
132
  sentences=sent_tokenize(text),
@@ -150,7 +150,7 @@ class Pipeline:
150
 
151
 
152
  def _clean(self, kg: pd.DataFrame) -> pd.DataFrame:
153
- """Text2KG post-processing."""
154
  print("Cleaning knowledge graph components...", end=' ')
155
  drop_list = []
156
 
 
23
 
24
 
25
  class Module:
26
+ """Words2Wisdom module class."""
27
  def __init__(self, name: str) -> None:
28
  self.name = name
29
  self.parser = self.get_parser()
 
44
 
45
 
46
  class Pipeline:
47
+ """Words2Wisdom pipeline class."""
48
 
49
  def __init__(self, config: Config):
50
 
 
57
 
58
 
59
  def __repr__(self) -> str:
60
+ return f"Words2Wisdom(\n\tconfig.pipeline={self.config.pipeline}\n\tconfig.llm={self.config.llm}\n)"
61
 
62
 
63
  def __str__(self) -> str:
 
72
 
73
 
74
  def initialize(self, config: Config):
75
+ """Initialize Words2Wisdom pipeline from config."""
76
 
77
  # validate preprocess
78
  preprocess_modules = [Module(name) for name in config.pipeline["preprocess"]]
 
110
  self.pipeline = {"text": self.pipeline} | chains[i]
111
 
112
  # print pipeline
113
+ print("Initialized Words2Wisdom pipeline:")
114
  print(str(self))
115
 
116
 
117
  def run(self, text: str, clean=True) -> tuple[List[str], pd.DataFrame]:
118
+ """Run Words2Wisdom pipeline on passed text.
119
 
120
  Args:
121
  *texts (str): The text inputs
 
126
  knowledge_graph (DataFrame): A dataframe containing the extracted KG triplets,
127
  indexed by batch
128
  """
129
+ print("Running Words2Wisdom pipeline:")
130
  # split text into batches
131
  text_batches = list(partition_sentences(
132
  sentences=sent_tokenize(text),
 
150
 
151
 
152
  def _clean(self, kg: pd.DataFrame) -> pd.DataFrame:
153
+ """Words2Wisdom post-processing."""
154
  print("Cleaning knowledge graph components...", end=' ')
155
  drop_list = []
156
 
src/words2wisdom/validate.py CHANGED
@@ -37,7 +37,7 @@ def format_system_prompt():
37
  + " "
38
  + (question["additional"] + " " if question["additional"] else "")
39
  + "("
40
- + ";".join(question["options"])
41
  + ")\n"
42
  )
43
  return formatted
 
37
  + " "
38
  + (question["additional"] + " " if question["additional"] else "")
39
  + "("
40
+ + "; ".join(question["options"])
41
  + ")\n"
42
  )
43
  return formatted
writeup/words2wisdom_poster.pdf ADDED
Binary file (426 kB). View file