Spaces:
Running
Running
johaunh
committed on
Commit
·
3273f67
1
Parent(s):
34ecdc2
Add streamlit app
Browse files- config/validation.yml +8 -8
- src/words2wisdom/cli.py +19 -9
- src/words2wisdom/gui.py +17 -14
- src/words2wisdom/gui_streamlit.py +186 -0
- src/words2wisdom/pipeline.py +8 -8
- src/words2wisdom/validate.py +1 -1
- writeup/words2wisdom_poster.pdf +0 -0
config/validation.yml
CHANGED
@@ -6,19 +6,19 @@ instruction: >
|
|
6 |
an object entity o (format: [s, r, o]). We emphasize that the order of these
|
7 |
components is significant; the subject s relates to the object o via the
|
8 |
relation r. That is, the relation points from the subject to the object. Our
|
9 |
-
AI agent extracts a collection of triplets from each passage provided.
|
10 |
-
|
11 |
questions:
|
12 |
Q1:
|
13 |
title: Specificity (subject entity).
|
14 |
-
text:
|
15 |
additional:
|
16 |
options:
|
17 |
- 1 = Specific
|
18 |
- 0 = Not specific
|
19 |
Q2:
|
20 |
title: Specificity (object entity).
|
21 |
-
text:
|
22 |
additional:
|
23 |
options:
|
24 |
- 1 = Specific
|
@@ -54,8 +54,8 @@ example:
|
|
54 |
system).
|
55 |
triplet: "['organ', 'such as', 'heart']"
|
56 |
answers:
|
57 |
-
Q1: 1 - The entity 'organ' is specific and
|
58 |
-
Q2: 1 - The entity 'heart' is specific and
|
59 |
-
Q3:
|
60 |
Q4: 1 - The triplet is relatively important as it is used as a parenthetical example in the passage.
|
61 |
-
Q5: 0 - The provided triplet is unclear as is. "Organ such as heart"
|
|
|
6 |
an object entity o (format: [s, r, o]). We emphasize that the order of these
|
7 |
components is significant; the subject s relates to the object o via the
|
8 |
relation r. That is, the relation points from the subject to the object. Our
|
9 |
+
AI agent extracts a collection of triplets from each passage provided. Please
|
10 |
+
answer the following questions for each triplet provided.
|
11 |
questions:
|
12 |
Q1:
|
13 |
title: Specificity (subject entity).
|
14 |
+
text: Is the subject entity a suitable glossary term for a textbook?
|
15 |
additional:
|
16 |
options:
|
17 |
- 1 = Specific
|
18 |
- 0 = Not specific
|
19 |
Q2:
|
20 |
title: Specificity (object entity).
|
21 |
+
text: Is the object entity a suitable glossary term for a textbook?
|
22 |
additional:
|
23 |
options:
|
24 |
- 1 = Specific
|
|
|
54 |
system).
|
55 |
triplet: "['organ', 'such as', 'heart']"
|
56 |
answers:
|
57 |
+
Q1: 1 - The entity 'organ' is specific and would likely appear in a textbook glossary.
|
58 |
+
Q2: 1 - The entity 'heart' is specific and would likely appear in a textbook glossary.
|
59 |
+
Q3: 0 - The relation 'such as' is unclear. A better relation would be 'superclass of'.
|
60 |
Q4: 1 - The triplet is relatively important as it is used as a parenthetical example in the passage.
|
61 |
+
Q5: 0 - The provided triplet is unclear as is. "Organ such as heart" does not make sense.
|
src/words2wisdom/cli.py
CHANGED
@@ -16,21 +16,25 @@ default_config_path = os.path.join(CONFIG_DIR, "default_config.ini")
|
|
16 |
def main():
|
17 |
parser = argparse.ArgumentParser(
|
18 |
prog="words2wisdom",
|
19 |
-
|
20 |
)
|
21 |
subparsers = parser.add_subparsers(dest="command",
|
22 |
help="Available commands")
|
23 |
|
24 |
# init
|
25 |
parser_init = subparsers.add_parser("init",
|
26 |
-
|
27 |
-
|
|
|
28 |
parser_init.set_defaults(func=get_default_config)
|
29 |
|
30 |
# gui
|
31 |
parser_gui = subparsers.add_parser("gui",
|
32 |
-
help="
|
33 |
-
description="
|
|
|
|
|
|
|
34 |
parser_gui.set_defaults(func=gui)
|
35 |
|
36 |
# run
|
@@ -39,10 +43,11 @@ def main():
|
|
39 |
description="Generate a knowledge graph from a given text using OpenAI LLMs")
|
40 |
parser_run.add_argument("text",
|
41 |
help="Path to text file")
|
42 |
-
parser_run.add_argument("--config",
|
43 |
help="Path to config.ini file",
|
44 |
default=default_config_path)
|
45 |
-
parser_run.add_argument("--output-dir",
|
|
|
46 |
help="Path to save outputs to",
|
47 |
default=OUTPUT_DIR)
|
48 |
parser_run.set_defaults(func=run)
|
@@ -68,8 +73,13 @@ def get_default_config(args):
|
|
68 |
|
69 |
|
70 |
def gui(args):
|
71 |
-
"""Run
|
72 |
-
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
|
75 |
def run(args):
|
|
|
16 |
def main():
|
17 |
parser = argparse.ArgumentParser(
|
18 |
prog="words2wisdom",
|
19 |
+
description="Knowledge graph generation utilities using OpenAI LLMs"
|
20 |
)
|
21 |
subparsers = parser.add_subparsers(dest="command",
|
22 |
help="Available commands")
|
23 |
|
24 |
# init
|
25 |
parser_init = subparsers.add_parser("init",
|
26 |
+
usage="words2wisdom init [> PATH/TO/WRITE/CONFIG.INI]",
|
27 |
+
help="Initialize a template config.ini file",
|
28 |
+
description="Initialize a template config.ini file. Redirect to a new file using the '>' symbol.")
|
29 |
parser_init.set_defaults(func=get_default_config)
|
30 |
|
31 |
# gui
|
32 |
parser_gui = subparsers.add_parser("gui",
|
33 |
+
help="Use Words2Wisdom via Gradio interface",
|
34 |
+
description="use Words2Wisdom using Gradio interface")
|
35 |
+
parser_gui.add_argument("-s", "--streamlit",
|
36 |
+
action="store_true",
|
37 |
+
help="Use Streamlit GUI instead of Gradio GUI")
|
38 |
parser_gui.set_defaults(func=gui)
|
39 |
|
40 |
# run
|
|
|
43 |
description="Generate a knowledge graph from a given text using OpenAI LLMs")
|
44 |
parser_run.add_argument("text",
|
45 |
help="Path to text file")
|
46 |
+
parser_run.add_argument("-c", "--config",
|
47 |
help="Path to config.ini file",
|
48 |
default=default_config_path)
|
49 |
+
parser_run.add_argument("-o", "--output-dir",
|
50 |
+
metavar="OUTPUT_PATH",
|
51 |
help="Path to save outputs to",
|
52 |
default=OUTPUT_DIR)
|
53 |
parser_run.set_defaults(func=run)
|
|
|
73 |
|
74 |
|
75 |
def gui(args):
    """Launch the graphical interface in a child process.

    Args:
        args: Parsed CLI namespace. When ``args.streamlit`` is true the
            Streamlit app is started; otherwise the Gradio app is started.
    """
    import sys

    if args.streamlit:
        # Locate the Streamlit script next to this module so the command
        # works regardless of the directory the CLI is invoked from.
        script_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "gui_streamlit.py",
        )
        # sys.executable keeps the child on the same interpreter/environment.
        cmd = [sys.executable, "-m", "streamlit", "run", script_path]
    else:
        cmd = [sys.executable, "-m", "words2wisdom.gui"]

    subprocess.run(cmd)
|
83 |
|
84 |
|
85 |
def run(args):
|
src/words2wisdom/gui.py
CHANGED
@@ -1,18 +1,21 @@
|
|
1 |
import os
|
|
|
|
|
|
|
2 |
|
3 |
import gradio as gr
|
4 |
|
5 |
-
from
|
6 |
-
from .config import Config
|
7 |
-
from .pipeline import Pipeline
|
8 |
-
from .utils import dump_all
|
9 |
|
10 |
|
11 |
-
example_file = (os.path.join(ROOT, "demo", "
|
12 |
example_text = "The quick brown fox jumps over the lazy dog. The cat sits on the mat."
|
13 |
|
14 |
|
15 |
-
def
|
16 |
|
17 |
config = Config.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
|
18 |
config.llm["openai_api_key"] = openai_api_key
|
@@ -25,15 +28,15 @@ def text2kg_from_string(openai_api_key: str, input_text: str):
|
|
25 |
return knowledge_graph, zip_path
|
26 |
|
27 |
|
28 |
-
def
|
29 |
with open(input_file.name) as f:
|
30 |
input_text = f.read()
|
31 |
|
32 |
-
return
|
33 |
|
34 |
|
35 |
-
with gr.Blocks(title="
|
36 |
-
gr.Markdown("# π§π
|
37 |
|
38 |
with gr.Column(variant="panel"):
|
39 |
openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="sk-...", type="password")
|
@@ -64,7 +67,7 @@ with gr.Blocks(title="Text2KG") as demo:
|
|
64 |
examples=[[None, example_text]],
|
65 |
inputs=[openai_api_key, text_string],
|
66 |
outputs=[output_graph, output_zip],
|
67 |
-
fn=
|
68 |
preprocess=False,
|
69 |
postprocess=False
|
70 |
)
|
@@ -74,13 +77,13 @@ with gr.Blocks(title="Text2KG") as demo:
|
|
74 |
examples=[[None, example_file]],
|
75 |
inputs=[openai_api_key, text_file],
|
76 |
outputs=[output_graph, output_zip],
|
77 |
-
fn=
|
78 |
preprocess=False,
|
79 |
postprocess=False
|
80 |
)
|
81 |
|
82 |
-
submit_str.click(fn=
|
83 |
-
submit_file.click(fn=
|
84 |
|
85 |
|
86 |
demo.launch(inbrowser=True, width="75%")
|
|
|
1 |
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
|
5 |
|
6 |
import gradio as gr
|
7 |
|
8 |
+
from words2wisdom import CONFIG_DIR, ROOT
|
9 |
+
from words2wisdom.config import Config
|
10 |
+
from words2wisdom.pipeline import Pipeline
|
11 |
+
from words2wisdom.utils import dump_all
|
12 |
|
13 |
|
14 |
+
example_file = (os.path.join(ROOT, "demo", "example.txt"))
|
15 |
example_text = "The quick brown fox jumps over the lazy dog. The cat sits on the mat."
|
16 |
|
17 |
|
18 |
+
def w2w_from_string(openai_api_key: str, input_text: str):
|
19 |
|
20 |
config = Config.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
|
21 |
config.llm["openai_api_key"] = openai_api_key
|
|
|
28 |
return knowledge_graph, zip_path
|
29 |
|
30 |
|
31 |
+
def w2w_from_file(openai_api_key: str, input_file):
    """Read a text file and pass its contents to ``w2w_from_string``.

    Args:
        openai_api_key: OpenAI API key, forwarded unchanged.
        input_file: Object whose ``name`` attribute is the path of the text
            file to read.

    Returns:
        The result of ``w2w_from_string`` for the file's contents.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(input_file.name, encoding="utf-8") as f:
        input_text = f.read()

    return w2w_from_string(openai_api_key, input_text)
|
36 |
|
37 |
|
38 |
+
with gr.Blocks(title="Words2Wisdom") as demo:
|
39 |
+
gr.Markdown("# π§π Words2Wisdom")
|
40 |
|
41 |
with gr.Column(variant="panel"):
|
42 |
openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="sk-...", type="password")
|
|
|
67 |
examples=[[None, example_text]],
|
68 |
inputs=[openai_api_key, text_string],
|
69 |
outputs=[output_graph, output_zip],
|
70 |
+
fn=w2w_from_string,
|
71 |
preprocess=False,
|
72 |
postprocess=False
|
73 |
)
|
|
|
77 |
examples=[[None, example_file]],
|
78 |
inputs=[openai_api_key, text_file],
|
79 |
outputs=[output_graph, output_zip],
|
80 |
+
fn=w2w_from_file,
|
81 |
preprocess=False,
|
82 |
postprocess=False
|
83 |
)
|
84 |
|
85 |
+
submit_str.click(fn=w2w_from_string, inputs=[openai_api_key, text_string], outputs=[output_graph, output_zip])
|
86 |
+
submit_file.click(fn=w2w_from_file, inputs=[openai_api_key, text_file], outputs=[output_graph, output_zip])
|
87 |
|
88 |
|
89 |
demo.launch(inbrowser=True, width="75%")
|
src/words2wisdom/gui_streamlit.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
from zipfile import ZipFile
|
5 |
+
|
6 |
+
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
|
7 |
+
|
8 |
+
import pandas as pd
|
9 |
+
import streamlit as st
|
10 |
+
import streamlit.components.v1 as st_components
|
11 |
+
from pandas import DataFrame
|
12 |
+
from pyvis.network import Network
|
13 |
+
|
14 |
+
from words2wisdom import CONFIG_DIR
|
15 |
+
from words2wisdom.config import Config as W2WConfig
|
16 |
+
from words2wisdom.pipeline import Pipeline
|
17 |
+
|
18 |
+
|
19 |
+
def create_graph(df: DataFrame):
    """Render the triplets in ``df`` as an embedded, directed pyvis network.

    Args:
        df: Table with ``subject``, ``relation`` and ``object`` columns; each
            row is one triplet.

    Returns:
        The value returned by ``st_components.html`` for the rendered graph.
    """
    import tempfile

    graph = Network(directed=True)

    # Every distinct subject or object becomes one node.
    entities = pd.concat([df.subject, df.object]).unique()
    graph.add_nodes(entities, label=entities, title=entities)

    # One labelled edge per distinct (subject, relation, object) triplet.
    unique_triplets = df.drop_duplicates(
        subset=["subject", "relation", "object"]
    )
    for _, row in unique_triplets.iterrows():
        graph.add_edge(row.subject, row.object, label=row.relation)

    # Use a private temporary directory rather than a fixed /tmp path so
    # concurrent sessions cannot overwrite each other's file, and read the
    # result through a context manager so the handle is always closed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "graph.html")
        graph.save_graph(html_path)
        # No explicit encoding: the file is read back the same way as before.
        with open(html_path) as html_file:
            html = html_file.read()

    return st_components.html(html, height=625)
|
40 |
+
|
41 |
+
|
42 |
+
@st.cache_data
|
43 |
+
def create_zip_bytes(file_contents):
    """Pack a mapping of file names to contents into an in-memory ZIP archive.

    Args:
        file_contents: Mapping from archive member name to the data written
            for that member.

    Returns:
        The bytes of the finished archive.
    """
    archive = io.BytesIO()
    with ZipFile(archive, 'w') as zf:
        for member_name in file_contents:
            zf.writestr(member_name, file_contents[member_name])
    # Read the buffer only after the archive has been closed and finalised.
    return archive.getvalue()
|
49 |
+
|
50 |
+
|
51 |
+
st.set_page_config(page_title="Words2Wisdom",
|
52 |
+
page_icon="π")
|
53 |
+
st.title("π Words2Wisdom")
|
54 |
+
st.write("Generate knowledge graphs from unstructured text using GPT.")
|
55 |
+
|
56 |
+
# parameters
|
57 |
+
with st.sidebar:
|
58 |
+
st.title("Parameters")
|
59 |
+
|
60 |
+
st.write("The API Key is required. Feel free to customize the other parameters, if you'd like!")
|
61 |
+
|
62 |
+
openai_api_key = st.text_input(
|
63 |
+
label="π **OpenAI API Key**",
|
64 |
+
type="password",
|
65 |
+
help="Learn how to get your own [here](https://platform.openai.com/docs/api-reference/authentication)."
|
66 |
+
)
|
67 |
+
st.divider()
|
68 |
+
|
69 |
+
with st.expander("π° **Pipeline parameters**"):
|
70 |
+
|
71 |
+
formatter = lambda x: x.replace("_", " ").title()
|
72 |
+
|
73 |
+
words_per_batch = st.number_input(
|
74 |
+
label="Words per Batch",
|
75 |
+
min_value=0,
|
76 |
+
max_value=200,
|
77 |
+
value=150,
|
78 |
+
help="Batch text into paragraphs containing at least N words, if possible."
|
79 |
+
)
|
80 |
+
|
81 |
+
preprocess = st.selectbox(
|
82 |
+
label="Preprocess",
|
83 |
+
options=("None", "clause_deconstruction"),
|
84 |
+
index=1,
|
85 |
+
format_func=formatter,
|
86 |
+
help="Method for text simplification."
|
87 |
+
)
|
88 |
+
|
89 |
+
extraction = st.selectbox(
|
90 |
+
label="Generation",
|
91 |
+
options=("triplet_extraction",),
|
92 |
+
index=0,
|
93 |
+
format_func=formatter,
|
94 |
+
help="Method for KG generation."
|
95 |
+
)
|
96 |
+
|
97 |
+
with st.expander("π€ **LLM parameters**"):
|
98 |
+
model = st.selectbox(
|
99 |
+
label="Model",
|
100 |
+
options=("gpt-3.5-turbo",),
|
101 |
+
index=0,
|
102 |
+
help="ID of the model to use."
|
103 |
+
)
|
104 |
+
|
105 |
+
temperature = st.slider(
|
106 |
+
label="Temperature",
|
107 |
+
min_value=0.0,
|
108 |
+
max_value=2.0,
|
109 |
+
value=1.0,
|
110 |
+
step=0.1,
|
111 |
+
format="%.1f",
|
112 |
+
help=(
|
113 |
+
"What sampling temperature to use."
|
114 |
+
" Higher values will make the output more random;"
|
115 |
+
" lower values will make it more focused/deterministic."
|
116 |
+
)
|
117 |
+
)
|
118 |
+
|
119 |
+
|
120 |
+
# API Key warning
|
121 |
+
if not openai_api_key.startswith('sk-'):
|
122 |
+
st.warning('Please enter your OpenAI API key.', icon='β οΈ')
|
123 |
+
|
124 |
+
|
125 |
+
# tabs
|
126 |
+
tab1, tab2 = st.tabs(["Input Text", "File Upload"])
|
127 |
+
|
128 |
+
|
129 |
+
# text input tab
|
130 |
+
with tab1:
|
131 |
+
text1 = st.text_area(label="Enter text:")
|
132 |
+
submitted1 = tab1.button(label="Generate!", use_container_width=True)
|
133 |
+
|
134 |
+
|
135 |
+
# file upload tab
|
136 |
+
with tab2:
|
137 |
+
file2 = tab2.file_uploader(label="Upload text file:", type="txt")
|
138 |
+
submitted2 = tab2.button(key="filebtn", label="Generate!", use_container_width=True)
|
139 |
+
|
140 |
+
|
141 |
+
# w2w config
|
142 |
+
w2w_config = W2WConfig.read_ini(os.path.join(CONFIG_DIR, "default_config.ini"))
|
143 |
+
w2w_config.pipeline = {
|
144 |
+
"words_per_batch": words_per_batch,
|
145 |
+
"preprocess": [] if preprocess == "None" else [preprocess],
|
146 |
+
"extraction": extraction
|
147 |
+
}
|
148 |
+
w2w_config.llm["openai_api_key"] = openai_api_key
|
149 |
+
|
150 |
+
|
151 |
+
# main logic
|
152 |
+
# Run the pipeline once a "Generate!" button was pressed and an API key is set.
if (submitted1 or submitted2) and openai_api_key.startswith("sk-"):
    # Resolve the input text first so missing input is reported before any
    # pipeline work starts.
    if submitted1:
        text = text1
    elif file2 is not None:
        text = file2.read().decode()
    else:
        # "Generate!" was pressed on the upload tab without a file selected.
        text = ""

    if not text.strip():
        st.error("Please enter some text or upload a text file first.")
        st.stop()

    with st.status("Generating knowledge graph..."):
        st.write("Initializing pipeline...")
        pipe = Pipeline(w2w_config)
        st.write("Executing pipeline...")

        text_batches, knowledge_graph = pipe.run(text)
        st.write("Complete.")

    st.divider()

    kg_viz = create_graph(knowledge_graph)

    st.error("**Warning:** The page will refresh when you download the data!", icon="π¨")

    # Bundle the batches, the extracted triplets and the serialized pipeline
    # configuration into a single downloadable archive.
    download = st.download_button(
        label="Download data",
        data=create_zip_bytes({
            "text_batches.csv": (
                DataFrame(text_batches, columns=["text"])
                .to_csv(index_label="batch_id")
            ),
            "kg.csv": knowledge_graph.to_csv(index=False),
            "config.ini": pipe.serialize()
        }),
        file_name="output.zip",
        use_container_width=True,
        type="primary"
    )
|
186 |
+
|
src/words2wisdom/pipeline.py
CHANGED
@@ -23,7 +23,7 @@ PARSERS = {
|
|
23 |
|
24 |
|
25 |
class Module:
|
26 |
-
"""
|
27 |
def __init__(self, name: str) -> None:
|
28 |
self.name = name
|
29 |
self.parser = self.get_parser()
|
@@ -44,7 +44,7 @@ class Module:
|
|
44 |
|
45 |
|
46 |
class Pipeline:
|
47 |
-
"""
|
48 |
|
49 |
def __init__(self, config: Config):
|
50 |
|
@@ -57,7 +57,7 @@ class Pipeline:
|
|
57 |
|
58 |
|
59 |
def __repr__(self) -> str:
|
60 |
-
return f"
|
61 |
|
62 |
|
63 |
def __str__(self) -> str:
|
@@ -72,7 +72,7 @@ class Pipeline:
|
|
72 |
|
73 |
|
74 |
def initialize(self, config: Config):
|
75 |
-
"""Initialize
|
76 |
|
77 |
# validate preprocess
|
78 |
preprocess_modules = [Module(name) for name in config.pipeline["preprocess"]]
|
@@ -110,12 +110,12 @@ class Pipeline:
|
|
110 |
self.pipeline = {"text": self.pipeline} | chains[i]
|
111 |
|
112 |
# print pipeline
|
113 |
-
print("Initialized
|
114 |
print(str(self))
|
115 |
|
116 |
|
117 |
def run(self, text: str, clean=True) -> tuple[List[str], pd.DataFrame]:
|
118 |
-
"""Run
|
119 |
|
120 |
Args:
|
121 |
*texts (str): The text inputs
|
@@ -126,7 +126,7 @@ class Pipeline:
|
|
126 |
knowledge_graph (DataFrame): A dataframe containing the extracted KG triplets,
|
127 |
indexed by batch
|
128 |
"""
|
129 |
-
print("Running
|
130 |
# split text into batches
|
131 |
text_batches = list(partition_sentences(
|
132 |
sentences=sent_tokenize(text),
|
@@ -150,7 +150,7 @@ class Pipeline:
|
|
150 |
|
151 |
|
152 |
def _clean(self, kg: pd.DataFrame) -> pd.DataFrame:
|
153 |
-
"""
|
154 |
print("Cleaning knowledge graph components...", end=' ')
|
155 |
drop_list = []
|
156 |
|
|
|
23 |
|
24 |
|
25 |
class Module:
|
26 |
+
"""Words2Wisdom module class."""
|
27 |
def __init__(self, name: str) -> None:
|
28 |
self.name = name
|
29 |
self.parser = self.get_parser()
|
|
|
44 |
|
45 |
|
46 |
class Pipeline:
|
47 |
+
"""Words2Wisdom pipeline class."""
|
48 |
|
49 |
def __init__(self, config: Config):
|
50 |
|
|
|
57 |
|
58 |
|
59 |
def __repr__(self) -> str:
|
60 |
+
return f"Words2Wisdom(\n\tconfig.pipeline={self.config.pipeline}\n\tconfig.llm={self.config.llm}\n)"
|
61 |
|
62 |
|
63 |
def __str__(self) -> str:
|
|
|
72 |
|
73 |
|
74 |
def initialize(self, config: Config):
|
75 |
+
"""Initialize Words2Wisdom pipeline from config."""
|
76 |
|
77 |
# validate preprocess
|
78 |
preprocess_modules = [Module(name) for name in config.pipeline["preprocess"]]
|
|
|
110 |
self.pipeline = {"text": self.pipeline} | chains[i]
|
111 |
|
112 |
# print pipeline
|
113 |
+
print("Initialized Words2Wisdom pipeline:")
|
114 |
print(str(self))
|
115 |
|
116 |
|
117 |
def run(self, text: str, clean=True) -> tuple[List[str], pd.DataFrame]:
|
118 |
+
"""Run Words2Wisdom pipeline on passed text.
|
119 |
|
120 |
Args:
|
121 |
*texts (str): The text inputs
|
|
|
126 |
knowledge_graph (DataFrame): A dataframe containing the extracted KG triplets,
|
127 |
indexed by batch
|
128 |
"""
|
129 |
+
print("Running Words2Wisdom pipeline:")
|
130 |
# split text into batches
|
131 |
text_batches = list(partition_sentences(
|
132 |
sentences=sent_tokenize(text),
|
|
|
150 |
|
151 |
|
152 |
def _clean(self, kg: pd.DataFrame) -> pd.DataFrame:
|
153 |
+
"""Words2Wisdom post-processing."""
|
154 |
print("Cleaning knowledge graph components...", end=' ')
|
155 |
drop_list = []
|
156 |
|
src/words2wisdom/validate.py
CHANGED
@@ -37,7 +37,7 @@ def format_system_prompt():
|
|
37 |
+ " "
|
38 |
+ (question["additional"] + " " if question["additional"] else "")
|
39 |
+ "("
|
40 |
-
+ ";".join(question["options"])
|
41 |
+ ")\n"
|
42 |
)
|
43 |
return formatted
|
|
|
37 |
+ " "
|
38 |
+ (question["additional"] + " " if question["additional"] else "")
|
39 |
+ "("
|
40 |
+
+ "; ".join(question["options"])
|
41 |
+ ")\n"
|
42 |
)
|
43 |
return formatted
|
writeup/words2wisdom_poster.pdf
ADDED
Binary file (426 kB). View file
|
|