Spaces:
Sleeping
Sleeping
Merge branch 'main' of https://huggingface.co/spaces/jhatchett/Text2KG
Browse files- main.py +16 -0
- requirements.txt +1 -1
main.py
CHANGED
@@ -3,6 +3,7 @@ import re
|
|
3 |
import secrets
|
4 |
import string
|
5 |
import yaml
|
|
|
6 |
from datetime import datetime
|
7 |
from zipfile import ZipFile
|
8 |
|
@@ -10,6 +11,7 @@ import gradio as gr
|
|
10 |
import nltk
|
11 |
import pandas as pd
|
12 |
from langchain.embeddings import OpenAIEmbeddings
|
|
|
13 |
from langchain.chains import SimpleSequentialChain
|
14 |
from langchain.chat_models import ChatOpenAI
|
15 |
from nltk.tokenize import sent_tokenize
|
@@ -139,6 +141,14 @@ def extract_knowledge_graph(api_key: str, batch_size: int, modules: list[str], t
|
|
139 |
|
140 |
steps = []
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
for module in modules:
|
143 |
m = module.lower().replace(' ', '_')
|
144 |
steps.append(m)
|
@@ -148,10 +158,13 @@ def extract_knowledge_graph(api_key: str, batch_size: int, modules: list[str], t
|
|
148 |
|
149 |
pipeline.init(steps)
|
150 |
|
|
|
151 |
# split text into batches
|
152 |
sentences = sent_tokenize(text)
|
153 |
batches = [" ".join(sentences[i:i+batch_size])
|
154 |
for i in range(0, len(sentences), batch_size)]
|
|
|
|
|
155 |
|
156 |
# create KG
|
157 |
knowledge_graph = []
|
@@ -179,6 +192,9 @@ def extract_knowledge_graph(api_key: str, batch_size: int, modules: list[str], t
|
|
179 |
"_timestamp": now,
|
180 |
"batch_size": batch_size,
|
181 |
"modules": steps
|
|
|
|
|
|
|
182 |
}
|
183 |
|
184 |
# unique identifier for local saving
|
|
|
3 |
import secrets
|
4 |
import string
|
5 |
import yaml
|
6 |
+
import yaml
|
7 |
from datetime import datetime
|
8 |
from zipfile import ZipFile
|
9 |
|
|
|
11 |
import nltk
|
12 |
import pandas as pd
|
13 |
from langchain.embeddings import OpenAIEmbeddings
|
14 |
+
from langchain.embeddings import OpenAIEmbeddings
|
15 |
from langchain.chains import SimpleSequentialChain
|
16 |
from langchain.chat_models import ChatOpenAI
|
17 |
from nltk.tokenize import sent_tokenize
|
|
|
141 |
|
142 |
steps = []
|
143 |
|
144 |
+
for module in modules:
|
145 |
+
m = module.lower().replace(' ', '_')
|
146 |
+
steps.append(m)
|
147 |
+
|
148 |
+
if (len(steps) == 0) or (steps[-1] != "triplet_extraction"):
|
149 |
+
steps.append("triplet_extraction")
|
150 |
+
steps = []
|
151 |
+
|
152 |
for module in modules:
|
153 |
m = module.lower().replace(' ', '_')
|
154 |
steps.append(m)
|
|
|
158 |
|
159 |
pipeline.init(steps)
|
160 |
|
161 |
+
# split text into batches
|
162 |
# split text into batches
|
163 |
sentences = sent_tokenize(text)
|
164 |
batches = [" ".join(sentences[i:i+batch_size])
|
165 |
for i in range(0, len(sentences), batch_size)]
|
166 |
+
batches = [" ".join(sentences[i:i+batch_size])
|
167 |
+
for i in range(0, len(sentences), batch_size)]
|
168 |
|
169 |
# create KG
|
170 |
knowledge_graph = []
|
|
|
192 |
"_timestamp": now,
|
193 |
"batch_size": batch_size,
|
194 |
"modules": steps
|
195 |
+
"_timestamp": now,
|
196 |
+
"batch_size": batch_size,
|
197 |
+
"modules": steps
|
198 |
}
|
199 |
|
200 |
# unique identifier for local saving
|
requirements.txt
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
gradio==4.2.0
|
2 |
langchain==0.0.335
|
3 |
-
matplotlib==3.7.1
|
4 |
nltk==3.7
|
5 |
openai==0.27.4
|
6 |
pandas==2.0.3
|
7 |
PyYAML==6.0
|
|
|
8 |
tqdm==4.65.0
|
|
|
1 |
gradio==4.2.0
|
2 |
langchain==0.0.335
|
|
|
3 |
nltk==3.7
|
4 |
openai==0.27.4
|
5 |
pandas==2.0.3
|
6 |
PyYAML==6.0
|
7 |
+
scikit-learn==1.2.2
|
8 |
tqdm==4.65.0
|