0.0.9 reference text term frequency
Browse files- app.py +26 -19
- graph.html +2 -2
app.py
CHANGED
@@ -73,10 +73,7 @@ f'''
|
|
73 |
f'''
|
74 |
#### How can <what most are doing> help with <what few are doing>?
|
75 |
''')
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
from llama_index import StorageContext
|
81 |
from llama_index import ServiceContext
|
82 |
from llama_index import load_index_from_storage
|
@@ -325,28 +322,37 @@ if(st.session_state.question):
|
|
325 |
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
|
326 |
import matplotlib.pyplot as plt
|
327 |
from PIL import Image
|
328 |
-
wc_all,
|
329 |
-
wordcloud = WordCloud(max_font_size=50, max_words=1000, background_color="white")
|
330 |
with wc_all:
|
331 |
-
#st.write('''### Corpus''')
|
332 |
image = Image.open('docs/images/all_papers_wordcloud.png')
|
333 |
st.image(image)
|
334 |
-
st.caption('''###### Corpus
|
335 |
-
with
|
336 |
-
|
337 |
-
st.image(
|
338 |
-
st.caption('''######
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
with explainable:
|
340 |
#st.write(answer.source_nodes)
|
341 |
from pyvis.network import Network
|
342 |
graph = Network(height="450px", width="100%")
|
343 |
sources_table = []
|
|
|
344 |
for nodewithscore in answer.source_nodes:
|
345 |
node = nodewithscore.node
|
346 |
from llama_index.schema import NodeRelationship
|
347 |
if NodeRelationship.SOURCE in node.relationships:
|
348 |
-
#st.write(node.relationships[NodeRelationship.SOURCE].node_id)
|
349 |
-
#st.write(node.text)
|
350 |
node_id = node.relationships[NodeRelationship.SOURCE].node_id
|
351 |
node_id = node_id.split('/')[-1]
|
352 |
title = node_id.split('.')[2].replace('_', ' ')
|
@@ -354,9 +360,9 @@ if(st.session_state.question):
|
|
354 |
link = f'https://arxiv.org/abs/{link}'
|
355 |
href = f'<a target="_blank" href="{link}">{title}</a>'
|
356 |
sources_table.extend([[href, node.text]])
|
|
|
357 |
else:
|
358 |
-
#st.write(node.
|
359 |
-
#st.write(node.text) TODO
|
360 |
rel_map = node.metadata['kg_rel_map']
|
361 |
for concept in rel_map.keys():
|
362 |
#st.write(concept)
|
@@ -365,13 +371,14 @@ if(st.session_state.question):
|
|
365 |
for rel in rels:
|
366 |
graph.add_node(rel[1], rel[1], title=rel[1])
|
367 |
graph.add_edge(concept, rel[1], title=rel[0])
|
|
|
368 |
st.session_state.graph_name = 'graph.html'
|
369 |
graph.save_graph(st.session_state.graph_name)
|
370 |
import streamlit.components.v1 as components
|
371 |
graphHtml = open(st.session_state.graph_name, 'r', encoding='utf-8')
|
372 |
source_code = graphHtml.read()
|
373 |
-
#print(source_code)
|
374 |
components.html(source_code, height = 500)
|
|
|
375 |
import pandas as pd
|
376 |
df = pd.DataFrame(sources_table)
|
377 |
df.columns = ['paper', 'relevant text']
|
@@ -381,8 +388,8 @@ if(st.session_state.question):
|
|
381 |
}
|
382 |
</style> """, unsafe_allow_html=True)
|
383 |
st.write(df.to_html(escape=False), unsafe_allow_html=True)
|
384 |
-
#
|
385 |
-
#st.
|
386 |
|
387 |
except Exception as e:
|
388 |
#print(f'{type(e)}, {e}')
|
|
|
73 |
f'''
|
74 |
#### How can <what most are doing> help with <what few are doing>?
|
75 |
''')
|
76 |
+
|
|
|
|
|
|
|
77 |
from llama_index import StorageContext
|
78 |
from llama_index import ServiceContext
|
79 |
from llama_index import load_index_from_storage
|
|
|
322 |
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
|
323 |
import matplotlib.pyplot as plt
|
324 |
from PIL import Image
|
325 |
+
wc_all, wc_question, wc_reference = st.columns([3, 3, 3])
|
326 |
+
wordcloud = WordCloud(max_font_size=50, max_words=1000, background_color="white")
|
327 |
with wc_all:
|
|
|
328 |
image = Image.open('docs/images/all_papers_wordcloud.png')
|
329 |
st.image(image)
|
330 |
+
st.caption('''###### Corpus term frequency.''')
|
331 |
+
with wc_question:
|
332 |
+
wordcloud_q = wordcloud.generate(answer_str)
|
333 |
+
st.image(wordcloud_q.to_array())
|
334 |
+
st.caption('''###### Answer term frequency.''')
|
335 |
+
with wc_reference:
|
336 |
+
all_reference_texts = ''
|
337 |
+
for nodewithscore in answer.source_nodes:
|
338 |
+
node = nodewithscore.node
|
339 |
+
from llama_index.schema import NodeRelationship
|
340 |
+
#if NodeRelationship.SOURCE in node.relationships:
|
341 |
+
all_reference_texts = all_reference_texts + '\n' + node.text
|
342 |
+
wordcloud_r = wordcloud.generate(all_reference_texts)
|
343 |
+
st.image(wordcloud_r.to_array())
|
344 |
+
st.caption('''###### Reference plus graph term frequency.''')
|
345 |
+
|
346 |
with explainable:
|
347 |
#st.write(answer.source_nodes)
|
348 |
from pyvis.network import Network
|
349 |
graph = Network(height="450px", width="100%")
|
350 |
sources_table = []
|
351 |
+
#all_reference_texts = ''
|
352 |
for nodewithscore in answer.source_nodes:
|
353 |
node = nodewithscore.node
|
354 |
from llama_index.schema import NodeRelationship
|
355 |
if NodeRelationship.SOURCE in node.relationships:
|
|
|
|
|
356 |
node_id = node.relationships[NodeRelationship.SOURCE].node_id
|
357 |
node_id = node_id.split('/')[-1]
|
358 |
title = node_id.split('.')[2].replace('_', ' ')
|
|
|
360 |
link = f'https://arxiv.org/abs/{link}'
|
361 |
href = f'<a target="_blank" href="{link}">{title}</a>'
|
362 |
sources_table.extend([[href, node.text]])
|
363 |
+
#all_reference_texts = all_reference_texts + '\n' + node.text
|
364 |
else:
|
365 |
+
#st.write(node.text) TODO second level relationships
|
|
|
366 |
rel_map = node.metadata['kg_rel_map']
|
367 |
for concept in rel_map.keys():
|
368 |
#st.write(concept)
|
|
|
371 |
for rel in rels:
|
372 |
graph.add_node(rel[1], rel[1], title=rel[1])
|
373 |
graph.add_edge(concept, rel[1], title=rel[0])
|
374 |
+
# --- display the query terms graph
|
375 |
st.session_state.graph_name = 'graph.html'
|
376 |
graph.save_graph(st.session_state.graph_name)
|
377 |
import streamlit.components.v1 as components
|
378 |
graphHtml = open(st.session_state.graph_name, 'r', encoding='utf-8')
|
379 |
source_code = graphHtml.read()
|
|
|
380 |
components.html(source_code, height = 500)
|
381 |
+
# --- display the reference texts table
|
382 |
import pandas as pd
|
383 |
df = pd.DataFrame(sources_table)
|
384 |
df.columns = ['paper', 'relevant text']
|
|
|
388 |
}
|
389 |
</style> """, unsafe_allow_html=True)
|
390 |
st.write(df.to_html(escape=False), unsafe_allow_html=True)
|
391 |
+
# reference text wordcloud
|
392 |
+
#st.session_state.reference_wcloud = all_reference_texts
|
393 |
|
394 |
except Exception as e:
|
395 |
#print(f'{type(e)}, {e}')
|
graph.html
CHANGED
@@ -88,8 +88,8 @@
|
|
88 |
|
89 |
|
90 |
// parsing and collecting nodes and edges from the python
|
91 |
-
nodes = new vis.DataSet([{"color": "#97c2fc", "id": "
|
92 |
-
edges = new vis.DataSet([{"from": "
|
93 |
|
94 |
nodeColors = {};
|
95 |
allNodes = nodes.get({ returnType: "Object" });
|
|
|
88 |
|
89 |
|
90 |
// parsing and collecting nodes and edges from the python
|
91 |
+
nodes = new vis.DataSet([{"color": "#97c2fc", "id": "TKM", "label": "TKM", "shape": "dot", "title": "TKM"}, {"color": "#97c2fc", "id": "decision-making", "label": "decision-making", "shape": "dot", "title": "decision-making"}, {"color": "#97c2fc", "id": "semantic web", "label": "semantic web", "shape": "dot", "title": "semantic web"}, {"color": "#97c2fc", "id": "ontologies", "label": "ontologies", "shape": "dot", "title": "ontologies"}, {"color": "#97c2fc", "id": "data", "label": "data", "shape": "dot", "title": "data"}, {"color": "#97c2fc", "id": "deep learning models", "label": "deep learning models", "shape": "dot", "title": "deep learning models"}, {"color": "#97c2fc", "id": "patient outcomes prediction", "label": "patient outcomes prediction", "shape": "dot", "title": "patient outcomes prediction"}, {"color": "#97c2fc", "id": "post-discharge readmissions", "label": "post-discharge readmissions", "shape": "dot", "title": "post-discharge readmissions"}, {"color": "#97c2fc", "id": "information", "label": "information", "shape": "dot", "title": "information"}, {"color": "#97c2fc", "id": "tabular and unstructured", "label": "tabular and unstructured", "shape": "dot", "title": "tabular and unstructured"}, {"color": "#97c2fc", "id": "enrich", "label": "enrich", "shape": "dot", "title": "enrich"}, {"color": "#97c2fc", "id": "BERT base tokenizer", "label": "BERT base tokenizer", "shape": "dot", "title": "BERT base tokenizer"}, {"color": "#97c2fc", "id": "BERT variant models", "label": "BERT variant models", "shape": "dot", "title": "BERT variant models"}, {"color": "#97c2fc", "id": "BERT variants in medical domain", "label": "BERT variants in medical domain", "shape": "dot", "title": "BERT variants in medical domain"}, {"color": "#97c2fc", "id": "approach", "label": "approach", "shape": "dot", "title": "approach"}, {"color": "#97c2fc", "id": "biomedical", "label": "biomedical", "shape": "dot", "title": "biomedical"}, {"color": "#97c2fc", "id": "multimodal", "label": 
"multimodal", "shape": "dot", "title": "multimodal"}, {"color": "#97c2fc", "id": "instruction", "label": "instruction", "shape": "dot", "title": "instruction"}, {"color": "#97c2fc", "id": "responses", "label": "responses", "shape": "dot", "title": "responses"}, {"color": "#97c2fc", "id": "visual", "label": "visual", "shape": "dot", "title": "visual"}, {"color": "#97c2fc", "id": "chat", "label": "chat", "shape": "dot", "title": "chat"}, {"color": "#97c2fc", "id": "medical", "label": "medical", "shape": "dot", "title": "medical"}, {"color": "#97c2fc", "id": "alignment", "label": "alignment", "shape": "dot", "title": "alignment"}, {"color": "#97c2fc", "id": "tuning", "label": "tuning", "shape": "dot", "title": "tuning"}, {"color": "#97c2fc", "id": "epochs", "label": "epochs", "shape": "dot", "title": "epochs"}, {"color": "#97c2fc", "id": "samples", "label": "samples", "shape": "dot", "title": "samples"}, {"color": "#97c2fc", "id": "hours", "label": "hours", "shape": "dot", "title": "hours"}, {"color": "#97c2fc", "id": "downstrea", "label": "downstrea", "shape": "dot", "title": "downstrea"}, {"color": "#97c2fc", "id": "in HKGs", "label": "in HKGs", "shape": "dot", "title": "in HKGs"}, {"color": "#97c2fc", "id": "HKGs", "label": "HKGs", "shape": "dot", "title": "HKGs"}, {"color": "#97c2fc", "id": "medical concepts", "label": "medical concepts", "shape": "dot", "title": "medical concepts"}, {"color": "#97c2fc", "id": "medical literature", "label": "medical literature", "shape": "dot", "title": "medical literature"}, {"color": "#97c2fc", "id": "clinical trials", "label": "clinical trials", "shape": "dot", "title": "clinical trials"}, {"color": "#97c2fc", "id": "patientgenerated data", "label": "patientgenerated data", "shape": "dot", "title": "patientgenerated data"}, {"color": "#97c2fc", "id": "medical entities", "label": "medical entities", "shape": "dot", "title": "medical entities"}, {"color": "#97c2fc", "id": "relationships", "label": "relationships", "shape": "dot", 
"title": "relationships"}, {"color": "#97c2fc", "id": "to structured format", "label": "to structured format", "shape": "dot", "title": "to structured format"}, {"color": "#97c2fc", "id": "entities and relationships", "label": "entities and relationships", "shape": "dot", "title": "entities and relationships"}, {"color": "#97c2fc", "id": "to chosen ontologies", "label": "to chosen ontologies", "shape": "dot", "title": "to chosen ontologies"}, {"color": "#97c2fc", "id": "PubMed", "label": "PubMed", "shape": "dot", "title": "PubMed"}, {"color": "#97c2fc", "id": "navigational queries", "label": "navigational queries", "shape": "dot", "title": "navigational queries"}, {"color": "#97c2fc", "id": "Field Sensor", "label": "Field Sensor", "shape": "dot", "title": "Field Sensor"}, {"color": "#97c2fc", "id": "removed articles", "label": "removed articles", "shape": "dot", "title": "removed articles"}, {"color": "#97c2fc", "id": "articles without titles or abstracts", "label": "articles without titles or abstracts", "shape": "dot", "title": "articles without titles or abstracts"}, {"color": "#97c2fc", "id": "predictions", "label": "predictions", "shape": "dot", "title": "predictions"}, {"color": "#97c2fc", "id": "classification", "label": "classification", "shape": "dot", "title": "classification"}, {"color": "#97c2fc", "id": "framing", "label": "framing", "shape": "dot", "title": "framing"}, {"color": "#97c2fc", "id": "records", "label": "records", "shape": "dot", "title": "records"}, {"color": "#97c2fc", "id": "features", "label": "features", "shape": "dot", "title": "features"}, {"color": "#97c2fc", "id": "utilizing", "label": "utilizing", "shape": "dot", "title": "utilizing"}, {"color": "#97c2fc", "id": "new paradigm", "label": "new paradigm", "shape": "dot", "title": "new paradigm"}, {"color": "#97c2fc", "id": "NLEKMD", "label": "NLEKMD", "shape": "dot", "title": "NLEKMD"}, {"color": "#97c2fc", "id": "making", "label": "making", "shape": "dot", "title": "making"}, 
{"color": "#97c2fc", "id": "difficult", "label": "difficult", "shape": "dot", "title": "difficult"}, {"color": "#97c2fc", "id": "widely used", "label": "widely used", "shape": "dot", "title": "widely used"}, {"color": "#97c2fc", "id": "decision", "label": "decision", "shape": "dot", "title": "decision"}, {"color": "#97c2fc", "id": "clinicians", "label": "clinicians", "shape": "dot", "title": "clinicians"}, {"color": "#97c2fc", "id": "remote", "label": "remote", "shape": "dot", "title": "remote"}]);
|
92 |
+
edges = new vis.DataSet([{"from": "TKM", "title": "is", "to": "decision-making"}, {"from": "TKM", "title": "is", "to": "semantic web"}, {"from": "TKM", "title": "is", "to": "ontologies"}, {"from": "TKM", "title": "implicit", "to": "data"}, {"from": "TKM", "title": "limitation", "to": "deep learning models"}, {"from": "TKM", "title": "shortage", "to": "patient outcomes prediction"}, {"from": "TKM", "title": "imbalance", "to": "post-discharge readmissions"}, {"from": "TKM", "title": "contains", "to": "information"}, {"from": "TKM", "title": "mix", "to": "tabular and unstructured"}, {"from": "TKM", "title": "and", "to": "enrich"}, {"from": "TKM", "title": "tokenized using", "to": "BERT base tokenizer"}, {"from": "TKM", "title": "IOB tagging scheme", "to": "BERT variant models"}, {"from": "TKM", "title": "CRF layer as tag encoding", "to": "BERT variants in medical domain"}, {"from": "TKM", "title": "augmentation", "to": "approach"}, {"from": "TKM", "title": "statistics", "to": "biomedical"}, {"from": "TKM", "title": "statistics", "to": "multimodal"}, {"from": "TKM", "title": "statistics", "to": "instruction"}, {"from": "TKM", "title": "statistics", "to": "responses"}, {"from": "TKM", "title": "statistics", "to": "visual"}, {"from": "TKM", "title": "statistics", "to": "chat"}, {"from": "TKM", "title": "statistics", "to": "medical"}, {"from": "TKM", "title": "statistics", "to": "alignment"}, {"from": "TKM", "title": "statistics", "to": "tuning"}, {"from": "TKM", "title": "statistics", "to": "epochs"}, {"from": "TKM", "title": "statistics", "to": "samples"}, {"from": "TKM", "title": "statistics", "to": "hours"}, {"from": "TKM", "title": "statistics", "to": "downstrea"}, {"from": "TKM", "title": "heterogeneity", "to": "in HKGs"}, {"from": "TKM", "title": "in", "to": "HKGs"}, {"from": "TKM", "title": "in", "to": "medical concepts"}, {"from": "TKM", "title": "gather", "to": "medical literature"}, {"from": "TKM", "title": "gather", "to": "clinical trials"}, {"from": "TKM", 
"title": "gather", "to": "patientgenerated data"}, {"from": "TKM", "title": "extract", "to": "medical entities"}, {"from": "TKM", "title": "extract", "to": "relationships"}, {"from": "TKM", "title": "transform", "to": "to structured format"}, {"from": "TKM", "title": "map", "to": "entities and relationships"}, {"from": "TKM", "title": "map", "to": "to chosen ontologies"}, {"from": "TKM", "title": "collection", "to": "PubMed"}, {"from": "TKM", "title": "collection", "to": "navigational queries"}, {"from": "TKM", "title": "collection", "to": "Field Sensor"}, {"from": "TKM", "title": "collection", "to": "removed articles"}, {"from": "TKM", "title": "collection", "to": "articles without titles or abstracts"}, {"from": "TKM", "title": "patterns", "to": "predictions"}, {"from": "TKM", "title": "algorithms", "to": "classification"}, {"from": "TKM", "title": "problem", "to": "framing"}, {"from": "TKM", "title": "medical", "to": "records"}, {"from": "TKM", "title": "key", "to": "features"}, {"from": "TKM", "title": "models", "to": "utilizing"}, {"from": "TKM", "title": "could benefit from", "to": "new paradigm"}, {"from": "TKM", "title": "near-passing performance", "to": "NLEKMD"}, {"from": "making", "title": "it", "to": "difficult"}, {"from": "making", "title": "valuable", "to": "widely used"}, {"from": "clinicians", "title": "local", "to": "remote"}]);
|
93 |
|
94 |
nodeColors = {};
|
95 |
allNodes = nodes.get({ returnType: "Object" });
|