Use a Transformer NER pipeline instead of Flair, and use a pre-rendered dependency image for one article as a test
Browse files
- app.py +46 -26
- dependency-images/article11.txt +1 -0
app.py
CHANGED
|
@@ -23,15 +23,15 @@ import spacy
|
|
| 23 |
from spacy import displacy
|
| 24 |
from spacy_streamlit import visualize_parser
|
| 25 |
|
| 26 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 27 |
from transformers import pipeline
|
| 28 |
import os
|
| 29 |
from transformers_interpret import SequenceClassificationExplainer
|
| 30 |
|
| 31 |
-
|
| 32 |
# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
|
| 33 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 34 |
|
|
|
|
| 35 |
@st.experimental_singleton
|
| 36 |
def get_sentence_embedding_model():
|
| 37 |
return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
|
@@ -43,14 +43,21 @@ def get_spacy():
|
|
| 43 |
return nlp
|
| 44 |
|
| 45 |
|
| 46 |
-
#TODO: might look into which one is the best here
|
| 47 |
-
#TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
|
| 48 |
@st.experimental_singleton
|
| 49 |
-
|
| 50 |
def get_flair_tagger():
|
| 51 |
return SequenceTagger.load("flair/ner-english-ontonotes-fast")
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
# Page setup
|
| 55 |
st.set_page_config(
|
| 56 |
page_title="Post-processing summarization fact checker",
|
|
@@ -97,6 +104,12 @@ def fetch_dependency_specific_contents(filename: str) -> AnyStr:
|
|
| 97 |
return data
|
| 98 |
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
def display_summary(article_name: str):
|
| 101 |
summary_content = fetch_summary_contents(article_name)
|
| 102 |
st.session_state.summary_output = summary_content
|
|
@@ -122,10 +135,16 @@ def get_all_entities_per_sentence(text):
|
|
| 122 |
entities_this_sentence.append(str(entity))
|
| 123 |
|
| 124 |
# FLAIR ENTITIES
|
| 125 |
-
sentence_entities = Sentence(str(sentence))
|
| 126 |
-
tagger.predict(sentence_entities)
|
| 127 |
-
for entity in sentence_entities.get_spans('ner'):
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
entities_all_sentences.append(entities_this_sentence)
|
| 130 |
|
| 131 |
return entities_all_sentences
|
|
@@ -188,6 +207,7 @@ def highlight_entities(article_name: str):
|
|
| 188 |
def render_dependency_parsing(text: str):
|
| 189 |
html = render_sentence_custom(text)
|
| 190 |
html = html.replace("\n\n", "\n")
|
|
|
|
| 191 |
st.write(get_svg(html), unsafe_allow_html=True)
|
| 192 |
|
| 193 |
|
|
@@ -275,7 +295,8 @@ currently selected article.""")
|
|
| 275 |
|
| 276 |
nlp = get_spacy()
|
| 277 |
sentence_embedding_model = get_sentence_embedding_model()
|
| 278 |
-
tagger = get_flair_tagger()
|
|
|
|
| 279 |
|
| 280 |
# GENERATING SUMMARIES PART
|
| 281 |
st.header("Generating summaries")
|
|
@@ -309,11 +330,6 @@ else:
|
|
| 309 |
st.error('**Error**: No comment to classify. Please provide a comment.',
|
| 310 |
help="Generate summary for the given article text")
|
| 311 |
|
| 312 |
-
if is_valid_url(article_text):
|
| 313 |
-
print("YES")
|
| 314 |
-
else:
|
| 315 |
-
print("NO")
|
| 316 |
-
|
| 317 |
|
| 318 |
def render_svg(svg_file):
|
| 319 |
with open(svg_file, "r") as f:
|
|
@@ -390,17 +406,21 @@ st.markdown("However, by empirical testing, we have found that there are certain
|
|
| 390 |
"dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
|
| 391 |
"currently selected article.")
|
| 392 |
with st.spinner("Doing dependency parsing..."):
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
dep_specific_text = fetch_dependency_specific_contents(selected_article)
|
| 405 |
soup = BeautifulSoup(dep_specific_text, features="html.parser")
|
| 406 |
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
|
|
|
| 23 |
from spacy import displacy
|
| 24 |
from spacy_streamlit import visualize_parser
|
| 25 |
|
| 26 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
|
| 27 |
from transformers import pipeline
|
| 28 |
import os
|
| 29 |
from transformers_interpret import SequenceClassificationExplainer
|
| 30 |
|
|
|
|
| 31 |
# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
|
| 32 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 33 |
|
| 34 |
+
|
| 35 |
@st.experimental_singleton
|
| 36 |
def get_sentence_embedding_model():
|
| 37 |
return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
|
|
|
| 43 |
return nlp
|
| 44 |
|
| 45 |
|
| 46 |
+
# TODO: might look into which one is the best here
|
| 47 |
+
# TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
|
| 48 |
@st.experimental_singleton
|
| 49 |
+
# @st.cache(suppress_st_warning=True, allow_output_mutation=True)
|
| 50 |
def get_flair_tagger():
|
| 51 |
return SequenceTagger.load("flair/ner-english-ontonotes-fast")
|
| 52 |
|
| 53 |
|
| 54 |
+
@st.experimental_singleton
def get_transformer_pipeline():
    """Build the Hugging Face NER pipeline used for entity extraction.

    Loads the tokenizer and the token-classification model for
    ``xlm-roberta-large-finetuned-conll03-english`` and wraps them in a
    ``"ner"`` pipeline. The function is decorated with
    ``st.experimental_singleton`` so the (large) model is meant to be
    loaded once and reused rather than rebuilt on each Streamlit rerun.

    Returns:
        The object returned by ``transformers.pipeline``; calling it on a
        string yields a list of dicts, one per aggregated entity, each
        carrying a ``"word"`` key with the entity text.
    """
    # Single source of truth so tokenizer and model can never drift apart.
    model_name = "xlm-roberta-large-finetuned-conll03-english"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    # ``grouped_entities=True`` is deprecated in transformers; the documented
    # equivalent is ``aggregation_strategy="simple"``, which merges word
    # pieces belonging to the same entity into a single result.
    return pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
# Page setup
|
| 62 |
st.set_page_config(
|
| 63 |
page_title="Post-processing summarization fact checker",
|
|
|
|
| 104 |
return data
|
| 105 |
|
| 106 |
|
| 107 |
+
def fetch_dependency_svg(filename: str) -> str:
    """Return the pre-rendered dependency markup stored for an article.

    Reads ``./dependency-images/<filename lower-cased>.txt`` relative to the
    current working directory and returns its entire content as text.

    Args:
        filename: Article name; it is lower-cased before being used as the
            file stem.

    Returns:
        The full text content of the file.

    Raises:
        FileNotFoundError: If no file exists for the given article name.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(f'./dependency-images/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
|
| 111 |
+
|
| 112 |
+
|
| 113 |
def display_summary(article_name: str):
|
| 114 |
summary_content = fetch_summary_contents(article_name)
|
| 115 |
st.session_state.summary_output = summary_content
|
|
|
|
| 135 |
entities_this_sentence.append(str(entity))
|
| 136 |
|
| 137 |
# FLAIR ENTITIES
|
| 138 |
+
# sentence_entities = Sentence(str(sentence))
|
| 139 |
+
# tagger.predict(sentence_entities)
|
| 140 |
+
# for entity in sentence_entities.get_spans('ner'):
|
| 141 |
+
# entities_this_sentence.append(entity.text)
|
| 142 |
+
|
| 143 |
+
# XLM ENTITIES
|
| 144 |
+
entities_xlm = [entity["word"] for entity in ner_model(str(sentence))]
|
| 145 |
+
for entity in entities_xlm:
|
| 146 |
+
entities_this_sentence.append(str(entity))
|
| 147 |
+
|
| 148 |
entities_all_sentences.append(entities_this_sentence)
|
| 149 |
|
| 150 |
return entities_all_sentences
|
|
|
|
| 207 |
def render_dependency_parsing(text: str):
|
| 208 |
html = render_sentence_custom(text)
|
| 209 |
html = html.replace("\n\n", "\n")
|
| 210 |
+
# print(get_svg(html))
|
| 211 |
st.write(get_svg(html), unsafe_allow_html=True)
|
| 212 |
|
| 213 |
|
|
|
|
| 295 |
|
| 296 |
nlp = get_spacy()
|
| 297 |
sentence_embedding_model = get_sentence_embedding_model()
|
| 298 |
+
# tagger = get_flair_tagger()
|
| 299 |
+
ner_model = get_transformer_pipeline()
|
| 300 |
|
| 301 |
# GENERATING SUMMARIES PART
|
| 302 |
st.header("Generating summaries")
|
|
|
|
| 330 |
st.error('**Error**: No comment to classify. Please provide a comment.',
|
| 331 |
help="Generate summary for the given article text")
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
def render_svg(svg_file):
|
| 335 |
with open(svg_file, "r") as f:
|
|
|
|
| 406 |
"dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
|
| 407 |
"currently selected article.")
|
| 408 |
with st.spinner("Doing dependency parsing..."):
|
| 409 |
+
# TODO: use the right if-condition here (only if this is an example article AND the input is unchanged)
|
| 410 |
+
if selected_article == 'article11':
|
| 411 |
+
st.write(fetch_dependency_svg((selected_article)), unsafe_allow_html=True)
|
| 412 |
+
else:
|
| 413 |
+
summary_deps = check_dependency(False)
|
| 414 |
+
article_deps = check_dependency(True)
|
| 415 |
+
total_unmatched_deps = []
|
| 416 |
+
for summ_dep in summary_deps:
|
| 417 |
+
if not any(summ_dep['identifier'] in art_dep['identifier'] for art_dep in article_deps):
|
| 418 |
+
total_unmatched_deps.append(summ_dep)
|
| 419 |
+
# print(f'ALL UNMATCHED DEPS ARE: {total_unmatched_deps}')
|
| 420 |
+
# render_dependency_parsing(check_dependency(False))
|
| 421 |
+
if total_unmatched_deps:
|
| 422 |
+
for current_drawing_list in total_unmatched_deps:
|
| 423 |
+
render_dependency_parsing(current_drawing_list)
|
| 424 |
dep_specific_text = fetch_dependency_specific_contents(selected_article)
|
| 425 |
soup = BeautifulSoup(dep_specific_text, features="html.parser")
|
| 426 |
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
dependency-images/article11.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem"><img src="" style=""/></div>
|