|
import nltk |
|
import re |
|
import nltkmodule |
|
from newspaper import Article |
|
from newspaper import fulltext |
|
import requests |
|
from nltk.tokenize import word_tokenize |
|
from sentence_transformers import SentenceTransformer, models, losses, LoggingHandler |
|
import pandas as pd |
|
import numpy as np |
|
from torch.utils.data import DataLoader |
|
import math |
|
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator |
|
from sentence_transformers.readers import * |
|
from nltk.corpus import stopwords |
|
stop_words = stopwords.words('english') |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import networkx as nx |
|
from nltk.tokenize import sent_tokenize |
|
import scispacy |
|
import en_core_sci_lg |
|
import string |
|
import gradio as gr |
|
import inflect |
|
|
|
# Shared inflect engine; keyphrase_generator uses it to test whether an
# entity is already singular.
inflect_op = inflect.engine()

# Load the scispaCy pipeline a single time. The model is large, and the second
# pipeline was only ever used to read its stop-word set, so ``sp`` is kept as
# an alias of ``nlp`` rather than a second copy in memory.
nlp = en_core_sci_lg.load()
sp = nlp

# Stop-word set shipped with the pipeline's language defaults.
all_stopwords = sp.Defaults.stop_words
|
|
|
|
|
def remove_stopwords(sen):
    """Join the items of ``sen`` with single spaces, skipping stop words.

    ``sen`` is iterated item by item; every item that is not present in the
    module-level ``stop_words`` list is kept, in its original order.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)
|
|
|
# Cue phrases that tend to introduce a conclusion or a reported finding.
_INDICATOR_PHRASES = (
    'concluded', 'concludes', 'in a study', 'concluding', 'conclude', 'in sum',
    'in a recent study', 'therefore', 'thus', 'so', 'hence', 'as a result',
    'accordingly', 'consequently', 'in short', 'proves that', 'shows that',
    'suggests that', 'demonstrates that', 'found that', 'observed that',
    'indicated that', 'suggested that', 'demonstrated that',
)

# Whole-word, case-insensitive matcher for the cue phrases. A plain substring
# test would let 'so' fire on "also"/"associated" and would miss a
# capitalised "Therefore" at the start of a sentence.
_INDICATOR_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(phrase) for phrase in _INDICATOR_PHRASES) + r")\b",
    re.IGNORECASE,
)

# Everything except ASCII letters is blanked out before sentence embedding.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")

# Upper bound, in seconds, on how long the article download may block.
_REQUEST_TIMEOUT_SECONDS = 30


def _rank_sentences(sentences, encoder):
    """Return ``sentences`` ordered from highest to lowest combined score."""
    cleaned = [_NON_LETTER_RE.sub(" ", sentence) for sentence in sentences]
    embeddings = np.asarray(encoder.encode(cleaned))

    # One vectorised call yields the full pairwise matrix for any embedding
    # width; the diagonal is zeroed so the graph has no self-loops.
    similarity = cosine_similarity(embeddings).astype(np.float64)
    np.fill_diagonal(similarity, 0.0)
    pagerank = nx.pagerank(nx.from_numpy_array(similarity))

    # Scores are paired with sentences by position, so repeated sentences
    # cannot shift the alignment. A sentence containing a cue phrase gets +1.
    scored = [
        (pagerank[index] + (1 if _INDICATOR_RE.search(sentence) else 0), sentence)
        for index, sentence in enumerate(sentences)
    ]
    scored.sort(reverse=True)
    return [sentence for _, sentence in scored]


def _candidate_entities(sentences):
    """Return the unique scispaCy entities of ``sentences`` kept as candidates."""
    candidates = []
    for sentence in sentences:
        for entity in nlp(sentence).ents:
            text = entity.text
            if text in all_stopwords:
                continue
            # singular_noun() returns False when the word is already
            # singular; anything it can singularise (a plural) is dropped.
            if inflect_op.singular_noun(text) is False:
                candidates.append(text)
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(candidates))


def keyphrase_generator(article_link: str, model_1: str, model_2: str,
                        max_num_keywords: int) -> str:
    """Extract keyphrases from the article found at ``article_link``.

    The page is downloaded and reduced to its main text, which is split into
    sentences. Sentences are ranked by PageRank over their pairwise cosine
    similarity (embeddings from ``model_1``) plus a +1 bonus when a conclusion
    cue phrase occurs. The top 10% of sentences (at least five, or all of them
    if the article is shorter) are scanned for scispaCy entities, and the
    entities most similar to those sentences (embeddings from ``model_2``)
    are returned.

    Args:
        article_link: URL of the article to analyse.
        model_1: Model name or path given to ``SentenceTransformer`` for
            sentence ranking.
        model_2: Model name or path given to ``SentenceTransformer`` for
            scoring candidate keyphrases.
        max_num_keywords: Maximum number of keyphrases to return.

    Returns:
        The selected keyphrases joined by newlines, ordered from least to most
        similar to the summary text. An empty string is returned when the
        article yields no sentences or no candidate entities.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(article_link, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Fail loudly instead of extracting keyphrases from an error page.
    response.raise_for_status()

    corpus = sent_tokenize(fulltext(response.text))
    if not corpus:
        return ""

    ranked = _rank_sentences(corpus, SentenceTransformer(model_1))

    # Keep the top 10% of sentences but never fewer than five; slicing keeps
    # this safe for articles shorter than five sentences.
    top_count = max(5, len(ranked) // 10)
    top_sentences = ranked[:top_count]

    candidates = _candidate_entities(top_sentences)
    if not candidates:
        return ""

    encoder = SentenceTransformer(model_2)
    doc_embedding = encoder.encode([" ".join(top_sentences)])
    candidate_embeddings = encoder.encode(candidates)
    similarities = cosine_similarity(doc_embedding, candidate_embeddings)

    # UI sliders may deliver the limit as a float; slice bounds must be ints.
    top_n = int(max_num_keywords)
    best_indices = similarities.argsort()[0][-top_n:]
    return '\n'.join(candidates[index] for index in best_indices)
|
|
|
# Gradio front end: an article URL, two SBERT model pickers and a keyword
# limit go in; the newline-separated keyphrases come out.
# The initial value of each component is passed as ``value``, the keyword the
# gr.components constructors use for it.
igen = gr.Interface(
    keyphrase_generator,
    inputs=[
        gr.components.Textbox(
            lines=1,
            placeholder="Provide an online health article web link here",
            value="",
            label="Article web link",
        ),
        gr.components.Dropdown(
            choices=[
                'sentence-transformers/all-mpnet-base-v2',
                'sentence-transformers/all-mpnet-base-v1',
                'sentence-transformers/all-distilroberta-v1',
                'sentence-transformers/gtr-t5-large',
                'pritamdeka/S-Bluebert-snli-multinli-stsb',
                'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
                'sentence-transformers/stsb-mpnet-base-v2',
                'sentence-transformers/all-roberta-large-v1',
                'sentence-transformers/stsb-roberta-base-v2',
                'sentence-transformers/stsb-distilroberta-base-v2',
                'sentence-transformers/sentence-t5-large',
                'sentence-transformers/sentence-t5-base',
            ],
            type="value",
            value='pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
            label="Select any SBERT model for TextRank from the list below",
        ),
        gr.components.Dropdown(
            choices=[
                'sentence-transformers/paraphrase-mpnet-base-v2',
                'sentence-transformers/all-mpnet-base-v1',
                'sentence-transformers/paraphrase-distilroberta-base-v1',
                'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
                'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
                'sentence-transformers/paraphrase-albert-small-v2',
                'sentence-transformers/paraphrase-albert-base-v2',
                'sentence-transformers/paraphrase-MiniLM-L12-v2',
                'sentence-transformers/paraphrase-MiniLM-L6-v2',
                'sentence-transformers/all-MiniLM-L12-v2',
                'sentence-transformers/all-distilroberta-v1',
                'sentence-transformers/paraphrase-TinyBERT-L6-v2',
                'sentence-transformers/paraphrase-MiniLM-L3-v2',
                'sentence-transformers/all-MiniLM-L6-v2',
            ],
            type="value",
            value='sentence-transformers/all-mpnet-base-v1',
            label="Select any SBERT model for keyphrases from the list below",
        ),
        gr.components.Slider(minimum=5, maximum=30, step=1, value=10, label="Max Keywords"),
    ],
    # Same component family as the inputs, replacing the legacy gr.outputs API.
    outputs=gr.components.Textbox(label="Output"),
    theme="peach",
    title="Health Article Keyphrase Generator",
    description="Generates the keyphrases from an online health article which best describes the article. ",
    # Adjacent string literals are concatenated into one HTML snippet. The
    # models-hub link carries an explicit https:// scheme so that it is not
    # resolved as a path relative to the app's own URL.
    article=(
        "The work is based on a part of the paper provided <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>here</a>."
        "\t It uses the TextRank algorithm with <a href=https://www.sbert.net/>SBERT</a> to first find the top ranked sentences and then extracts the keyphrases"
        "\t from those sentences using <a href = https://allenai.github.io/scispacy/>scispaCy</a> and SBERT."
        "\t The list of SBERT models provided can be found in <a href=https://www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
        "\t The default model names are provided which can be changed from the list of models available. "
        "\t The value of output keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30."
    ),
)

# Serve the app locally without requesting a public share link.
igen.launch(share=False)
|
|