import nltk |
import re |
import nltkmodule |
from newspaper import Article |
from newspaper import fulltext |
import requests |
from nltk.tokenize import word_tokenize |
from sentence_transformers import SentenceTransformer |
import pandas as pd |
import numpy as np |
from pandas import ExcelWriter |
from torch.utils.data import DataLoader |
import math |
from sentence_transformers import models, losses |
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer |
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator |
from sentence_transformers.readers import * |
from nltk.corpus import stopwords |
# English stop-word list from NLTK; remove_stopwords() filters against it.
# NOTE(review): needs the NLTK "stopwords" corpus to be installed —
# presumably the local `nltkmodule` import handles the download; confirm.
stop_words = stopwords.words('english')
import matplotlib.pyplot as plt |
from sklearn.cluster import KMeans |
from sklearn.decomposition import PCA |
from sklearn.metrics.pairwise import cosine_similarity |
import scipy.spatial |
import networkx as nx |
from nltk.tokenize import sent_tokenize |
import scispacy |
import spacy |
import en_core_sci_lg |
import string |
from nltk.stem.wordnet import WordNetLemmatizer |
import gradio as gr |
# Load the scispaCy pipeline a single time. The original code called
# ``en_core_sci_lg.load()`` twice, building two identical pipelines even
# though the second one was only read for its default stop-word set.
nlp = en_core_sci_lg.load()
# ``sp`` is kept as an alias so any code that still refers to it keeps working.
sp = nlp
# spaCy's default stop words for this pipeline; used to filter extracted
# entities in keyphrase_generator().
all_stopwords = sp.Defaults.stop_words
def remove_stopwords(sen):
    """Join the items of ``sen`` with single spaces, skipping stop words.

    ``sen`` is iterated item by item; every item that does not appear in
    the module-level ``stop_words`` list is kept, in its original order.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)
# Cue phrases that mark a sentence as a likely conclusion / finding.
# NOTE(review): matching is a case-sensitive substring test, so "so" also
# matches inside words such as "also", and a capitalised "Therefore" does
# not match. Kept as-is to avoid changing the ranking; confirm the intent.
_INDICATOR_PHRASES = (
    'concluded', 'concludes', 'in a study', 'concluding', 'conclude',
    'in sum', 'in a recent study', 'therefore', 'thus', 'so', 'hence',
    'as a result', 'accordingly', 'consequently', 'in short', 'proves that',
    'shows that', 'suggests that', 'demonstrates that', 'found that',
    'observed that', 'indicated that', 'suggested that', 'demonstrated that',
)

# Anything that is not an ASCII letter is blanked out before embedding.
_NON_ALPHA_RE = re.compile(r"[^a-zA-Z]")

# The top 10% of sentences are kept, but never fewer than five.
_TOP_SENTENCE_PERCENT = 10
_MIN_TOP_SENTENCES = 5

# Without a timeout, requests.get() can block indefinitely on a stalled host.
_REQUEST_TIMEOUT_SECONDS = 30


def _indicator_scores(sentences):
    """Return 1 for every sentence containing a cue phrase, otherwise 0."""
    return [
        1 if any(phrase in sentence for phrase in _INDICATOR_PHRASES) else 0
        for sentence in sentences
    ]


def _textrank_scores(embeddings):
    """Return one PageRank score per row of ``embeddings``, in row order."""
    # A single pairwise call works for any embedding width, unlike the
    # previous per-pair reshape to a hard-coded 768 dimensions.
    sim_mat = np.asarray(cosine_similarity(embeddings), dtype=float)
    # Self-similarity must not contribute an edge to the graph.
    np.fill_diagonal(sim_mat, 0.0)
    ranks = nx.pagerank(nx.from_numpy_array(sim_mat))
    return [ranks[node] for node in range(len(ranks))]


def _select_top_sentences(sentences, combined_scores):
    """Return the best-scoring sentences, highest combined score first."""
    ranked = sorted(zip(combined_scores, sentences), reverse=True)
    wanted = max(
        _MIN_TOP_SENTENCES,
        (_TOP_SENTENCE_PERCENT * len(ranked)) // 100,
    )
    # Slicing copes with articles shorter than the minimum; indexing did not.
    return [sentence for _, sentence in ranked[:wanted]]


def _extract_entities(sentences):
    """Return unique, non-stop-word entity texts in first-seen order."""
    entities = []
    for sentence in sentences:
        entities.extend(ent.text for ent in nlp(sentence).ents)
    return list(
        dict.fromkeys(ent for ent in entities if ent not in all_stopwords)
    )


def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
    """Extract keyphrases from the article found at ``article_link``.

    The article text is split into sentences. Each sentence gets a score
    made of its PageRank value on the sentence-similarity graph plus 1 if
    it contains a cue phrase. Entities found by the scispaCy pipeline in
    the top-scoring sentences become candidates, and the candidates most
    similar to those sentences are returned.

    Args:
        article_link: URL of the article to download.
        model_1: Model name passed to ``SentenceTransformer``; its
            embeddings are used to rank sentences.
        model_2: Model name passed to ``SentenceTransformer``; its
            embeddings are used to rank candidate keyphrases.
        max_num_keywords: Maximum number of keyphrases to return. It is
            converted with ``int()`` because it is used as a slice bound.

    Returns:
        The selected keyphrases joined by newlines, ordered from least to
        most similar. An empty string is returned when the article yields
        no sentences or no candidate entities.
    """
    sentence_model = SentenceTransformer(model_1)
    keyphrase_model = SentenceTransformer(model_2)

    html = requests.get(article_link, timeout=_REQUEST_TIMEOUT_SECONDS).text
    corpus = sent_tokenize(fulltext(html))
    if not corpus:
        return ""

    # ``re`` is used directly so that the pattern is always treated as a
    # regular expression; pandas' ``str.replace`` default is version
    # dependent and may match the pattern literally.
    cleaned_sentences = [_NON_ALPHA_RE.sub(" ", sent) for sent in corpus]
    textrank = _textrank_scores(sentence_model.encode(cleaned_sentences))

    # Scores are kept positionally aligned with ``corpus``. Keying them by
    # sentence text, as before, collapsed repeated sentences and misaligned
    # the two score lists.
    combined = [
        cue + rank
        for cue, rank in zip(_indicator_scores(corpus), textrank)
    ]
    top_sentences = _select_top_sentences(corpus, combined)

    candidates = _extract_entities(top_sentences)
    if not candidates:
        return ""

    doc_embedding = keyphrase_model.encode([" ".join(top_sentences)])
    candidate_embeddings = keyphrase_model.encode(candidates)
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    top_n = int(max_num_keywords)
    best_indices = distances.argsort()[0][-top_n:]
    return '\n'.join(candidates[index] for index in best_indices)
# Gradio front end: four inputs (article URL, two SBERT model names and the
# keyphrase limit) wired to keyphrase_generator(), with plain-text output.
# NOTE(review): `gr.inputs.*` and the `default=` keyword appear to belong to
# older Gradio releases — confirm the pinned version before upgrading.
igen = gr.Interface(
    keyphrase_generator,
    inputs=[
        gr.inputs.Textbox(
            lines=3,
            placeholder="Provide article link here",
            label="Article link",
        ),
        gr.inputs.Textbox(
            lines=1,
            placeholder="SBERT model",
            default="all-mpnet-base-v2",
            label="SBERT model for TextRank (e.g. all-mpnet-base-v2)",
        ),
        gr.inputs.Textbox(
            lines=1,
            placeholder="SBERT model",
            default="all-distilroberta-v1",
            label="SBERT model for Keyphrases (e.g. all-distilroberta-v1)",
        ),
        gr.inputs.Slider(
            minimum=5,
            maximum=30,
            step=1,
            default=10,
            label="Max number of keyphrases to show",
        ),
    ],
    outputs="text",
    theme="huggingface",
    title="Health Article Keyphrase Generator",
    description="Generates the keyphrases from an online health article which best describes the article.",
    # The SBERT link now carries an explicit scheme; without one the browser
    # resolves the href relative to the app's own URL and the link breaks.
    article="The work is based on a part of the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>Unsupervised Keyword Combination Query Generation from Online Health Related Content for Evidence-Based Fact Checking</a>."
    "\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
    "\t The list of SBERT models required in the textboxes can be found in <a href=https://www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
    "\t The default model names are provided which can be changed from the list of pretrained models. "
    "\t The value of output keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30.",
)
igen.launch(share=True)