from __future__ import unicode_literals, print_function

import json
import os

import nltk
import tensorflow as tf
import tensorflow_hub as hub
from nltk.tokenize import word_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from transformers import pipeline
from spacy.lang.en import English

nltk.download('punkt')

MAX_TOKENS = 880              # input budget so the LexRank extract stays under BART's 1024-token limit
MIN_WORD_PER_SENTENCE = 15    # assumed average sentence length, used to size the LexRank extract
SUMMARY_MAX_LENGTH = 240
SUMMARY_MIN_LENGTH = 30

# Universal Sentence Encoder for similarity checks, BART for abstractive summarization.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


class Bullet:
    """A single summary sentence attributed to its publisher."""

    def __init__(self, text="", publisher="NewsBroInc."):
        self.text = text
        self.publisher = publisher

    def __str__(self):
        return f"{self.publisher}: {self.text}"


class Summary:
    """A full article summary attributed to its publisher."""

    def __init__(self, text="", publisher="NewsBroInc."):
        self.text = text
        self.publisher = publisher


def getNumTokens(article):
    return len(word_tokenize(article))


def lexRank(article, sentenceCount):
    # Create a parser for the article text
    parser = PlaintextParser.from_string(article, Tokenizer("english"))
    # Create a LexRank summarizer (named so it does not shadow the global BART pipeline)
    lexRankSummarizer = LexRankSummarizer()
    # Get the summary and join the extracted sentences back into plain text
    summary = lexRankSummarizer(parser.document, sentenceCount)
    return " ".join(str(sentence) for sentence in summary)


def bart(article, maxLength=SUMMARY_MAX_LENGTH, minLength=SUMMARY_MIN_LENGTH):
    return summarizer(article, max_length=maxLength, min_length=minLength, do_sample=False)


def getArticles():
    folder_path = "articles"
    # Get the list of all files in the specified folder
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    # Filter out only the txt files
    txt_files = [f for f in files if f.endswith(".txt")]
    # Create a dictionary mapping each publisher (file name without ".txt") to its article text
    file_contents = {}
    # Loop through each txt file and read its content
    for txt_file in txt_files:
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, 'r', encoding='utf-8') as file:
            publisher = txt_file[:-4]
            file_contents[publisher] = file.read()
    return file_contents


def summarizeArticle(article):
    """Compress the article with LexRank until it fits BART's input budget, then summarize."""
    numTokens = getNumTokens(article)
    lexRankedArticle = article
    i = 0
    while numTokens > MAX_TOKENS:
        # Request progressively fewer sentences until the extract fits the token budget.
        numSentences = MAX_TOKENS // (MIN_WORD_PER_SENTENCE + i)
        lexRankedArticle = lexRank(article, numSentences)
        numTokens = getNumTokens(lexRankedArticle)
        i += 1
    return bart(lexRankedArticle)


def getSummarizedArticles():
    summaries = []
    for publisher, article in getArticles().items():
        summaryText = summarizeArticle(article)[0]['summary_text']
        summaries.append(Summary(summaryText, publisher))
    return summaries


def areBulletsSimilar(sentence1, sentence2):
    # The dot product of the two sentence embeddings serves as the similarity score.
    embeddings1 = embed([sentence1])
    embeddings2 = embed([sentence2])
    similarity = tf.reduce_sum(tf.multiply(embeddings1, embeddings2)).numpy()
    return similarity > 0.5


def getSentencesFromRawText(input_text):
    # Load the English NLP model from spacy and add the sentencizer before processing
    nlp = English()
    nlp.add_pipe('sentencizer')
    # Process the text using spacy
    doc = nlp(input_text)
    return [sent.text.strip() for sent in doc.sents]


def getAllBullets(summaries):
    allBullets = []
    for summary in summaries:
        publisher = summary.publisher
        curBullets = getSentencesFromRawText(summary.text)
        for bulletText in curBullets:
            allBullets.append(Bullet(bulletText, publisher))
    return allBullets


def getFinalClusters(allBullets):
    """Greedily cluster bullets: each bullet joins the first cluster whose
    representative (first) bullet it is similar to, otherwise it starts a new cluster."""
    output = [[allBullets[0]]]
    for i in range(1, len(allBullets)):
        cur = allBullets[i]
        foundSimilarInstance = False
        for j in range(len(output)):
            if areBulletsSimilar(cur.text, output[j][0].text):
                foundSimilarInstance = True
                output[j].append(cur)
                break
        if not foundSimilarInstance:
            output.append([cur])
    return output


def getFinalOutput(clusters):
    # Keep the five largest clusters, i.e. the most widely reported stories.
    return sorted(clusters, key=len, reverse=True)[:5]


def getData():
    allSummaries = getSummarizedArticles()
    allBullets = getAllBullets(allSummaries)
    clusters = getFinalClusters(allBullets)
    finalOutput = getFinalOutput(clusters)
    data = []
    for element in finalOutput:
        publishers = [subElement.publisher for subElement in element]
        headline = {
            # 31 is assumed to be the total number of publishers being tracked, so the
            # score is the percentage of publishers whose coverage fell into this cluster.
            'score': f"{round((len(set(publishers)) / 31) * 100, 1)}%",
            'text': element[0].text,
            'publishers': list(set(publishers)),
        }
        data.append(headline)
    return data


def sendData():
    data = getData()
    # Print the headline data and also persist it to output.json.
    print(json.dumps(data, indent=2))
    with open('output.json', 'w') as json_file:
        json.dump(data, json_file, indent=2)


if __name__ == "__main__":
    sendData()
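
# Illustrative shape of output.json, based on the record built in getData(); the
# publisher names and headline text below are hypothetical, not real output:
#
# [
#   {
#     "score": "9.7%",
#     "text": "Example headline sentence shared across multiple outlets.",
#     "publishers": ["publisher_a", "publisher_b", "publisher_c"]
#   },
#   ...
# ]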