# NewsBro/controller/newsBroApi.py
from __future__ import unicode_literals, print_function
import json
import os
import nltk
import tensorflow as tf
import tensorflow_hub as hub
from nltk.tokenize import word_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from transformers import pipeline
from spacy.lang.en import English
nltk.download('punkt')
MAX_TOKENS = 880
MIN_WORD_PER_SENTENCE = 15
SUMMARY_MAX_LENGTH = 240
SUMMARY_MIN_LENGTH = 30
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
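# Note on the constants above (our reading, not documented in the source):
# MAX_TOKENS = 880 presumably keeps the LexRank-reduced text safely under
# BART's 1024-token input limit, and the Universal Sentence Encoder returns
# 512-dimensional, approximately unit-length embeddings, which is why a plain
# dot product works as a similarity score in areBulletsSimilar() below.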
class Bullet:
    def __init__(self, text, publisher):
        self.text = text
        self.publisher = publisher

    def __str__(self):
        return f"{self.publisher}: {self.text}"
class Summary:
    def __init__(self, text, publisher):
        self.text = text
        self.publisher = publisher
def getNumTokens(article):
    return len(word_tokenize(article))
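# Example (hypothetical input): getNumTokens("NLTK splits punctuation, too.")
# returns 6, since word_tokenize counts the comma and the period as tokens.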
def lexRank(article, sentenceCount):
    # Parse the raw article text into a sumy document
    parser = PlaintextParser.from_string(article, Tokenizer("english"))
    # Local name so we don't shadow the module-level BART pipeline
    lex_summarizer = LexRankSummarizer()
    # Keep the sentenceCount most central sentences, in document order
    summary = lex_summarizer(parser.document, sentenceCount)
    return " ".join(str(sentence) for sentence in summary)
def bart(article, maxLength=SUMMARY_MAX_LENGTH, minLength=SUMMARY_MIN_LENGTH):
    # Deterministic (do_sample=False) abstractive summary via BART
    return summarizer(article, max_length=maxLength, min_length=minLength, do_sample=False)
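# The summarization pipeline returns a list with one dict per input, e.g.
# [{'summary_text': '...'}], which is why callers index [0]['summary_text'].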
def getArticles():
    folder_path = "articles"
    # List every regular file in the articles folder
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    # Keep only the .txt files
    txt_files = [f for f in files if f.endswith(".txt")]
    # Map each publisher (the file name without ".txt") to its article text
    file_contents = {}
    for txt_file in txt_files:
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            publisher = txt_file[:-4]
            file_contents[publisher] = content
    return file_contents
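# Usage sketch (assumes a hypothetical layout like articles/bbc.txt and
# articles/cnn.txt): getArticles() would return
# {"bbc": "<full text>", "cnn": "<full text>"}.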
def summarizeArticle(article):
    numTokens = getNumTokens(article)
    lexRankedArticle = article
    i = 0
    # If the article is too long for BART, shrink it with LexRank first,
    # asking for fewer sentences on each pass until it fits
    while numTokens > MAX_TOKENS:
        # Integer sentence count, as sumy expects
        numSentences = MAX_TOKENS // (MIN_WORD_PER_SENTENCE + i)
        lexRankedArticle = lexRank(article, numSentences)
        numTokens = getNumTokens(lexRankedArticle)
        i += 1
    return bart(lexRankedArticle)
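# Worked example of the shrinking loop: with MAX_TOKENS = 880 and
# MIN_WORD_PER_SENTENCE = 15, the first pass keeps 880 // 15 = 58 sentences,
# the second 880 // 16 = 55, and so on, until the LexRanked text tokenizes
# to at most 880 tokens and can be handed to BART.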
def getSummarizedArticles():
    articles = getArticles()
    summaries = []
    for publisher, text in articles.items():
        summaries.append(Summary(summarizeArticle(text)[0]['summary_text'], publisher))
    return summaries
def areBulletsSimilar(sentence1, sentence2):
    embeddings1 = embed([sentence1])
    embeddings2 = embed([sentence2])
    # Dot product of the two embeddings; USE vectors are roughly unit-length,
    # so this approximates cosine similarity
    similarity = tf.reduce_sum(tf.multiply(embeddings1, embeddings2)).numpy()
    return similarity > 0.5
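# Example (hypothetical scores): two paraphrases such as "Stocks fell sharply"
# and "Shares dropped steeply" would typically score well above 0.5, while
# unrelated sentences land near 0. The 0.5 cutoff is this code's own choice,
# not a property of the encoder.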
def getSentencesFromRawText(input_text):
    # Lightweight English pipeline; the sentencizer must be added before
    # processing, otherwise doc.sents is unavailable
    nlp = English()
    nlp.add_pipe('sentencizer')
    doc = nlp(input_text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences
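# Usage sketch: getSentencesFromRawText("First point. Second point.") returns
# ["First point.", "Second point."] (rule-based splitting, so unusual
# punctuation may split imperfectly).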
def getAllBullets(summaries):
    allBullets = []
    for summary in summaries:
        publisher = summary.publisher
        curBullets = getSentencesFromRawText(summary.text)
        for bulletText in curBullets:
            allBullets.append(Bullet(bulletText, publisher))
    return allBullets
def getFinalClusters(allBullets):
    # Greedy single-pass clustering: each bullet joins the first cluster whose
    # representative (first bullet) it is similar to, else starts a new cluster
    output = [[allBullets[0]]]
    for i in range(1, len(allBullets)):
        cur = allBullets[i]
        foundSimilarInstance = False
        for j in range(len(output)):
            if areBulletsSimilar(cur.text, output[j][0].text):
                foundSimilarInstance = True
                output[j].append(cur)
                break
        if not foundSimilarInstance:
            output.append([cur])
    return output
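# Note: because membership is decided against the first bullet of each cluster
# only, the result is order-sensitive; the trade-off is a single O(n * k) pass
# over the bullets (k = number of clusters) instead of all-pairs comparison.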
def getFinalOutput(clusters):
    # The five largest clusters, i.e. the most widely reported stories
    return sorted(clusters, key=len, reverse=True)[:5]
def getData():
    allSummaries = getSummarizedArticles()
    allBullets = getAllBullets(allSummaries)
    clusters = getFinalClusters(allBullets)
    finalOutput = getFinalOutput(clusters)
    data = []
    for element in finalOutput:
        publishers = [bullet.publisher for bullet in element]
        headline = {
            # 31 appears to be the total number of publishers tracked, making
            # the score the share of outlets covering the story
            'score': f"{round((len(set(publishers)) / 31) * 100, 1)}%",
            'text': element[0].text,
            'publishers': list(set(publishers)),
        }
        data.append(headline)
    return data
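# Shape of the returned data (values hypothetical):
# [{'score': '12.9%', 'text': 'Some headline sentence.',
#   'publishers': ['bbc', 'cnn', ...]}, ...]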
def sendData():
    data = getData()
    jsonString = json.dumps(data, indent=2)
    print(jsonString)
    file_name = 'output.json'
    with open(file_name, 'w') as json_file:
        json.dump(data, json_file, indent=2)

if __name__ == "__main__":
    sendData()