from __future__ import unicode_literals, print_function
import json
import os
import nltk
import tensorflow as tf
import tensorflow_hub as hub
from nltk.tokenize import word_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from transformers import pipeline
from spacy.lang.en import English

nltk.download('punkt')

MAX_TOKENS = 880
MIN_WORD_PER_SENTENCE = 15
SUMMARY_MAX_LENGTH = 240
SUMMARY_MIN_LENGTH = 30

# Load the Universal Sentence Encoder and the BART summarization pipeline once at import time
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
class Bullet:
    text = ""
    publisher = "NewsBroInc."

    def __init__(self, text, publisher):
        self.text = text
        self.publisher = publisher

    def __str__(self):
        return f"""{self.publisher}: {self.text}"""
class Summary:
    text = ""
    publisher = "NewsBroInc."

    def __init__(self, text, publisher):
        self.text = text
        self.publisher = publisher
def getNumTokens(article):
    # Approximate token count using NLTK word tokenization
    return len(word_tokenize(article))
def lexRank(article, sentenceCount):
    # Create a parser for the article text
    parser = PlaintextParser.from_string(article, Tokenizer("english"))
    # Create a LexRank summarizer (named so it does not shadow the module-level BART pipeline)
    lexrank_summarizer = LexRankSummarizer()
    # Get the extractive summary as a list of sentences
    summary = lexrank_summarizer(parser.document, sentenceCount)
    summaryText = []
    for sentence in summary:
        summaryText.append(str(sentence))
    return " ".join(summaryText)
def bart(article, maxLength=SUMMARY_MAX_LENGTH, minLength=SUMMARY_MIN_LENGTH):
    # Abstractive summarization with the BART pipeline; deterministic output (no sampling)
    return summarizer(article, max_length=maxLength, min_length=minLength, do_sample=False)
def getArticles():
    folder_path = "articles"
    # Get the list of all files in the specified folder
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    # Filter out only the txt files
    txt_files = [f for f in files if f.endswith(".txt")]
    # Create a dictionary to store the content of each text file
    file_contents = {}
    # Loop through each txt file and read its content
    for txt_file in txt_files:
        file_path = os.path.join(folder_path, txt_file)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        publisher = txt_file[:-4]
        file_contents[publisher] = content
    return file_contents
def summarizeArticle(article):
    # Compress the article with LexRank until it fits within the BART input budget,
    # then produce the final abstractive summary
    numTokens = getNumTokens(article)
    lexRankedArticle = article
    i = 0
    while numTokens > MAX_TOKENS:
        # Request an integer sentence count, asking for fewer sentences on each pass
        numSentences = max(1, MAX_TOKENS // (MIN_WORD_PER_SENTENCE + i))
        lexRankedArticle = lexRank(article, numSentences)
        numTokens = getNumTokens(lexRankedArticle)
        i += 1
    return bart(lexRankedArticle)
def getSummarizedArticles():
    # Summarize every article and wrap each summary together with its publisher
    articles = getArticles()
    summaries = []
    for publisher, text in articles.items():
        cur = Summary(summarizeArticle(text)[0]['summary_text'], publisher)
        summaries.append(cur)
    return summaries
def areBulletsSimilar(sentence1, sentence2):
    # Universal Sentence Encoder embeddings are approximately unit length,
    # so their dot product approximates cosine similarity
    embeddings1 = embed([sentence1])
    embeddings2 = embed([sentence2])
    similarity = tf.reduce_sum(tf.multiply(embeddings1, embeddings2)).numpy()
    # print(similarity)
    return similarity > 0.5
def getSentencesFromRawText(input_text):
    # Load a blank English pipeline from spaCy and add the rule-based sentencizer
    # before processing, so that doc.sents is populated
    nlp = English()
    nlp.add_pipe('sentencizer')
    doc = nlp(input_text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences
def getAllBullets(summaries):
    # Split each summary into sentences and turn each sentence into a Bullet
    allBullets = []
    for summary in summaries:
        publisher = summary.publisher
        curBullets = getSentencesFromRawText(summary.text)
        for bulletText in curBullets:
            allBullets.append(Bullet(bulletText, publisher))
    return allBullets
def getFinalClusters(allBullets):
    # Greedy clustering: compare each bullet against the first bullet of every
    # existing cluster and join the first similar cluster, otherwise start a new one
    output = [[allBullets[0]]]
    for i in range(1, len(allBullets)):
        cur = allBullets[i]
        foundSimilarInstance = False
        for j in range(len(output)):
            if areBulletsSimilar(cur.text, output[j][0].text):
                foundSimilarInstance = True
                output[j].append(cur)
                break
        if not foundSimilarInstance:
            output.append([cur])
    return output
def getFinalOutput(clusters):
    # Keep the five largest clusters (the most widely reported stories)
    return sorted(clusters, key=len, reverse=True)[:5]
def getData():
    allSummaries = getSummarizedArticles()
    allBullets = getAllBullets(allSummaries)
    clusters = getFinalClusters(allBullets)
    finalOutput = getFinalOutput(clusters)
    data = []
    for element in finalOutput:
        publishers = []
        for subElement in element:
            publishers.append(subElement.publisher)
        headline = {
            # Score: percentage of distinct publishers (relative to 31) that carried this bullet
            'score': f"""{round((len(set(publishers)) / 31) * 100, 1)}%""",
            'text': element[0].text,
            'publishers': list(set(publishers)),
        }
        data.append(headline)
    return data
def sendData():
    # Serialize the headline data, print it, and write it to output.json
    data = getData()
    jsonString = json.dumps(data, indent=2)
    print(jsonString)
    file_name = 'output.json'
    with open(file_name, 'w') as json_file:
        json.dump(data, json_file, indent=2)

sendData()
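# A sketch of the JSON that sendData() writes, assuming a few illustrative
# publisher files such as "cnn.txt", "bbc.txt", and "reuters.txt" sit in the
# articles/ folder; the headline text and values below are hypothetical
# placeholders, not real output (3 of 31 publishers gives a score of 9.7%):
#
# [
#   {
#     "score": "9.7%",
#     "text": "Example headline sentence shared by several outlets.",
#     "publishers": ["cnn", "bbc", "reuters"]
#   },
#   ...
# ]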