Spaces:

Startup-Exchange
/

NewsBro

Sleeping

File size: 5,390 Bytes
from __future__ import unicode_literals, print_function
import json
import os
import nltk
import tensorflow as tf
import tensorflow_hub as hub
from nltk.tokenize import word_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from transformers import pipeline
from spacy.lang.en import English
# Fetch the NLTK "punkt" tokenizer data at import time
# (presumably required by word_tokenize in getNumTokens -- verify).
nltk.download('punkt')

# Token budget (as counted by getNumTokens) above which summarizeArticle
# first shrinks an article with LexRank before handing it to BART.
MAX_TOKENS = 880
# Starting words-per-sentence estimate used by summarizeArticle to turn
# MAX_TOKENS into a LexRank sentence count.
MIN_WORD_PER_SENTENCE = 15
# Default max_length / min_length forwarded to the summarization pipeline by bart().
SUMMARY_MAX_LENGTH = 240
SUMMARY_MIN_LENGTH = 30

# Both models are loaded once at import time and shared by the functions below.
# `embed` is called in areBulletsSimilar; `summarizer` is called in bart().
# NOTE: lexRank() defines a local variable also named `summarizer`, which
# shadows this module-level pipeline inside that function only.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

class Bullet:
    """One sentence of a summary together with the publisher it came from.

    Attributes:
        text: The bullet's sentence text.
        publisher: Name of the publisher the sentence is attributed to.
    """

    # Class-level fallbacks; __init__ always overrides both on the instance.
    publisher = "NewsBroInc."
    text = ""

    def __init__(self, text, publisher):
        self.text, self.publisher = text, publisher

    def __str__(self):
        # Rendered as "<publisher>: <text>".
        return "{}: {}".format(self.publisher, self.text)

class Summary:
    """The summarized text of one article and the publisher it belongs to.

    Attributes:
        text: The summary text.
        publisher: Name of the publisher of the summarized article.
    """

    # Class-level fallbacks; __init__ always overrides both on the instance.
    publisher = "NewsBroInc."
    text = ""

    def __init__(self, text, publisher):
        self.text, self.publisher = text, publisher


def getNumTokens(article):
    """Return the number of tokens NLTK's ``word_tokenize`` finds in ``article``."""
    tokens = word_tokenize(article)
    return len(tokens)

def lexRank(article, sentenceCount):
    """Extract the top-ranked sentences of ``article`` with LexRank.

    Args:
        article: Plain text to shorten.
        sentenceCount: Number of sentences requested from the LexRank summarizer.

    Returns:
        The selected sentences joined into one string with single spaces.
    """
    # Parse the raw text into a sumy document using the English tokenizer.
    document = PlaintextParser.from_string(article, Tokenizer("english")).document

    ranker = LexRankSummarizer()
    selected = ranker(document, sentenceCount)

    return " ".join(str(picked) for picked in selected)


def bart(article, maxLength=SUMMARY_MAX_LENGTH, minLength=SUMMARY_MIN_LENGTH):
    """Run the module-level summarization pipeline on ``article``.

    Args:
        article: Text to summarize.
        maxLength: Forwarded to the pipeline as ``max_length``.
        minLength: Forwarded to the pipeline as ``min_length``.

    Returns:
        Whatever the pipeline returns; callers in this file read
        ``result[0]['summary_text']``.
    """
    # Sampling is disabled so generation does not depend on random draws.
    generationOptions = {
        "max_length": maxLength,
        "min_length": minLength,
        "do_sample": False,
    }
    return summarizer(article, **generationOptions)


def getArticles():
    """Load every ``.txt`` file directly inside the ``articles`` folder.

    Returns:
        A dict mapping each file name without its ``.txt`` suffix (used as the
        publisher name) to the file's full UTF-8 decoded content.
    """
    articlesDir = "articles"
    contentsByPublisher = {}

    for entry in os.listdir(articlesDir):
        fullPath = os.path.join(articlesDir, entry)

        # Skip sub-directories and anything that is not a .txt file.
        if not (os.path.isfile(fullPath) and entry.endswith(".txt")):
            continue

        with open(fullPath, 'r', encoding='utf-8') as handle:
            # Drop the 4-character ".txt" suffix to obtain the publisher key.
            contentsByPublisher[entry[:-4]] = handle.read()

    return contentsByPublisher

def summarizeArticle(article):
    """Summarize ``article`` with BART, pre-shrinking it with LexRank if needed.

    While the text holds more than ``MAX_TOKENS`` tokens, LexRank is re-run on
    the original article with a progressively smaller sentence budget
    (``MAX_TOKENS`` divided by an increasing words-per-sentence estimate).

    Args:
        article: Full article text.

    Returns:
        The result of ``bart`` on the (possibly shortened) text.
    """
    lexRankedArticle = article
    numTokens = getNumTokens(article)
    i = 0
    while numTokens > MAX_TOKENS:
        # Use an integer sentence count and never request fewer than one
        # sentence: a budget below 1 makes LexRank return nothing, which
        # would send an empty string to BART.
        numSentences = max(1, MAX_TOKENS // (MIN_WORD_PER_SENTENCE + i))
        lexRankedArticle = lexRank(article, numSentences)
        numTokens = getNumTokens(lexRankedArticle)
        if numSentences == 1:
            # Cannot shrink below a single sentence; pass on what we have
            # even if that one sentence is still over the token budget.
            break
        i += 1
    return bart(lexRankedArticle)
    
def getSummarizedArticles():
    """Summarize every article returned by ``getArticles``.

    Returns:
        A list of ``Summary`` objects, one per article, in the iteration order
        of the articles dict. Each holds the ``summary_text`` of the first
        summarization result and the article's publisher key.
    """
    return [
        Summary(summarizeArticle(body)[0]['summary_text'], publisher)
        for publisher, body in getArticles().items()
    ]

def areBulletsSimilar(sentence1, sentence2):
    """Decide whether two sentences are similar according to their embeddings.

    Each sentence is embedded separately with the module-level ``embed``
    model; the score is the sum of the element-wise products of the two
    embeddings (their dot product).

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        A truthy value when the score is strictly greater than 0.5.
    """
    firstVector = embed([sentence1])
    secondVector = embed([sentence2])

    elementwiseProduct = tf.multiply(firstVector, secondVector)
    score = tf.reduce_sum(elementwiseProduct).numpy()

    threshold = 0.5
    return score > threshold

def getSentencesFromRawText(input_text):
    """Split ``input_text`` into sentences with spaCy's rule-based sentencizer.

    Args:
        input_text: Raw text to split.

    Returns:
        A list of sentence strings, each stripped of surrounding whitespace.
    """
    # Blank English pipeline with only the sentencizer component.
    nlp = English()
    nlp.add_pipe('sentencizer')

    # Process the text once, after the sentencizer is in place. An earlier
    # pass made before adding the component had no sentence boundaries and
    # its result was discarded, so it was pure overhead.
    doc = nlp(input_text)
    return [sent.text.strip() for sent in doc.sents]
    
def getAllBullets(summaries):
    """Break each summary into sentence-level ``Bullet`` objects.

    Args:
        summaries: Iterable of objects exposing ``text`` and ``publisher``.

    Returns:
        A flat list of ``Bullet`` objects, one per sentence, in summary order
        and then sentence order, each tagged with its summary's publisher.
    """
    bullets = []
    for item in summaries:
        source = item.publisher
        bullets.extend(
            Bullet(sentence, source)
            for sentence in getSentencesFromRawText(item.text)
        )
    return bullets


def getFinalClusters(allBullets):
    """Greedily group bullets into clusters of similar sentences.

    Each bullet is compared, in order, against the first member of every
    existing cluster and joins the first cluster whose representative is
    similar according to ``areBulletsSimilar``. If none matches, the bullet
    starts a new cluster.

    Args:
        allBullets: Sequence of objects exposing a ``text`` attribute.

    Returns:
        A list of clusters (lists of bullets) in creation order. An empty
        input yields an empty list instead of raising ``IndexError``.
    """
    clusters = []
    for bullet in allBullets:
        for cluster in clusters:
            # Only the cluster's first member acts as its representative.
            if areBulletsSimilar(bullet.text, cluster[0].text):
                cluster.append(bullet)
                break
        else:
            # No existing cluster matched (or there are none yet).
            clusters.append([bullet])

    return clusters

def getFinalOutput(clusters):
    """Return up to five of the largest clusters, biggest first.

    Args:
        clusters: List of clusters (sized containers).

    Returns:
        A new list with at most five clusters ordered by descending length.
        Clusters of equal length appear in the reverse of their input order,
        because a stable ascending sort is reversed as a whole.
    """
    ascending = sorted(clusters, key=len)
    return ascending[::-1][:5]

def getData():
    """Run the whole pipeline and build the headline records.

    Articles are summarized, split into bullets, clustered by similarity and
    reduced to the top clusters; each remaining cluster becomes one record.

    Returns:
        A list of dicts with the keys ``'score'``, ``'text'`` and
        ``'publishers'``. ``'score'`` is the number of distinct publishers in
        the cluster divided by 31, as a percentage string rounded to one
        decimal; ``'text'`` is the text of the cluster's first bullet;
        ``'publishers'`` lists the distinct publishers.
    """
    bullets = getAllBullets(getSummarizedArticles())
    topClusters = getFinalOutput(getFinalClusters(bullets))

    data = []
    for cluster in topClusters:
        publishers = [member.publisher for member in cluster]
        uniquePublishers = set(publishers)
        # 31 is a hard-coded divisor -- presumably the total number of
        # publishers being tracked; verify against the articles folder.
        share = round((len(uniquePublishers) / 31) * 100, 1)
        data.append({
            'score': f"{share}%",
            'text': cluster[0].text,
            'publishers': list(uniquePublishers),
        })
    return data

def sendData():
    """Build the headline data, print it as JSON and save it to ``output.json``.

    The same 2-space-indented JSON text is written to standard output and to
    the file ``output.json`` in the current working directory.
    """
    payload = getData()
    jsonString = json.dumps(payload, indent=2)
    print(jsonString)

    # Reuse the text serialized above rather than encoding the data again.
    with open('output.json', 'w') as json_file:
        json_file.write(jsonString)

# Module-level entry point: runs the whole pipeline whenever this file is
# executed or imported (there is no __main__ guard).
sendData()