|
|
|
from typing import List
|
from collections import namedtuple |
|
import random |
|
import requests |
|
import json |
|
import re |
|
|
|
from datetime import datetime as dt |
|
from codetiming import Timer |
|
import streamlit as st |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from matplotlib import pyplot as plt |
|
|
|
from digestor import Digestor |
|
from source import Source |
|
from scrape_sources import NPRLite, CNNText, stub |
|
|
|
|
|
|
|
def initialize(limit, use_cache=True):
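    """Instantiate the news sources, pull up to `limit` article stubs across them,
    run NER on each headline, and cluster the stubs by shared entities.
    Returns (article_dict keyed by headline, entity-keyed clusters)."""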
|
    clusters: dict[str, List[namedtuple]] = dict()
|
|
|
|
|
    sources: List[Source] = []
|
|
|
|
|
sources.append(NPRLite( |
|
'npr', |
|
'https://text.npr.org/1001', |
|
'sshleifer/distilbart-cnn-12-6', |
|
|
|
'dbmdz/bert-large-cased-finetuned-conll03-english' |
|
)) |
|
sources.append(CNNText( |
|
'cnn', |
|
'https://lite.cnn.com', |
|
'sshleifer/distilbart-cnn-12-6', |
|
|
|
'dbmdz/bert-large-cased-finetuned-conll03-english' |
|
)) |
|
|
|
|
|
|
|
cluster_data: List[namedtuple('article', ['link','hed','entities', 'source'])] |
|
    article_dict: dict[str, namedtuple]
|
|
|
|
|
|
|
|
|
cluster_data = [] |
|
article_meta = namedtuple('article_meta',['source', 'count']) |
|
cluster_meta : List[article_meta] = [] |
|
for data_source in sources: |
|
if limit is not None: |
|
|
|
c_data, c_meta = data_source.retrieve_cluster_data(limit//len(sources)) |
|
else: |
|
c_data, c_meta = data_source.retrieve_cluster_data() |
|
cluster_data.append(c_data) |
|
cluster_meta.append(article_meta(data_source.source_name, c_meta)) |
|
st.session_state[data_source.source_name] = f"Number of articles from source: {c_meta}" |
|
|
|
    # Flatten the per-source lists into a single list of article stubs.
    cluster_data = [article for source_articles in cluster_data for article in source_articles]
|
|
|
|
|
for tup in cluster_data: |
|
|
|
|
|
|
|
perform_ner(tup, cache=use_cache) |
|
generate_clusters(clusters, tup) |
|
st.session_state['num_clusters'] = f"""Total number of clusters: {len(clusters)}""" |
|
|
|
|
|
|
|
|
|
    # Index the article stubs by headline.
    article_dict = {article.hed: article for article in cluster_data}
|
|
|
|
|
return article_dict, clusters |
|
|
|
|
|
|
|
def perform_ner(tup: namedtuple('article', ['link', 'hed', 'entities', 'source']), cache=True):
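    """Run NER on the article headline via the inference API and append each
    recognized entity to `tup.entities` in place."""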
|
with Timer(name="ner_query_time", logger=None): |
|
result = ner_results(ner_query( |
|
            {
                "inputs": tup.hed,
                # The Hugging Face Inference API takes cache control under "options".
                "options": {
                    "use_cache": cache,
                },
            }
|
)) |
|
for i in result: |
|
tup.entities.append(i) |
|
|
|
|
|
def ner_query(payload): |
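    """POST `payload` to the NER inference endpoint and return the parsed JSON response."""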
|
data = json.dumps(payload) |
|
    response = requests.post(NER_API_URL, headers=headers, data=data)
    return response.json()
|
|
|
|
|
|
|
def generate_clusters(
        the_dict: dict,
        tup: namedtuple('article_stub', ['link', 'hed', 'entities', 'source'])
        ) -> None:
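    """Append `tup` to the cluster list for each of its entities, creating new
    clusters as needed. Mutates `the_dict` in place."""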
|
for entity in tup.entities: |
|
|
|
if entity not in the_dict: |
|
the_dict[entity] = [] |
|
|
|
the_dict[entity].append(tup) |
|
|
|
|
|
def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]: |
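    """Bucket raw NER output into people, places, orgs, and misc, keeping only
    confident, non-subword predictions, then return a deduplicated list of
    entity strings longer than two characters."""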
|
|
|
people, places, orgs, misc = [], [], [], [] |
|
|
|
|
|
|
|
ent = 'entity' if not groups else 'entity_group' |
|
designation = 'I-' if not groups else '' |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
actions = {designation+'PER':people.append, |
|
designation+'LOC':places.append, |
|
designation+'ORG':orgs.append, |
|
designation+'MISC':misc.append |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    # Route each confident, non-subword prediction into its category list.
    for d in ner_object:
        if '#' not in d['word'] and d['score'] > NER_THRESHOLD:
            actions[d[ent]](d['word'])
|
|
|
|
|
    # Deduplicate each category and drop strings of two characters or fewer.
    ner_list = (
        [i for i in set(people) if len(i) > 2]
        + [i for i in set(places) if len(i) > 2]
        + [i for i in set(orgs) if len(i) > 2]
        + [i for i in set(misc) if len(i) > 2]
    )
|
|
|
return ner_list |
|
|
|
def show_length_graph(): |
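    """Plot original vs. summarized article lengths (in space-separated tokens)
    as a grouped bar chart. Reads the module-level `outdata` produced by
    `digestor.build_digest()`."""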
|
    labels = list(range(outdata['article_count']))
|
original_length = [outdata['summaries'][i]['original_length'] for i in outdata['summaries']] |
|
summarized_length = [outdata['summaries'][i]['summary_length'] for i in outdata['summaries']] |
|
x = np.arange(len(labels)) |
|
width = 0.35 |
|
|
|
fig, ax = plt.subplots(figsize=(14,8)) |
|
rects1 = ax.bar(x - width/2, original_length, width, color='lightgreen',zorder=0) |
|
rects2 = ax.bar(x + width/2, summarized_length, width, color='lightblue',zorder=0) |
|
|
|
rects3 = ax.bar(x - width/2, original_length, width, color='none',edgecolor='black', lw=1.25,zorder=1) |
|
rects4 = ax.bar(x + width/2, summarized_length, width, color='none',edgecolor='black', lw=1.25,zorder=1) |
|
|
|
|
|
ax.set_ylabel('Text Length') |
|
ax.set_xticks(x) |
|
ax.set_yticks([i for i in range(0,max(original_length),max(summarized_length))]) |
|
ax.set_xticklabels(labels) |
|
ax.set_xlabel('Article') |
|
|
|
plt.title('Original to Summarized Lengths in Space-Separated Tokens') |
|
|
|
st.pyplot(fig) |
|
|
|
def check_for_word_and_word(in_string): |
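    """Return the first "<word> and <word>" repetition found in `in_string`, or None."""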
|
m = re.search(r'(\w\w+)\sand\s\1', in_string) |
|
if m is not None: |
|
return m.group() |
|
return None |
|
|
|
|
|
|
|
|
|
NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english" |
|
headers = {"Authorization": f"""Bearer {st.secrets['ato']}"""} |
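
# LIMIT caps the total number of headlines pulled across all sources;
# USE_CACHE is forwarded as the inference API's `use_cache` option.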
|
|
|
LIMIT = 30 |
|
USE_CACHE = True |
|
|
|
if not USE_CACHE: |
|
print("NOT USING CACHE") |
|
if LIMIT is not None: |
|
print(f"LIMIT: {LIMIT}") |
|
|
|
|
|
digests = dict() |
|
out_dicts = [] |
|
|
|
|
|
|
|
print("Initializing....") |
|
article_dict, clusters = initialize(LIMIT, use_cache=USE_CACHE)
|
|
|
|
|
|
|
st.title("Welcome to TopicDig!") |
|
st.success("You select the topics; we summarize the relevant news and show you a digest, plus some info to help contextualize what the machine did.")
|
st.write("On the left you'll find a list of topics recently gleaned from current news headlines. TopicDig lets you assemble digests of these stories using transformers!")
|
st.warning("Enjoy, and remember: these summaries can contain a few kinds of issues, from untruths to missing attribution or topic sentences. For more on truthfulness in transformer-based language models, see https://arxiv.org/abs/2109.07958.")
|
|
|
st.subheader("How it works:")
|
st.write("""Select 1 to 3 topics from the drop-down menus and click 'Submit' to start generating your digest!""")
|
|
|
|
|
with st.expander("See extra options"): |
|
st.subheader("Refresh topics: ") |
|
    st.write("You may want to refresh the topic lists if the app loaded several hours ago or if you get no summary.")
|
|
|
if st.button("Refresh topics!"): |
|
        article_dict, clusters = initialize(LIMIT, use_cache=USE_CACHE)
|
st.subheader("Select chunk size: ") |
|
    st.write("Smaller chunks mean more of each article is included in the summary, producing a longer digest.")
|
    chunk_size = st.select_slider(label="Chunk size", options=list(range(50, 801, 50)), value=400)
|
|
|
|
|
|
|
selections = [] |
|
choices = list(clusters.keys()) |
|
choices.insert(0,'None') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.sidebar.subheader("Topics") |
|
st.sidebar.write("Here are the current news topics and the number of articles whose headlines featured those topics.") |
|
show_clusters = {i:len(clusters[i]) for i in clusters.keys()} |
|
cdf = pd.DataFrame(
    data={"Cluster": list(show_clusters.keys()), "Articles": list(show_clusters.values())}
).sort_values(by='Articles', ascending=False)
|
styler = cdf.style.hide_index() |
|
st.sidebar.write(styler.to_html(), unsafe_allow_html=True) |
|
|
|
|
|
|
|
st.session_state['dt'] = dt.now() |
|
|
|
with st.form(key='columns_in_form'): |
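    # Collect up to three topic selections; the digest is assembled when the form is submitted.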
|
cols = st.columns(3) |
|
for i, col in enumerate(cols): |
|
        selections.append(col.selectbox('Make a Selection', choices, key=i))
|
submitted = st.form_submit_button('Submit') |
|
if submitted: |
|
        selections = [choice for choice in selections if choice is not None]
|
with st.spinner(text="Creating your digest: this will take a few moments."): |
|
chosen = [] |
|
|
|
for i in selections: |
|
if i != 'None': |
|
for j in clusters[i]: |
|
if j not in chosen: |
|
chosen.append(j) |
|
|
|
|
|
|
|
            # Build the Digestor that will chunk, summarize, and assemble the chosen articles.
            digestor = Digestor(timer=Timer(), cache=USE_CACHE, stubs=chosen,
                                user_choices=selections, token_limit=1024, word_limit=chunk_size)
|
|
|
|
|
st.subheader("What you'll see:") |
|
st.write("First you'll see a list of links appear below. These are the links to the original articles being summarized for your digest, so you can get the full story if you're interested, or check the summary against the source.") |
|
st.write("In a few moments, your machine-generated digest will appear below the links, and below that you'll see an approximate word count of your digest and the time in seconds that the whole process took!") |
|
st.write("You'll also see a graph showing, for each article and summary, the original and summarized lengths.") |
|
            st.write("Finally, you will see some possible errors detected in the summaries. This area of NLP is far from perfect and still developing. Hopefully this is an interesting step along the path!")
|
digestor.digest() |
|
|
|
|
|
|
|
|
|
outdata = digestor.build_digest() |
|
|
|
if len(digestor.text) == 0: |
|
st.write("No text to return...huh.") |
|
else: |
|
st.subheader("Your digest:") |
|
st.info(digestor.text) |
|
|
|
st.subheader("Summarization stats:") |
|
col1, col2, col3 = st.columns(3) |
|
col1.metric("Digest Time", f"""{digestor.timer.timers['digest_time']:.2f}""", "seconds") |
|
col2.metric("Digest Length", str(len(digestor.text.split(" "))), 'space-sep tokens' ) |
|
col3.metric("Article Count", str(outdata['article_count']), "articles" ) |
|
|
|
st.write("Length reduction:") |
|
|
|
show_length_graph() |
|
|
|
            # Run the issue checks against the generated digest text.
            text = digestor.text
|
|
|
st.subheader("Issues: ") |
|
st.write("Repetition:") |
|
rep_check = check_for_word_and_word(text) |
|
if rep_check is not None: |
|
                st.write(f"The following phrase repeats: {rep_check}")
|
found_index = text.find(rep_check) |
|
st.write("Sample:") |
|
                st.write(text[max(found_index - 30, 0):found_index + 30])
|
else: |
|
st.write("No repetition detected.") |
|
|
|
|