Upload 10 files

- model_weights/best_lda_model.joblib +3 -0
- serviceaccount/gsheets-upload-403705-efeef293c71f.json +13 -0
- src/__pycache__/data_preprocessing.cpython-311.pyc +0 -0
- src/__pycache__/data_retrieval.cpython-311.pyc +0 -0
- src/__pycache__/gsheets.cpython-311.pyc +0 -0
- src/__pycache__/summarizer.cpython-311.pyc +0 -0
- src/data_preprocessing.py +122 -0
- src/data_retrieval.py +114 -0
- src/gsheets.py +33 -0
- src/summarizer.py +78 -0
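
Taken together, the upload is a scrape → summarize → topic-model → upload pipeline, but the entry point that wires the modules together (e.g. the Space's app.py) is not among these files. A minimal driver sketch under that assumption, with src/ importable and placeholder values for the OpenAI key env var and model name:

import os

from src.data_retrieval import scrape
from src.summarizer import summarize
from src.data_preprocessing import lda
from src.gsheets import upload_csv_to_new_worksheet

# Hypothetical end-to-end wiring; the Space's real entry point is not part of this upload.
raw_json = scrape(num_reddit_posts=10, num_news_articles=10, num_youtube_videos=5)     # writes data/raw.json
summarize(raw_json, gpt_key=os.environ["OPENAI_API_KEY"], model_name="gpt-3.5-turbo")  # writes data.csv
topics_str, histogram_fig, barchart_fig = lda(raw_json)                                 # loads the LFS model weights
print(upload_csv_to_new_worksheet(topics_str))                                          # pushes data.csv and the topics to Google Sheets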
model_weights/best_lda_model.joblib
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62cfd988597b1f33b832b416cb6741efe3defc297901a5928a3e7ce341153cc6
+size 59652
serviceaccount/gsheets-upload-403705-efeef293c71f.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "type": "service_account",
+  "project_id": "gsheets-upload-403705",
+  "private_key_id": "efeef293c71f53fab4f01876a906ba0d41c9c3b2",
+  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDBAA0A3H3heIQU\no2nE3DeZZduUZ/x5Xft7LZ+wLlOXecYkm9Fhs3uo5v4ljljZOcsycCQ8q0UN/iIV\n9/zSaXevEGAwGLFA/rATdS4bLVf+60eCvMS4J7zBqpbJOMk3lzgkMkcIqYMEf0wb\nN0itqFS39WLFi3E7YyCGutkfkjdRiHBhJvryvZo9IBnNGUR59BzyXY5euIChpTwC\nq81bgK1oUgoK5xuPmc0JNVGVQDJwRRCWauR3z0o+IdYTxa/RMP+I0tl+wIYhoWP+\n2Rvqnjg8MvBwb+aB6uuuKoHUY3Id5Kx/ALFtp9ad+HoU8bYDnLcIiJzHgSNt9zdb\nVKggS4Q1AgMBAAECggEAAX6EqXn9UCvYTQxhuTIP8FtfIo4PUoHkfJh7/y6xFKcW\nM6+UCD84pyYNyYz7TTnnYDgXfRAXDvEszWH2Lmf2vYafiq/I64e7euoyKwTrPYsP\nxw+CTGLq0SPqk37McVg8wk/e5JWNm7uxrfsmpYH8LXtQl523eqbyplopM2WmkW8u\nr1J9MX6hCV5IC77EueovFDHmaAnf72Ud+JLcvFImzi0hd7dxU9sLMcg7uItCF6Fl\njh+mZGA2jJQxNQUT4C4tSRCXkm6C8pSN3WLdcX4I/X/LA6QUanrg9pbu379BH7wp\nvjhUSvZeG9+NJXWuQGzamNrKLIPZJBEzB0zREVQaDwKBgQD7YOp3BnwPfzO6L6Lq\napl/hNKeBf5pCb/0QL0Hq9UoItBJqX4Pep0dJrbMZaesApG2+RsaN5tz+GLm7SzY\n3IH9Sq2F4HaM5JCKlwmye5wHFECqSs7ktpioBaljaEpYuil4wRbzWHnuP/4fjjUo\nU0knVzWIzTzn7GoJsguY0mNX/wKBgQDEjGJDoTmCcj/c3mPPagVg1OaaRMu1WfqC\nv78Tr51ivbRisktz+smlksjiBbOBaUcA8dTXcKRGqp3wP8PAJi+9pJodl5NcjzGI\nO2gQ/HW9y1wb/qzyfNQETEDOz0Ke9XvWPGetAV13kGhVlmFuxiotbPW5FwrcKXLz\nKZoMvINDywKBgQCaXOUdugmsqnvlNSNht5wSxklfaGbVsXsCTk7FyyrVvqsQ0Nfs\nQWsBX6iY00OnSNyZ81ZFPyhiioCRNct4T9Ay7gyoTTH/SsvHjwARbf5eCn27FLz/\njXEonHFr7brZyVd2I3woaohVWU5/qh/SZ3JgihkBrKZd9LsYwRCGA4ulmQKBgBkI\ndD9+2k9F8+JSpM23CCZUF2bQmk1nv2NFvrVoKZh45u+nG7sS1vnynwlChqFV4kg5\nhM1HuHSTqHf/9xOTCYOS4loggxFH35wlTNTVAr4Al6OtJSPhSDOf7qUoeqi6RWJ8\n4QuE3/2pc9BqzdAJBzgv54ACckymLtDPnKJApEtPAoGAewTD9XtIRL9FxN6i3JSJ\nafALdnZ5ykrN5V2II95x+2P1hHj1mlzDdQ6LOmOOlsivN4JV3HynYLeITLmNb9HC\no7JAKnUxY014knDzhypQvAtv0p3NcE9j2rkdqAVjx5mtfkQv095cqXc5AwSGjsIh\nI30uIg5qhqTkmqTVTcRwUpM=\n-----END PRIVATE KEY-----\n",
+  "client_email": "gsheets-uploader@gsheets-upload-403705.iam.gserviceaccount.com",
+  "client_id": "102153694664172854211",
+  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+  "token_uri": "https://oauth2.googleapis.com/token",
+  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gsheets-uploader%40gsheets-upload-403705.iam.gserviceaccount.com",
+  "universe_domain": "googleapis.com"
+}
src/__pycache__/data_preprocessing.cpython-311.pyc
ADDED
Binary file (7.49 kB).
src/__pycache__/data_retrieval.cpython-311.pyc
ADDED
Binary file (6.55 kB).
src/__pycache__/gsheets.cpython-311.pyc
ADDED
Binary file (2.2 kB).
src/__pycache__/summarizer.cpython-311.pyc
ADDED
Binary file (3.12 kB).
src/data_preprocessing.py
ADDED
@@ -0,0 +1,122 @@
+import json
+import nltk
+nltk.download('punkt')
+import re
+from sklearn.feature_extraction.text import CountVectorizer
+from joblib import load
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+# import seaborn as sns
+# import matplotlib.pyplot as plt
+
+
+nltk.download('stopwords')
+stop_words = set(nltk.corpus.stopwords.words('english'))
+
+def tokenize(text):
+    wordstoremove = ['Thomas', 'thing', 'quite', 'exist', 'live', 'things', 'you\'re', 'we\'ll', 'really', 'right',
+                     'said', 'right', 'refresh', 'realized', 'realize', 'wrong', 'means', 'stuff', 'wants', 'like',
+                     'going', 'exactly', 'feel', 'probably', 'likely', 'likes', 'thank', 'oopsie', 'rightfully', 'paul', '23andme', 'didn', 'know', 'just', 'really', 'able', 'actually', 'comes', 'does', 'left']
+    tokens = [word for word in nltk.word_tokenize(text) if (len(word) > 3 and len(word.strip('Xx/')) > 2 and len(re.sub(r'\d+', '', word.strip('Xx/'))) > 3) and word not in wordstoremove]
+    tokens = map(str.lower, tokens)
+    return tokens
+
+def lda(input_file):
+
+    with open(input_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    df = pd.DataFrame(columns=["title", "url", "source", "text"])
+
+    dfs_to_concat = []
+    for source, articles in data.items():
+        for article in articles:
+            new_df = pd.DataFrame({
+                "title": [article["title"]],
+                "url": [article["url"]],
+                "source": [source],
+                "text": [article["text"]]
+            })
+
+            dfs_to_concat.append(new_df)
+    df = pd.concat([df] + dfs_to_concat, ignore_index=True)
+
+
+    vectorizer_count = CountVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.50, max_features=500, lowercase=False, ngram_range=(1,2))
+    countidf_vectors = vectorizer_count.fit_transform(df.text)
+
+    feature_names = vectorizer_count.get_feature_names_out()
+
+    lda_model = load('model_weights/best_lda_model.joblib')
+    W1 = lda_model.fit_transform(countidf_vectors)
+    H1 = lda_model.components_
+
+
+    num_words = 15
+
+    vocab = np.array(feature_names)
+
+    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_words-1:-1]]
+    topic_words = ([top_words(t) for t in H1])
+    topics = [' '.join(t) for t in topic_words]
+    topics_str = '\n\n'.join(topics)
+
+    histo, barchart = visualize(topics, df, W1, H1, lda_model, vectorizer_count)
+    print("done")
+    return topics_str, histo, barchart
+
+def visualize(topics, df, W1, H1, lda_model, vectorizer):
+    # label each document with a topic
+    colnames = ["Topic" + str(i+1) for i in range(lda_model.n_components)]
+    docnames = df['title']
+
+    df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)
+    significant_topic = np.argmax(df_doc_topic.values, axis=1)
+
+    # histogram of common topics
+    df_doc_topic['dominant_topic'] = significant_topic + 1
+    histogram_fig, histogram_ax = plt.subplots()
+    df_doc_topic['dominant_topic'].hist(bins=7, ax=histogram_ax)
+    histogram_ax.set_title('Histogram of Dominant Topics')
+
+    # words of each topic
+    fig, axes = plt.subplots(2, 4, figsize=(30, 15), sharex=True)
+    axes = axes.flatten()
+    for topic_idx, topic in enumerate(lda_model.components_):
+        top_features_ind = topic.argsort()[:-10 - 1:-1]
+        top_features = [vectorizer.get_feature_names_out()[i] for i in top_features_ind]
+        weights = topic[top_features_ind]
+
+        ax = axes[topic_idx]
+        ax.barh(top_features, weights, height=0.7)
+        ax.set_title(f'Topic {topic_idx + 1}')
+        ax.invert_yaxis()
+
+    return histogram_fig, fig
+
+
+
+
+# df_doc_topic
+# print("Perplexity: ", lda_model.perplexity(countidf_vectors))
+
+
+
+
+# sns.heatmap(df_doc_topic.corr())
+# plt.show()
+
+
+# fig, axes = plt.subplots(2, 4, figsize=(30, 15), sharex=True)
+# axes = axes.flatten()
+# for topic_idx, topic in enumerate(best_lda_model.components_):
+#     top_features_ind = topic.argsort()[:-10 - 1:-1]
+#     top_features = [vectorizer_count.get_feature_names_out()[i] for i in top_features_ind]
+#     weights = topic[top_features_ind]
+
+#     ax = axes[topic_idx]
+#     ax.barh(top_features, weights, height=0.7)
+#     ax.set_title(f'Topic {topic_idx + 1}')
+#     ax.invert_yaxis()
+# plt.show()
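
One detail worth flagging in data_preprocessing.py: lda() calls fit_transform on the loaded model over a freshly fitted CountVectorizer, so best_lda_model.joblib effectively supplies tuned hyperparameters (e.g. n_components) rather than fixed topics. A hedged sketch of the alternative, assuming a fitted vectorizer had also been persisted (no such artifact is part of this upload):

from joblib import load

def topics_for(texts):
    """Apply the pre-trained topics to new documents without re-fitting."""
    vectorizer = load('model_weights/count_vectorizer.joblib')  # hypothetical artifact, not in this upload
    lda_model = load('model_weights/best_lda_model.joblib')
    X = vectorizer.transform(texts)   # reuse the training vocabulary
    return lda_model.transform(X)     # document-topic distributions only, no re-fit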
src/data_retrieval.py
ADDED
@@ -0,0 +1,114 @@
+import os
+import json
+import praw
+import requests
+import datetime
+import http.client
+from bs4 import BeautifulSoup
+from youtube_search import YoutubeSearch
+from youtube_transcript_api import YouTubeTranscriptApi
+from pytube import YouTube
+from dotenv import load_dotenv
+from urllib.parse import quote
+
+load_dotenv()
+
+def get_reddit_data(num_posts):
+    clientSecretKey = 'u8gnI-3_I70MZ0H52Wg-RYAytkWWeQ'
+    reddit = praw.Reddit(client_id="kMolVsEMMe0041y37FnL_Q",
+                         client_secret=clientSecretKey,
+                         user_agent="Scraper")
+    subreddit = reddit.subreddit("technews")
+    posts = []
+
+    for post in subreddit.hot(limit=num_posts):
+        url = post.url
+        try:
+            html_doc = requests.get(url).text
+            soup = BeautifulSoup(html_doc, 'html.parser')
+            for script_or_style in soup(["script", "style"]):
+                script_or_style.decompose()
+            text = ' '.join(soup.stripped_strings)
+            posts.append({'title': post.title, 'url': post.url, 'text': text})
+        except Exception:
+            continue
+    return posts
+
+
+
+# old newsapi section
+# def get_news_data(query, num_articles):
+#     conn = http.client.HTTPSConnection("newsapi.org")
+#     fromDate = (datetime.datetime.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
+#     headers = {'Authorization': '0db7ab8d26b34533b00be11af29b8c73','User-Agent': 'Andys News Agent'}
+#     encoded_query = quote(query)
+#     conn.request("GET", f"/v2/everything?q={encoded_query}&from={fromDate}&pageSize={num_articles}", headers=headers)
+#     res = conn.getresponse().read()
+#     response_json = json.loads(res)
+#     articles = response_json.get('articles', [])
+#     cleaned_articles = [{'title': a['title'], 'url': a['url'], 'text': a['content']} for a in articles]
+
+#     return cleaned_articles
+
+def get_full_text(url):
+    response = requests.get(url)
+    response.raise_for_status()  # Check if the request was successful
+    soup = BeautifulSoup(response.text, 'html.parser')
+    paragraphs = soup.find_all('p')  # Assume the text is in <p> tags
+    text = ' '.join([p.get_text() for p in paragraphs])
+    return text
+
+def get_news_data(query, num_articles):
+    conn = http.client.HTTPSConnection("newsapi.org")
+    fromDate = (datetime.datetime.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
+    headers = {'Authorization': '0db7ab8d26b34533b00be11af29b8c73', 'User-Agent': 'Andys News Agent'}
+    encoded_query = quote(query)
+    conn.request("GET", f"/v2/everything?q={encoded_query}&from={fromDate}&pageSize={num_articles}", headers=headers)
+    res = conn.getresponse().read()
+    response_json = json.loads(res)
+    # print(json.dumps(response_json, indent=4))
+    articles = response_json.get('articles', [])
+    cleaned_articles = []
+    for a in articles:
+        try:
+            full_text = get_full_text(a['url'])
+        except Exception as e:
+            print(f"Failed to retrieve full text for {a['url']}: {e}")
+            full_text = a['content']  # Fall back to the snippet if the scrape fails
+        cleaned_articles.append({'title': a['title'], 'url': a['url'], 'text': full_text})
+
+    return cleaned_articles
+
+def get_youtube_data(query, max_results):
+    search = YoutubeSearch(query, max_results=max_results)
+    results = search.to_dict()
+    videos = []
+
+    for result in results:
+        video_id = result['id']
+        yt = YouTube(f"https://www.youtube.com/watch?v={video_id}")
+        try:
+            transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
+            transcript = " ".join([entry['text'] for entry in transcript_data])
+        except Exception:
+            transcript = "Transcript not available"
+        videos.append({'title': yt.title, 'url': yt.watch_url, 'text': transcript})
+
+    return videos
+
+def scrape(num_reddit_posts, num_news_articles, num_youtube_videos):
+    reddit_data = get_reddit_data(num_reddit_posts)
+    news_data = get_news_data('artificial intelligence', num_news_articles)
+    youtube_data = get_youtube_data('tech news', num_youtube_videos)
+    all_data = {
+        'reddit': reddit_data,
+        'news': news_data,
+        'youtube': youtube_data
+    }
+
+    filename = 'data/raw.json'
+
+    with open(filename, 'w', encoding='utf-8') as f:
+        json_string = json.dumps(all_data, ensure_ascii=False, indent=4)
+        f.write(json_string)
+    return filename
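
A minimal sketch for exercising data_retrieval.py on its own, assuming the data/ directory already exists (scrape() writes data/raw.json without creating the folder) and that the hard-coded Reddit and NewsAPI credentials are still valid:

from src.data_retrieval import scrape, get_youtube_data

# Small pull from every source; scrape() returns the path it wrote.
print(scrape(num_reddit_posts=5, num_news_articles=5, num_youtube_videos=3))  # data/raw.json

# Or query a single source directly, e.g. three 'tech news' video transcripts.
for video in get_youtube_data('tech news', 3):
    print(video['title'], video['url'])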
src/gsheets.py
ADDED
@@ -0,0 +1,33 @@
+import gspread
+from gspread_dataframe import set_with_dataframe
+import pandas as pd
+from oauth2client.service_account import ServiceAccountCredentials
+from datetime import datetime
+
+def upload_csv_to_new_worksheet(topic_string):
+    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
+
+    # link service account with roles set and api enabled
+    creds = ServiceAccountCredentials.from_json_keyfile_name('serviceaccount/gsheets-upload-403705-efeef293c71f.json', scope)
+    client = gspread.authorize(creds)
+
+    spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442')
+
+    # create a new "sheet" in the spreadsheet, name it the current date
+    current = datetime.now().strftime("%m/%d/%Y_%H:%M:%S")
+    worksheet = spreadsheet.add_worksheet(title=current, rows="100", cols="50")
+
+    data = pd.read_csv('data.csv')
+    set_with_dataframe(worksheet, data)
+
+    # do the same for topic model words
+    topic_sheet_name = f"{current}_topics"
+    topic_worksheet = spreadsheet.add_worksheet(title=topic_sheet_name, rows="100", cols="1")
+
+    # split by "\n\n" and write each topic to the new worksheet
+    topics = topic_string.split("\n\n")
+    for i, topic in enumerate(topics, start=1):
+        topic_worksheet.update_cell(i, 1, topic)
+
+    return f"Successfully uploaded worksheets: {current} and {topic_sheet_name} to 'https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442'"
+
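
The implicit contract for gsheets.py is that data.csv already exists in the working directory (written by summarizer.py) and that topic_string holds topics separated by blank lines, which is what lda() returns. A minimal call sketch with made-up topic words:

from src.gsheets import upload_csv_to_new_worksheet

# Two example topics separated by a blank line, matching the '\n\n' split above.
example_topics = "openai model training data compute\n\napple iphone chip battery display"
print(upload_csv_to_new_worksheet(example_topics))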
src/summarizer.py
ADDED
@@ -0,0 +1,78 @@
+import os
+import json
+import openai
+import pandas as pd
+from dotenv import load_dotenv
+load_dotenv()
+
+
+def summarize(filename, gpt_key, model_name):
+    openai.api_key = gpt_key
+
+    # Open the combined raw-data json file
+    with open(filename, 'r', encoding='utf-8') as f:
+        allDataFile = json.load(f)
+
+    finaldf = pd.DataFrame()
+
+    for source, articles in allDataFile.items():
+        for article in articles:
+
+            title = article['title']
+            text = article['text']
+            combined_text = 'title: ' + title + '\n' + text
+
+            try:
+                # GPT-3.5 API for summarization
+                response = openai.ChatCompletion.create(
+                    model=model_name,
+                    messages=[{
+                        "role": "system",
+                        "content": "You are a helpful assistant."
+                    }, {
+                        "role":
+                        "user",
+                        "content":
+                        f"Please summarize this news article text or youtube video transcript in four sentences or less. If no article/transcript is present, or it is unclear what the transcript is talking about, output 'Unable to summarize.'. {combined_text} "
+                    }])
+
+                summarizedData = response['choices'][0]['message']['content']
+                print(f"SUMMARY: {summarizedData} \n\n")
+
+                # GPT-3.5 API for talking points from the generated summary
+                follow_up = openai.ChatCompletion.create(
+                    model=model_name,
+                    messages=[{
+                        "role": "system",
+                        "content": "You are a helpful assistant."
+                    }, {
+                        "role":
+                        "user",
+                        "content":
+                        f"Using this article, give me five sequential talking points that I can use to make a shortform video. Do not use more than 100 words. If the summarized article says 'Unable to summarize,' output 'No talking points available'. {summarizedData}"
+                    }])
+
+                talking_pointsData = follow_up['choices'][0]['message']['content']
+                print(f"TALKING POINTS: {talking_pointsData} \n\n")
+
+                articleinfo = pd.DataFrame.from_records([{
+                    "title":
+                    article["title"],
+                    "source":
+                    source,
+                    "url":
+                    article["url"],
+                    "summarized_text":
+                    summarizedData,
+                    "talking_points":
+                    talking_pointsData
+                }])
+                finaldf = pd.concat([finaldf, articleinfo], ignore_index=True)
+
+            except openai.error.InvalidRequestError as e:
+                print(f"An error occurred: {e}")
+                continue
+
+    csvname = 'data.csv'
+    finaldf.to_csv(csvname, index=False)
+    return csvname
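
Both summarize() and lda() iterate the same structure that scrape() writes: a mapping from source name to a list of records with title, url, and text keys. An illustrative fixture (values made up) that either function would accept:

import json

# Illustrative fixture matching the shape written by scrape(); values are made up.
sample = {
    "reddit": [
        {"title": "Example post", "url": "https://example.com/post", "text": "Full scraped page text..."}
    ],
    "news": [],
    "youtube": [
        {"title": "Example video", "url": "https://www.youtube.com/watch?v=xyz", "text": "Transcript not available"}
    ]
}

with open("data/raw.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=4)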