ayang903 committed
Commit 93f28f9
1 Parent(s): 5b38d99

Upload 10 files
model_weights/best_lda_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62cfd988597b1f33b832b416cb6741efe3defc297901a5928a3e7ce341153cc6
+ size 59652
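The lines above are the Git LFS pointer stored in place of the 59 kB joblib artifact; the actual estimator is fetched on an LFS-enabled checkout. A minimal sketch of loading and inspecting it, assuming the file resolves to a fitted scikit-learn LatentDirichletAllocation model (as src/data_preprocessing.py expects):

    from joblib import load

    # Load the serialized LDA estimator tracked by Git LFS.
    lda_model = load('model_weights/best_lda_model.joblib')

    print(type(lda_model).__name__)      # expected: LatentDirichletAllocation
    print(lda_model.n_components)        # number of topics the model was configured with
    print(lda_model.components_.shape)   # (n_topics, vocabulary_size), present once fitted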
serviceaccount/gsheets-upload-403705-efeef293c71f.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "type": "service_account",
+ "project_id": "gsheets-upload-403705",
+ "private_key_id": "efeef293c71f53fab4f01876a906ba0d41c9c3b2",
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDBAA0A3H3heIQU\no2nE3DeZZduUZ/x5Xft7LZ+wLlOXecYkm9Fhs3uo5v4ljljZOcsycCQ8q0UN/iIV\n9/zSaXevEGAwGLFA/rATdS4bLVf+60eCvMS4J7zBqpbJOMk3lzgkMkcIqYMEf0wb\nN0itqFS39WLFi3E7YyCGutkfkjdRiHBhJvryvZo9IBnNGUR59BzyXY5euIChpTwC\nq81bgK1oUgoK5xuPmc0JNVGVQDJwRRCWauR3z0o+IdYTxa/RMP+I0tl+wIYhoWP+\n2Rvqnjg8MvBwb+aB6uuuKoHUY3Id5Kx/ALFtp9ad+HoU8bYDnLcIiJzHgSNt9zdb\nVKggS4Q1AgMBAAECggEAAX6EqXn9UCvYTQxhuTIP8FtfIo4PUoHkfJh7/y6xFKcW\nM6+UCD84pyYNyYz7TTnnYDgXfRAXDvEszWH2Lmf2vYafiq/I64e7euoyKwTrPYsP\nxw+CTGLq0SPqk37McVg8wk/e5JWNm7uxrfsmpYH8LXtQl523eqbyplopM2WmkW8u\nr1J9MX6hCV5IC77EueovFDHmaAnf72Ud+JLcvFImzi0hd7dxU9sLMcg7uItCF6Fl\njh+mZGA2jJQxNQUT4C4tSRCXkm6C8pSN3WLdcX4I/X/LA6QUanrg9pbu379BH7wp\nvjhUSvZeG9+NJXWuQGzamNrKLIPZJBEzB0zREVQaDwKBgQD7YOp3BnwPfzO6L6Lq\napl/hNKeBf5pCb/0QL0Hq9UoItBJqX4Pep0dJrbMZaesApG2+RsaN5tz+GLm7SzY\n3IH9Sq2F4HaM5JCKlwmye5wHFECqSs7ktpioBaljaEpYuil4wRbzWHnuP/4fjjUo\nU0knVzWIzTzn7GoJsguY0mNX/wKBgQDEjGJDoTmCcj/c3mPPagVg1OaaRMu1WfqC\nv78Tr51ivbRisktz+smlksjiBbOBaUcA8dTXcKRGqp3wP8PAJi+9pJodl5NcjzGI\nO2gQ/HW9y1wb/qzyfNQETEDOz0Ke9XvWPGetAV13kGhVlmFuxiotbPW5FwrcKXLz\nKZoMvINDywKBgQCaXOUdugmsqnvlNSNht5wSxklfaGbVsXsCTk7FyyrVvqsQ0Nfs\nQWsBX6iY00OnSNyZ81ZFPyhiioCRNct4T9Ay7gyoTTH/SsvHjwARbf5eCn27FLz/\njXEonHFr7brZyVd2I3woaohVWU5/qh/SZ3JgihkBrKZd9LsYwRCGA4ulmQKBgBkI\ndD9+2k9F8+JSpM23CCZUF2bQmk1nv2NFvrVoKZh45u+nG7sS1vnynwlChqFV4kg5\nhM1HuHSTqHf/9xOTCYOS4loggxFH35wlTNTVAr4Al6OtJSPhSDOf7qUoeqi6RWJ8\n4QuE3/2pc9BqzdAJBzgv54ACckymLtDPnKJApEtPAoGAewTD9XtIRL9FxN6i3JSJ\nafALdnZ5ykrN5V2II95x+2P1hHj1mlzDdQ6LOmOOlsivN4JV3HynYLeITLmNb9HC\no7JAKnUxY014knDzhypQvAtv0p3NcE9j2rkdqAVjx5mtfkQv095cqXc5AwSGjsIh\nI30uIg5qhqTkmqTVTcRwUpM=\n-----END PRIVATE KEY-----\n",
+ "client_email": "gsheets-uploader@gsheets-upload-403705.iam.gserviceaccount.com",
+ "client_id": "102153694664172854211",
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+ "token_uri": "https://oauth2.googleapis.com/token",
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gsheets-uploader%40gsheets-upload-403705.iam.gserviceaccount.com",
+ "universe_domain": "googleapis.com"
+ }
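This keyfile is what src/gsheets.py (below) feeds to ServiceAccountCredentials. A minimal sketch of the authorization step it enables, assuming the service account's client_email has been granted edit access to the target spreadsheet:

    import gspread
    from oauth2client.service_account import ServiceAccountCredentials

    scope = ["https://spreadsheets.google.com/feeds",
             "https://www.googleapis.com/auth/drive"]

    # Build credentials from the committed keyfile and authorize a gspread client.
    creds = ServiceAccountCredentials.from_json_keyfile_name(
        'serviceaccount/gsheets-upload-403705-efeef293c71f.json', scope)
    client = gspread.authorize(creds)

    # Any spreadsheet shared with the service account can now be opened by URL.
    spreadsheet = client.open_by_url(
        'https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442')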
src/__pycache__/data_preprocessing.cpython-311.pyc ADDED
Binary file (7.49 kB)

src/__pycache__/data_retrieval.cpython-311.pyc ADDED
Binary file (6.55 kB)

src/__pycache__/gsheets.cpython-311.pyc ADDED
Binary file (2.2 kB)

src/__pycache__/summarizer.cpython-311.pyc ADDED
Binary file (3.12 kB)
src/data_preprocessing.py ADDED
@@ -0,0 +1,122 @@
+ import json
+ import nltk
+ nltk.download('punkt')
+ import re
+ from sklearn.feature_extraction.text import CountVectorizer
+ from joblib import load
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ # import seaborn as sns
+
+
+ nltk.download('stopwords')
+ stop_words = set(nltk.corpus.stopwords.words('english'))
+
+ def tokenize(text):
+     # Custom tokenizer for CountVectorizer: drop short/numeric tokens and a
+     # hand-picked list of filler words, then lowercase what remains.
+     wordstoremove = ['Thomas', 'thing', 'quite', 'exist', 'live', 'things', 'you\'re', 'we\'ll', 'really', 'right',
+                      'said', 'right', 'refresh', 'realized', 'realize', 'wrong', 'means', 'stuff', 'wants', 'like',
+                      'going', 'exactly', 'feel', 'probably', 'likely', 'likes', 'thank', 'oopsie', 'rightfully', 'paul', '23andme', 'didn', 'know', 'just', 'really', 'able', 'actually', 'comes', 'does', 'left']
+     tokens = [word for word in nltk.word_tokenize(text)
+               if (len(word) > 3 and len(word.strip('Xx/')) > 2 and len(re.sub(r'\d+', '', word.strip('Xx/'))) > 3)
+               and word not in wordstoremove]
+     return [token.lower() for token in tokens]
+
+ def lda(input_file):
+     # Load the scraped corpus produced by data_retrieval.scrape().
+     with open(input_file, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     df = pd.DataFrame(columns=["title", "url", "source", "text"])
+
+     dfs_to_concat = []
+     for source, articles in data.items():
+         for article in articles:
+             new_df = pd.DataFrame({
+                 "title": [article["title"]],
+                 "url": [article["url"]],
+                 "source": [source],
+                 "text": [article["text"]]
+             })
+             dfs_to_concat.append(new_df)
+     df = pd.concat([df] + dfs_to_concat, ignore_index=True)
+
+     # Bag-of-words document-term matrix over unigrams and bigrams.
+     vectorizer_count = CountVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.50,
+                                        max_features=500, lowercase=False, ngram_range=(1, 2))
+     countidf_vectors = vectorizer_count.fit_transform(df.text)
+
+     feature_names = vectorizer_count.get_feature_names_out()
+
+     # Load the saved LDA configuration and refit it on the new document-term matrix.
+     lda_model = load('model_weights/best_lda_model.joblib')
+     W1 = lda_model.fit_transform(countidf_vectors)   # document-topic weights
+     H1 = lda_model.components_                       # topic-word weights
+
+     num_words = 15
+     vocab = np.array(feature_names)
+
+     top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_words - 1:-1]]
+     topic_words = [top_words(t) for t in H1]
+     topics = [' '.join(t) for t in topic_words]
+     topics_str = '\n\n'.join(topics)
+
+     histo, barchart = visualize(topics, df, W1, H1, lda_model, vectorizer_count)
+     print("done")
+     return topics_str, histo, barchart
+
+ def visualize(topics, df, W1, H1, lda_model, vectorizer):
+     # Label each document with a topic.
+     colnames = ["Topic" + str(i + 1) for i in range(lda_model.n_components)]
+     docnames = df['title']
+
+     df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)
+     significant_topic = np.argmax(df_doc_topic.values, axis=1)
+
+     # Histogram of dominant topics across documents.
+     df_doc_topic['dominant_topic'] = significant_topic + 1
+     histogram_fig, histogram_ax = plt.subplots()
+     df_doc_topic['dominant_topic'].hist(bins=7, ax=histogram_ax)
+     histogram_ax.set_title('Histogram of Dominant Topics')
+
+     # Bar chart of the top words in each topic.
+     fig, axes = plt.subplots(2, 4, figsize=(30, 15), sharex=True)
+     axes = axes.flatten()
+     for topic_idx, topic in enumerate(lda_model.components_):
+         top_features_ind = topic.argsort()[:-10 - 1:-1]
+         top_features = [vectorizer.get_feature_names_out()[i] for i in top_features_ind]
+         weights = topic[top_features_ind]
+
+         ax = axes[topic_idx]
+         ax.barh(top_features, weights, height=0.7)
+         ax.set_title(f'Topic {topic_idx + 1}')
+         ax.invert_yaxis()
+
+     return histogram_fig, fig
+
+
+ # df_doc_topic
+ # print("Perplexity: ", lda_model.perplexity(countidf_vectors))
+
+ # sns.heatmap(df_doc_topic.corr())
+ # plt.show()
+
+ # fig, axes = plt.subplots(2, 4, figsize=(30, 15), sharex=True)
+ # axes = axes.flatten()
+ # for topic_idx, topic in enumerate(best_lda_model.components_):
+ #     top_features_ind = topic.argsort()[:-10 - 1:-1]
+ #     top_features = [vectorizer_count.get_feature_names_out()[i] for i in top_features_ind]
+ #     weights = topic[top_features_ind]
+ #
+ #     ax = axes[topic_idx]
+ #     ax.barh(top_features, weights, height=0.7)
+ #     ax.set_title(f'Topic {topic_idx + 1}')
+ #     ax.invert_yaxis()
+ # plt.show()
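A short usage sketch for this module, assuming data/raw.json exists in the format written by src/data_retrieval.py and the repository root is on the Python path (the import path and output filenames are assumptions):

    from src.data_preprocessing import lda

    # Topic-model the scraped corpus; returns topic word lists plus two matplotlib figures.
    topics_str, histo_fig, bar_fig = lda('data/raw.json')

    print(topics_str)                         # top 15 words per topic, blank-line separated
    histo_fig.savefig('dominant_topics.png')  # histogram of dominant topics per document
    bar_fig.savefig('topic_words.png')        # bar charts of top words for each topic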
src/data_retrieval.py ADDED
@@ -0,0 +1,114 @@
+ import os
+ import json
+ import praw
+ import requests
+ import datetime
+ import http.client
+ from bs4 import BeautifulSoup
+ from youtube_search import YoutubeSearch
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from pytube import YouTube
+ from dotenv import load_dotenv
+ from urllib.parse import quote
+
+ load_dotenv()
+
+ def get_reddit_data(num_posts):
+     # Pull the hottest posts from r/technews and scrape the linked pages.
+     clientSecretKey = 'u8gnI-3_I70MZ0H52Wg-RYAytkWWeQ'
+     reddit = praw.Reddit(client_id="kMolVsEMMe0041y37FnL_Q",
+                          client_secret=clientSecretKey,
+                          user_agent="Scraper")
+     subreddit = reddit.subreddit("technews")
+     posts = []
+
+     for post in subreddit.hot(limit=num_posts):
+         url = post.url
+         try:
+             html_doc = requests.get(url).text
+             soup = BeautifulSoup(html_doc, 'html.parser')
+             for script_or_style in soup(["script", "style"]):
+                 script_or_style.decompose()
+             text = ' '.join(soup.stripped_strings)
+             posts.append({'title': post.title, 'url': post.url, 'text': text})
+         except Exception:
+             # Skip posts whose linked page cannot be fetched or parsed.
+             continue
+     return posts
+
+
+ # old newsapi section
+ # def get_news_data(query, num_articles):
+ #     conn = http.client.HTTPSConnection("newsapi.org")
+ #     fromDate = (datetime.datetime.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
+ #     headers = {'Authorization': '0db7ab8d26b34533b00be11af29b8c73', 'User-Agent': 'Andys News Agent'}
+ #     encoded_query = quote(query)
+ #     conn.request("GET", f"/v2/everything?q={encoded_query}&from={fromDate}&pageSize={num_articles}", headers=headers)
+ #     res = conn.getresponse().read()
+ #     response_json = json.loads(res)
+ #     articles = response_json.get('articles', [])
+ #     cleaned_articles = [{'title': a['title'], 'url': a['url'], 'text': a['content']} for a in articles]
+ #     return cleaned_articles
+
+ def get_full_text(url):
+     # Scrape the full article body from its URL.
+     response = requests.get(url)
+     response.raise_for_status()  # Check if the request was successful
+     soup = BeautifulSoup(response.text, 'html.parser')
+     paragraphs = soup.find_all('p')  # Assume the text is in <p> tags
+     text = ' '.join([p.get_text() for p in paragraphs])
+     return text
+
+ def get_news_data(query, num_articles):
+     # Query NewsAPI for the last seven days and scrape each article's full text.
+     conn = http.client.HTTPSConnection("newsapi.org")
+     fromDate = (datetime.datetime.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
+     headers = {'Authorization': '0db7ab8d26b34533b00be11af29b8c73', 'User-Agent': 'Andys News Agent'}
+     encoded_query = quote(query)
+     conn.request("GET", f"/v2/everything?q={encoded_query}&from={fromDate}&pageSize={num_articles}", headers=headers)
+     res = conn.getresponse().read()
+     response_json = json.loads(res)
+     # print(json.dumps(response_json, indent=4))
+     articles = response_json.get('articles', [])
+     cleaned_articles = []
+     for a in articles:
+         try:
+             full_text = get_full_text(a['url'])
+         except Exception as e:
+             print(f"Failed to retrieve full text for {a['url']}: {e}")
+             full_text = a['content']  # Fall back to the snippet if the scrape fails
+         cleaned_articles.append({'title': a['title'], 'url': a['url'], 'text': full_text})
+
+     return cleaned_articles
+
+ def get_youtube_data(query, max_results):
+     # Search YouTube and attach each video's transcript when one is available.
+     search = YoutubeSearch(query, max_results=max_results)
+     results = search.to_dict()
+     videos = []
+
+     for result in results:
+         video_id = result['id']
+         yt = YouTube(f"https://www.youtube.com/watch?v={video_id}")
+         try:
+             transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
+             transcript = " ".join([entry['text'] for entry in transcript_data])
+         except Exception:
+             transcript = "Transcript not available"
+         videos.append({'title': yt.title, 'url': yt.watch_url, 'text': transcript})
+
+     return videos
+
+ def scrape(num_reddit_posts, num_news_articles, num_youtube_videos):
+     # Collect all three sources and write the combined corpus to data/raw.json.
+     reddit_data = get_reddit_data(num_reddit_posts)
+     news_data = get_news_data('artificial intelligence', num_news_articles)
+     youtube_data = get_youtube_data('tech news', num_youtube_videos)
+     all_data = {
+         'reddit': reddit_data,
+         'news': news_data,
+         'youtube': youtube_data
+     }
+
+     filename = 'data/raw.json'
+
+     with open(filename, 'w', encoding='utf-8') as f:
+         json_string = json.dumps(all_data, ensure_ascii=False, indent=4)
+         f.write(json_string)
+     return filename
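A usage sketch for the scraper, assuming the embedded Reddit and NewsAPI credentials are still valid, the data/ directory already exists, and the sample counts below are illustrative:

    from src.data_retrieval import scrape

    # Pull a small sample from each source and write data/raw.json.
    raw_path = scrape(num_reddit_posts=5, num_news_articles=5, num_youtube_videos=3)
    print(f"raw corpus written to {raw_path}")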
src/gsheets.py ADDED
@@ -0,0 +1,33 @@
+ import gspread
+ from gspread_dataframe import set_with_dataframe
+ import pandas as pd
+ from oauth2client.service_account import ServiceAccountCredentials
+ from datetime import datetime
+
+ def upload_csv_to_new_worksheet(topic_string):
+     scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
+
+     # Link the service account (roles set and Sheets/Drive APIs enabled).
+     creds = ServiceAccountCredentials.from_json_keyfile_name('serviceaccount/gsheets-upload-403705-efeef293c71f.json', scope)
+     client = gspread.authorize(creds)
+
+     spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442')
+
+     # Create a new worksheet in the spreadsheet, named after the current date.
+     current = datetime.now().strftime("%m/%d/%Y_%H:%M:%S")
+     worksheet = spreadsheet.add_worksheet(title=current, rows="100", cols="50")
+
+     # Upload the summarizer output (data.csv) to the new worksheet.
+     data = pd.read_csv('data.csv')
+     set_with_dataframe(worksheet, data)
+
+     # Do the same for the topic-model words.
+     topic_sheet_name = f"{current}_topics"
+     topic_worksheet = spreadsheet.add_worksheet(title=topic_sheet_name, rows="100", cols="1")
+
+     # Split on "\n\n" and write each topic to its own row of the new worksheet.
+     topics = topic_string.split("\n\n")
+     for i, topic in enumerate(topics, start=1):
+         topic_worksheet.update_cell(i, 1, topic)
+
+     return (f"Successfully uploaded worksheets: {current} and {topic_sheet_name} to "
+             f"https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442")
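A usage sketch, assuming data.csv has already been written by src/summarizer.py and the service account above has edit access to the spreadsheet; the topic string here is illustrative (in the pipeline it comes from src.data_preprocessing.lda):

    from src.gsheets import upload_csv_to_new_worksheet

    # Two example topics separated by a blank line, the format lda() produces.
    topics_str = "privacy genetic data breach\n\nnvidia chips gpu supply"
    print(upload_csv_to_new_worksheet(topics_str))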
src/summarizer.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ import json
+ import openai
+ import pandas as pd
+ from dotenv import load_dotenv
+ load_dotenv()
+
+
+ def summarize(filename, gpt_key, model_name):
+     # Uses the pre-1.0 openai client interface (openai.ChatCompletion).
+     openai.api_key = gpt_key
+
+     # Open the combined scraped-data JSON file.
+     with open(filename, 'r', encoding='utf-8') as f:
+         allDataFile = json.load(f)
+
+     finaldf = pd.DataFrame()
+
+     for source, articles in allDataFile.items():
+         for article in articles:
+             title = article['title']
+             text = article['text']
+             combined_text = 'title: ' + title + '\n' + text
+
+             try:
+                 # GPT-3.5 API call for summarization.
+                 response = openai.ChatCompletion.create(
+                     model=model_name,
+                     messages=[{
+                         "role": "system",
+                         "content": "You are a helpful assistant."
+                     }, {
+                         "role": "user",
+                         "content": f"Please summarize this news article text or youtube video transcript in four sentences or less. If no article/transcript is present, or it is unclear what the transcript is talking about, output 'Unable to summarize.'. {combined_text} "
+                     }])
+
+                 summarizedData = response['choices'][0]['message']['content']
+                 print(f"SUMMARY: {summarizedData} \n\n")
+
+                 # GPT-3.5 API call for talking points derived from the summary.
+                 follow_up = openai.ChatCompletion.create(
+                     model=model_name,
+                     messages=[{
+                         "role": "system",
+                         "content": "You are a helpful assistant."
+                     }, {
+                         "role": "user",
+                         "content": f"Using this article, give me five sequential talking points that I can use to make a shortform video. Do not use more than 100 words. If the summarized article says 'Unable to summarize,' output 'No talking points available'. {summarizedData}"
+                     }])
+
+                 talking_pointsData = follow_up['choices'][0]['message']['content']
+                 print(f"TALKING POINTS: {talking_pointsData} \n\n")
+
+                 articleinfo = pd.DataFrame.from_records([{
+                     "title": article["title"],
+                     "source": source,
+                     "url": article["url"],
+                     "summarized_text": summarizedData,
+                     "talking_points": talking_pointsData
+                 }])
+                 finaldf = pd.concat([finaldf, articleinfo], ignore_index=True)
+
+             except openai.error.InvalidRequestError as e:
+                 print(f"An error occurred: {e}")
+                 continue
+
+     csvname = 'data.csv'
+     finaldf.to_csv(csvname, index=False)
+     return csvname
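Taken together, the four modules form a scrape, summarize, topic-model, upload pipeline. A minimal end-to-end sketch of how they might be wired up, assuming the repository root is the working directory; the environment variable name OPENAI_API_KEY and the model name gpt-3.5-turbo are stand-ins, since neither is pinned down by this commit:

    import os

    from src.data_retrieval import scrape
    from src.summarizer import summarize
    from src.data_preprocessing import lda
    from src.gsheets import upload_csv_to_new_worksheet

    # 1. Scrape Reddit, NewsAPI and YouTube into data/raw.json.
    raw_json = scrape(num_reddit_posts=5, num_news_articles=5, num_youtube_videos=3)

    # 2. Summarize each item and derive talking points, writing data.csv.
    #    The key/model names below are assumptions, not taken from this commit.
    summarize(raw_json, gpt_key=os.environ["OPENAI_API_KEY"], model_name="gpt-3.5-turbo")

    # 3. Topic-model the raw corpus.
    topics_str, histo_fig, bar_fig = lda(raw_json)

    # 4. Push data.csv and the topic words to Google Sheets.
    print(upload_csv_to_new_worksheet(topics_str))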