ayang903 committed
Commit 93f28f9
1 Parent(s): 5b38d99

Upload 10 files
model_weights/best_lda_model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62cfd988597b1f33b832b416cb6741efe3defc297901a5928a3e7ce341153cc6
+ size 59652
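The lines above are the Git LFS pointer stored in place of the 59 kB joblib artifact; the actual estimator is fetched on an LFS-enabled checkout. A minimal sketch of loading and inspecting it, assuming the file resolves to a fitted scikit-learn LatentDirichletAllocation model (as src/data_preprocessing.py expects):

    from joblib import load

    # Load the serialized LDA estimator tracked by Git LFS.
    lda_model = load('model_weights/best_lda_model.joblib')

    print(type(lda_model).__name__)      # expected: LatentDirichletAllocation
    print(lda_model.n_components)        # number of topics the model was configured with
    print(lda_model.components_.shape)   # (n_topics, vocabulary_size), present once fitted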
serviceaccount/gsheets-upload-403705-efeef293c71f.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "type": "service_account",
+ "project_id": "gsheets-upload-403705",
+ "private_key_id": "efeef293c71f53fab4f01876a906ba0d41c9c3b2",
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDBAA0A3H3heIQU\no2nE3DeZZduUZ/x5Xft7LZ+wLlOXecYkm9Fhs3uo5v4ljljZOcsycCQ8q0UN/iIV\n9/zSaXevEGAwGLFA/rATdS4bLVf+60eCvMS4J7zBqpbJOMk3lzgkMkcIqYMEf0wb\nN0itqFS39WLFi3E7YyCGutkfkjdRiHBhJvryvZo9IBnNGUR59BzyXY5euIChpTwC\nq81bgK1oUgoK5xuPmc0JNVGVQDJwRRCWauR3z0o+IdYTxa/RMP+I0tl+wIYhoWP+\n2Rvqnjg8MvBwb+aB6uuuKoHUY3Id5Kx/ALFtp9ad+HoU8bYDnLcIiJzHgSNt9zdb\nVKggS4Q1AgMBAAECggEAAX6EqXn9UCvYTQxhuTIP8FtfIo4PUoHkfJh7/y6xFKcW\nM6+UCD84pyYNyYz7TTnnYDgXfRAXDvEszWH2Lmf2vYafiq/I64e7euoyKwTrPYsP\nxw+CTGLq0SPqk37McVg8wk/e5JWNm7uxrfsmpYH8LXtQl523eqbyplopM2WmkW8u\nr1J9MX6hCV5IC77EueovFDHmaAnf72Ud+JLcvFImzi0hd7dxU9sLMcg7uItCF6Fl\njh+mZGA2jJQxNQUT4C4tSRCXkm6C8pSN3WLdcX4I/X/LA6QUanrg9pbu379BH7wp\nvjhUSvZeG9+NJXWuQGzamNrKLIPZJBEzB0zREVQaDwKBgQD7YOp3BnwPfzO6L6Lq\napl/hNKeBf5pCb/0QL0Hq9UoItBJqX4Pep0dJrbMZaesApG2+RsaN5tz+GLm7SzY\n3IH9Sq2F4HaM5JCKlwmye5wHFECqSs7ktpioBaljaEpYuil4wRbzWHnuP/4fjjUo\nU0knVzWIzTzn7GoJsguY0mNX/wKBgQDEjGJDoTmCcj/c3mPPagVg1OaaRMu1WfqC\nv78Tr51ivbRisktz+smlksjiBbOBaUcA8dTXcKRGqp3wP8PAJi+9pJodl5NcjzGI\nO2gQ/HW9y1wb/qzyfNQETEDOz0Ke9XvWPGetAV13kGhVlmFuxiotbPW5FwrcKXLz\nKZoMvINDywKBgQCaXOUdugmsqnvlNSNht5wSxklfaGbVsXsCTk7FyyrVvqsQ0Nfs\nQWsBX6iY00OnSNyZ81ZFPyhiioCRNct4T9Ay7gyoTTH/SsvHjwARbf5eCn27FLz/\njXEonHFr7brZyVd2I3woaohVWU5/qh/SZ3JgihkBrKZd9LsYwRCGA4ulmQKBgBkI\ndD9+2k9F8+JSpM23CCZUF2bQmk1nv2NFvrVoKZh45u+nG7sS1vnynwlChqFV4kg5\nhM1HuHSTqHf/9xOTCYOS4loggxFH35wlTNTVAr4Al6OtJSPhSDOf7qUoeqi6RWJ8\n4QuE3/2pc9BqzdAJBzgv54ACckymLtDPnKJApEtPAoGAewTD9XtIRL9FxN6i3JSJ\nafALdnZ5ykrN5V2II95x+2P1hHj1mlzDdQ6LOmOOlsivN4JV3HynYLeITLmNb9HC\no7JAKnUxY014knDzhypQvAtv0p3NcE9j2rkdqAVjx5mtfkQv095cqXc5AwSGjsIh\nI30uIg5qhqTkmqTVTcRwUpM=\n-----END PRIVATE KEY-----\n",
+ "client_email": "gsheets-uploader@gsheets-upload-403705.iam.gserviceaccount.com",
+ "client_id": "102153694664172854211",
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+ "token_uri": "https://oauth2.googleapis.com/token",
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gsheets-uploader%40gsheets-upload-403705.iam.gserviceaccount.com",
+ "universe_domain": "googleapis.com"
+ }
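This keyfile is what src/gsheets.py (below) feeds to ServiceAccountCredentials. A minimal sketch of the authorization step it enables, assuming the service account's client_email has been granted edit access to the target spreadsheet:

    import gspread
    from oauth2client.service_account import ServiceAccountCredentials

    scope = ["https://spreadsheets.google.com/feeds",
             "https://www.googleapis.com/auth/drive"]

    # Build credentials from the committed keyfile and authorize a gspread client.
    creds = ServiceAccountCredentials.from_json_keyfile_name(
        'serviceaccount/gsheets-upload-403705-efeef293c71f.json', scope)
    client = gspread.authorize(creds)

    # Any spreadsheet shared with the service account can now be opened by URL.
    spreadsheet = client.open_by_url(
        'https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442')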
src/__pycache__/data_preprocessing.cpython-311.pyc ADDED
Binary file (7.49 kB)

src/__pycache__/data_retrieval.cpython-311.pyc ADDED
Binary file (6.55 kB)

src/__pycache__/gsheets.cpython-311.pyc ADDED
Binary file (2.2 kB)

src/__pycache__/summarizer.cpython-311.pyc ADDED
Binary file (3.12 kB)
src/data_preprocessing.py ADDED
@@ -0,0 +1,122 @@
+ import json
+ import nltk
+ nltk.download('punkt')
+ import re
+ from sklearn.feature_extraction.text import CountVectorizer
+ from joblib import load
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ # import seaborn as sns
+
+
+ nltk.download('stopwords')
+ stop_words = set(nltk.corpus.stopwords.words('english'))
+
+ def tokenize(text):
+     # Custom tokenizer for CountVectorizer: drop short/numeric tokens and a
+     # hand-picked list of filler words, then lowercase what remains.
+     wordstoremove = ['Thomas', 'thing', 'quite', 'exist', 'live', 'things', 'you\'re', 'we\'ll', 'really', 'right',
+                      'said', 'right', 'refresh', 'realized', 'realize', 'wrong', 'means', 'stuff', 'wants', 'like',
+                      'going', 'exactly', 'feel', 'probably', 'likely', 'likes', 'thank', 'oopsie', 'rightfully', 'paul', '23andme', 'didn', 'know', 'just', 'really', 'able', 'actually', 'comes', 'does', 'left']
+     tokens = [word for word in nltk.word_tokenize(text)
+               if (len(word) > 3 and len(word.strip('Xx/')) > 2 and len(re.sub(r'\d+', '', word.strip('Xx/'))) > 3)
+               and word not in wordstoremove]
+     return [token.lower() for token in tokens]
+
+ def lda(input_file):
+     # Load the scraped corpus produced by data_retrieval.scrape().
+     with open(input_file, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     df = pd.DataFrame(columns=["title", "url", "source", "text"])
+
+     dfs_to_concat = []
+     for source, articles in data.items():
+         for article in articles:
+             new_df = pd.DataFrame({
+                 "title": [article["title"]],
+                 "url": [article["url"]],
+                 "source": [source],
+                 "text": [article["text"]]
+             })
+             dfs_to_concat.append(new_df)
+     df = pd.concat([df] + dfs_to_concat, ignore_index=True)
+
+     # Bag-of-words document-term matrix over unigrams and bigrams.
+     vectorizer_count = CountVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.50,
+                                        max_features=500, lowercase=False, ngram_range=(1, 2))
+     countidf_vectors = vectorizer_count.fit_transform(df.text)
+
+     feature_names = vectorizer_count.get_feature_names_out()
+
+     # Load the saved LDA configuration and refit it on the new document-term matrix.
+     lda_model = load('model_weights/best_lda_model.joblib')
+     W1 = lda_model.fit_transform(countidf_vectors)   # document-topic weights
+     H1 = lda_model.components_                       # topic-word weights
+
+     num_words = 15
+     vocab = np.array(feature_names)
+
+     top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_words - 1:-1]]
+     topic_words = [top_words(t) for t in H1]
+     topics = [' '.join(t) for t in topic_words]
+     topics_str = '\n\n'.join(topics)
+
+     histo, barchart = visualize(topics, df, W1, H1, lda_model, vectorizer_count)
+     print("done")
+     return topics_str, histo, barchart
+
+ def visualize(topics, df, W1, H1, lda_model, vectorizer):
+     # Label each document with a topic.
+     colnames = ["Topic" + str(i + 1) for i in range(lda_model.n_components)]
+     docnames = df['title']
+
+     df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)
+     significant_topic = np.argmax(df_doc_topic.values, axis=1)
+
+     # Histogram of dominant topics across documents.
+     df_doc_topic['dominant_topic'] = significant_topic + 1
+     histogram_fig, histogram_ax = plt.subplots()
+     df_doc_topic['dominant_topic'].hist(bins=7, ax=histogram_ax)
+     histogram_ax.set_title('Histogram of Dominant Topics')
+
+     # Bar chart of the top words in each topic.
+     fig, axes = plt.subplots(2, 4, figsize=(30, 15), sharex=True)
+     axes = axes.flatten()
+     for topic_idx, topic in enumerate(lda_model.components_):
+         top_features_ind = topic.argsort()[:-10 - 1:-1]
+         top_features = [vectorizer.get_feature_names_out()[i] for i in top_features_ind]
+         weights = topic[top_features_ind]
+
+         ax = axes[topic_idx]
+         ax.barh(top_features, weights, height=0.7)
+         ax.set_title(f'Topic {topic_idx + 1}')
+         ax.invert_yaxis()
+
+     return histogram_fig, fig
+
+
+ # df_doc_topic
+ # print("Perplexity: ", lda_model.perplexity(countidf_vectors))
+
+ # sns.heatmap(df_doc_topic.corr())
+ # plt.show()
+
+ # fig, axes = plt.subplots(2, 4, figsize=(30, 15), sharex=True)
+ # axes = axes.flatten()
+ # for topic_idx, topic in enumerate(best_lda_model.components_):
+ #     top_features_ind = topic.argsort()[:-10 - 1:-1]
+ #     top_features = [vectorizer_count.get_feature_names_out()[i] for i in top_features_ind]
+ #     weights = topic[top_features_ind]
+ #
+ #     ax = axes[topic_idx]
+ #     ax.barh(top_features, weights, height=0.7)
+ #     ax.set_title(f'Topic {topic_idx + 1}')
+ #     ax.invert_yaxis()
+ # plt.show()
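A short usage sketch for this module, assuming data/raw.json exists in the format written by src/data_retrieval.py and the repository root is on the Python path (the import path and output filenames are assumptions):

    from src.data_preprocessing import lda

    # Topic-model the scraped corpus; returns topic word lists plus two matplotlib figures.
    topics_str, histo_fig, bar_fig = lda('data/raw.json')

    print(topics_str)                         # top 15 words per topic, blank-line separated
    histo_fig.savefig('dominant_topics.png')  # histogram of dominant topics per document
    bar_fig.savefig('topic_words.png')        # bar charts of top words for each topic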
src/data_retrieval.py ADDED
@@ -0,0 +1,114 @@
+ import os
+ import json
+ import praw
+ import requests
+ import datetime
+ import http.client
+ from bs4 import BeautifulSoup
+ from youtube_search import YoutubeSearch
+ from youtube_transcript_api import YouTubeTranscriptApi
+ from pytube import YouTube
+ from dotenv import load_dotenv
+ from urllib.parse import quote
+
+ load_dotenv()
+
+ def get_reddit_data(num_posts):
+     # Pull the hottest posts from r/technews and scrape the linked pages.
+     clientSecretKey = 'u8gnI-3_I70MZ0H52Wg-RYAytkWWeQ'
+     reddit = praw.Reddit(client_id="kMolVsEMMe0041y37FnL_Q",
+                          client_secret=clientSecretKey,
+                          user_agent="Scraper")
+     subreddit = reddit.subreddit("technews")
+     posts = []
+
+     for post in subreddit.hot(limit=num_posts):
+         url = post.url
+         try:
+             html_doc = requests.get(url).text
+             soup = BeautifulSoup(html_doc, 'html.parser')
+             for script_or_style in soup(["script", "style"]):
+                 script_or_style.decompose()
+             text = ' '.join(soup.stripped_strings)
+             posts.append({'title': post.title, 'url': post.url, 'text': text})
+         except Exception:
+             # Skip posts whose linked page cannot be fetched or parsed.
+             continue
+     return posts
+
+
+ # old newsapi section
+ # def get_news_data(query, num_articles):
+ #     conn = http.client.HTTPSConnection("newsapi.org")
+ #     fromDate = (datetime.datetime.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
+ #     headers = {'Authorization': '0db7ab8d26b34533b00be11af29b8c73', 'User-Agent': 'Andys News Agent'}
+ #     encoded_query = quote(query)
+ #     conn.request("GET", f"/v2/everything?q={encoded_query}&from={fromDate}&pageSize={num_articles}", headers=headers)
+ #     res = conn.getresponse().read()
+ #     response_json = json.loads(res)
+ #     articles = response_json.get('articles', [])
+ #     cleaned_articles = [{'title': a['title'], 'url': a['url'], 'text': a['content']} for a in articles]
+ #     return cleaned_articles
+
+ def get_full_text(url):
+     # Scrape the full article body from its URL.
+     response = requests.get(url)
+     response.raise_for_status()  # Check if the request was successful
+     soup = BeautifulSoup(response.text, 'html.parser')
+     paragraphs = soup.find_all('p')  # Assume the text is in <p> tags
+     text = ' '.join([p.get_text() for p in paragraphs])
+     return text
+
+ def get_news_data(query, num_articles):
+     # Query NewsAPI for the last seven days and scrape each article's full text.
+     conn = http.client.HTTPSConnection("newsapi.org")
+     fromDate = (datetime.datetime.today() - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
+     headers = {'Authorization': '0db7ab8d26b34533b00be11af29b8c73', 'User-Agent': 'Andys News Agent'}
+     encoded_query = quote(query)
+     conn.request("GET", f"/v2/everything?q={encoded_query}&from={fromDate}&pageSize={num_articles}", headers=headers)
+     res = conn.getresponse().read()
+     response_json = json.loads(res)
+     # print(json.dumps(response_json, indent=4))
+     articles = response_json.get('articles', [])
+     cleaned_articles = []
+     for a in articles:
+         try:
+             full_text = get_full_text(a['url'])
+         except Exception as e:
+             print(f"Failed to retrieve full text for {a['url']}: {e}")
+             full_text = a['content']  # Fall back to the snippet if the scrape fails
+         cleaned_articles.append({'title': a['title'], 'url': a['url'], 'text': full_text})
+
+     return cleaned_articles
+
+ def get_youtube_data(query, max_results):
+     # Search YouTube and attach each video's transcript when one is available.
+     search = YoutubeSearch(query, max_results=max_results)
+     results = search.to_dict()
+     videos = []
+
+     for result in results:
+         video_id = result['id']
+         yt = YouTube(f"https://www.youtube.com/watch?v={video_id}")
+         try:
+             transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
+             transcript = " ".join([entry['text'] for entry in transcript_data])
+         except Exception:
+             transcript = "Transcript not available"
+         videos.append({'title': yt.title, 'url': yt.watch_url, 'text': transcript})
+
+     return videos
+
+ def scrape(num_reddit_posts, num_news_articles, num_youtube_videos):
+     # Collect all three sources and write the combined corpus to data/raw.json.
+     reddit_data = get_reddit_data(num_reddit_posts)
+     news_data = get_news_data('artificial intelligence', num_news_articles)
+     youtube_data = get_youtube_data('tech news', num_youtube_videos)
+     all_data = {
+         'reddit': reddit_data,
+         'news': news_data,
+         'youtube': youtube_data
+     }
+
+     filename = 'data/raw.json'
+
+     with open(filename, 'w', encoding='utf-8') as f:
+         json_string = json.dumps(all_data, ensure_ascii=False, indent=4)
+         f.write(json_string)
+     return filename
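A usage sketch for the scraper, assuming the embedded Reddit and NewsAPI credentials are still valid, the data/ directory already exists, and the sample counts below are illustrative:

    from src.data_retrieval import scrape

    # Pull a small sample from each source and write data/raw.json.
    raw_path = scrape(num_reddit_posts=5, num_news_articles=5, num_youtube_videos=3)
    print(f"raw corpus written to {raw_path}")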
src/gsheets.py ADDED
@@ -0,0 +1,33 @@
+ import gspread
+ from gspread_dataframe import set_with_dataframe
+ import pandas as pd
+ from oauth2client.service_account import ServiceAccountCredentials
+ from datetime import datetime
+
+ def upload_csv_to_new_worksheet(topic_string):
+     scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
+
+     # Link the service account (roles set and Sheets/Drive APIs enabled).
+     creds = ServiceAccountCredentials.from_json_keyfile_name('serviceaccount/gsheets-upload-403705-efeef293c71f.json', scope)
+     client = gspread.authorize(creds)
+
+     spreadsheet = client.open_by_url('https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442')
+
+     # Create a new worksheet in the spreadsheet, named after the current date.
+     current = datetime.now().strftime("%m/%d/%Y_%H:%M:%S")
+     worksheet = spreadsheet.add_worksheet(title=current, rows="100", cols="50")
+
+     # Upload the summarizer output (data.csv) to the new worksheet.
+     data = pd.read_csv('data.csv')
+     set_with_dataframe(worksheet, data)
+
+     # Do the same for the topic-model words.
+     topic_sheet_name = f"{current}_topics"
+     topic_worksheet = spreadsheet.add_worksheet(title=topic_sheet_name, rows="100", cols="1")
+
+     # Split on "\n\n" and write each topic to its own row of the new worksheet.
+     topics = topic_string.split("\n\n")
+     for i, topic in enumerate(topics, start=1):
+         topic_worksheet.update_cell(i, 1, topic)
+
+     return (f"Successfully uploaded worksheets: {current} and {topic_sheet_name} to "
+             f"https://docs.google.com/spreadsheets/d/12N10KBYoPwFnvu3iTRgfGhVVlNFeo06BxVDlcFnwSC4/edit#gid=1761713442")
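A usage sketch, assuming data.csv has already been written by src/summarizer.py and the service account above has edit access to the spreadsheet; the topic string here is illustrative (in the pipeline it comes from src.data_preprocessing.lda):

    from src.gsheets import upload_csv_to_new_worksheet

    # Two example topics separated by a blank line, the format lda() produces.
    topics_str = "privacy genetic data breach\n\nnvidia chips gpu supply"
    print(upload_csv_to_new_worksheet(topics_str))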
src/summarizer.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ import json
+ import openai
+ import pandas as pd
+ from dotenv import load_dotenv
+ load_dotenv()
+
+
+ def summarize(filename, gpt_key, model_name):
+     # Uses the pre-1.0 openai client interface (openai.ChatCompletion).
+     openai.api_key = gpt_key
+
+     # Open the combined scraped-data JSON file.
+     with open(filename, 'r', encoding='utf-8') as f:
+         allDataFile = json.load(f)
+
+     finaldf = pd.DataFrame()
+
+     for source, articles in allDataFile.items():
+         for article in articles:
+             title = article['title']
+             text = article['text']
+             combined_text = 'title: ' + title + '\n' + text
+
+             try:
+                 # GPT-3.5 API call for summarization.
+                 response = openai.ChatCompletion.create(
+                     model=model_name,
+                     messages=[{
+                         "role": "system",
+                         "content": "You are a helpful assistant."
+                     }, {
+                         "role": "user",
+                         "content": f"Please summarize this news article text or youtube video transcript in four sentences or less. If no article/transcript is present, or it is unclear what the transcript is talking about, output 'Unable to summarize.'. {combined_text} "
+                     }])
+
+                 summarizedData = response['choices'][0]['message']['content']
+                 print(f"SUMMARY: {summarizedData} \n\n")
+
+                 # GPT-3.5 API call for talking points derived from the summary.
+                 follow_up = openai.ChatCompletion.create(
+                     model=model_name,
+                     messages=[{
+                         "role": "system",
+                         "content": "You are a helpful assistant."
+                     }, {
+                         "role": "user",
+                         "content": f"Using this article, give me five sequential talking points that I can use to make a shortform video. Do not use more than 100 words. If the summarized article says 'Unable to summarize,' output 'No talking points available'. {summarizedData}"
+                     }])
+
+                 talking_pointsData = follow_up['choices'][0]['message']['content']
+                 print(f"TALKING POINTS: {talking_pointsData} \n\n")
+
+                 articleinfo = pd.DataFrame.from_records([{
+                     "title": article["title"],
+                     "source": source,
+                     "url": article["url"],
+                     "summarized_text": summarizedData,
+                     "talking_points": talking_pointsData
+                 }])
+                 finaldf = pd.concat([finaldf, articleinfo], ignore_index=True)
+
+             except openai.error.InvalidRequestError as e:
+                 print(f"An error occurred: {e}")
+                 continue
+
+     csvname = 'data.csv'
+     finaldf.to_csv(csvname, index=False)
+     return csvname
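Taken together, the four modules form a scrape, summarize, topic-model, upload pipeline. A minimal end-to-end sketch of how they might be wired up, assuming the repository root is the working directory; the environment variable name OPENAI_API_KEY and the model name gpt-3.5-turbo are stand-ins, since neither is pinned down by this commit:

    import os

    from src.data_retrieval import scrape
    from src.summarizer import summarize
    from src.data_preprocessing import lda
    from src.gsheets import upload_csv_to_new_worksheet

    # 1. Scrape Reddit, NewsAPI and YouTube into data/raw.json.
    raw_json = scrape(num_reddit_posts=5, num_news_articles=5, num_youtube_videos=3)

    # 2. Summarize each item and derive talking points, writing data.csv.
    #    The key/model names below are assumptions, not taken from this commit.
    summarize(raw_json, gpt_key=os.environ["OPENAI_API_KEY"], model_name="gpt-3.5-turbo")

    # 3. Topic-model the raw corpus.
    topics_str, histo_fig, bar_fig = lda(raw_json)

    # 4. Push data.csv and the topic words to Google Sheets.
    print(upload_csv_to_new_worksheet(topics_str))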