#!/usr/bin/env python3
import os
import pickle

import boto3
import pandas as pd
import yaml
from tweepy import TooManyRequests

from helper.twitter_client_wrapper import (
    format_tweets_df, format_users_df, format_context_annotations,
    load_topic_domains, load_topic_entities, TwitterClientWrapper
)
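
# This script pages through a stored backlog of COVID-related tweet ids, fetches the
# corresponding tweets and their authors through the Twitter API, writes the results to
# gzip-compressed parquet files, and optionally uploads them to S3. Progress is
# checkpointed so that a run interrupted by rate limiting can resume where it left off.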
# Parquet file holding the backlog of tweet ids that still need to be fetched
COVID_IDS_PATH = "covid_ids.parquet.gzip"
# Number of ids looked up per request; the v2 tweets lookup endpoint accepts at most 100 ids
STEP_SIZE = 100


def run(twitter_client, directory, covid_tweets_ids, gather_retweets=True, push_to_remote=True):
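    """Fetch the tweets for `covid_tweets_ids` in batches of STEP_SIZE, write the collected
    tweets, users and (optionally) retweeted-tweet ids to parquet under `directory`, update
    the backlog of remaining ids, and push the new files to S3 when `push_to_remote` is set.
    """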
    topic_domains = load_topic_domains(f'{directory}topic_domains.pickle')
    topic_entities = load_topic_entities(f'{directory}topic_entities.pickle')
    # List where we accumulate the tweets retrieved so far
    collected_tweets = []
    # List where we accumulate the users retrieved so far
    collected_users = []
    if gather_retweets:
        # We're gathering retweet ids
        covid_filepath = "covid"
    else:
        # We're gathering the retweets themselves
        covid_filepath = "covid_retweets"
    tweet_filepath_temp = f"{covid_filepath}/tweets/"
    user_filepath_temp = f"{covid_filepath}/users/"
    retweet_filepath_temp = f"{covid_filepath}/retweets/"
    # Take the ceiling so that a final partial batch of ids is still processed
    steps = (len(covid_tweets_ids) + STEP_SIZE - 1) // STEP_SIZE
    # Number of ids successfully processed so far; used to rewrite the backlog in `finally`
    processed = 0
    try:
        for i in range(steps):
            tweets = twitter_client.retrieve_tweets_by_ids(ids=covid_tweets_ids[i*STEP_SIZE:(i+1)*STEP_SIZE])
            included_users = tweets.includes.get('users', [])
            collected_users += included_users
            for tweet in tweets.data or []:
                processed_tweet, tweet_topic_domains, tweet_topic_entities = format_context_annotations(tweet.data)
                collected_tweets.append(processed_tweet)
                topic_domains.update(tweet_topic_domains)
                topic_entities.update(tweet_topic_entities)
            processed = min((i + 1) * STEP_SIZE, len(covid_tweets_ids))
    except TooManyRequests:
        # Reached the API rate limit: stop requesting and persist what we have so far
        print(f"Hit rate limit, processed {processed} tweets")
        print(f"tweets left: {len(covid_tweets_ids) - processed}")
    finally:
        # Dump everything collected so far to parquet and keep track of where we stopped.
        if len(collected_tweets) > 0:
            # The filename records the range of tweet ids covered by this run
            first_processed_tweet_id = collected_tweets[0]['id']
            last_processed_tweet_id = collected_tweets[-1]['id']
            tweet_filename = f"{first_processed_tweet_id}-to-{last_processed_tweet_id}.parquet.gzip"
            tweet_filepath = directory + tweet_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(tweet_filepath), exist_ok=True)
            format_tweets_df(collected_tweets).to_parquet(tweet_filepath, compression="gzip", index=False)
            user_filepath = directory + user_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(user_filepath), exist_ok=True)
            format_users_df([user.data for user in collected_users]).to_parquet(user_filepath, compression="gzip", index=False)
            if gather_retweets:
                # Keep only the tweets that reference other tweets
                retweeted = [tweet for tweet in collected_tweets if tweet.get('referenced_tweets')]
                # Collect the ids of every tweet that was retweeted
                referenced_tweets_ids = {
                    referenced_tweet['id']
                    for tweet in retweeted
                    for referenced_tweet in tweet['referenced_tweets']
                    if referenced_tweet['type'] == 'retweeted'
                }
                retweet_filepath = directory + retweet_filepath_temp + tweet_filename
                os.makedirs(os.path.dirname(retweet_filepath), exist_ok=True)
                # Convert the set to a list: pandas does not accept a set in the DataFrame constructor
                pd.DataFrame(list(referenced_tweets_ids), columns=['id']).to_parquet(retweet_filepath, compression="gzip", index=False)
            # Save the topics encountered so far as pickle files
            with open(f'{directory}topic_domains.pickle', 'wb') as handle:
                pickle.dump(topic_domains, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(f'{directory}topic_entities.pickle', 'wb') as handle:
                pickle.dump(topic_entities, handle, protocol=pickle.HIGHEST_PROTOCOL)
            # Rewrite the backlog of tweet ids, keeping only the ones not processed in this run
            pd.DataFrame(covid_tweets_ids[processed:], columns=['id']).to_parquet(f"{directory}{COVID_IDS_PATH}", index=False)
            if push_to_remote:
                # Upload the freshly written parquet files to S3
                s3 = boto3.resource("s3")
                bucket_name = "semester-project-twitter-storage"
                bucket = s3.Bucket(bucket_name)
                bucket.upload_file(tweet_filepath, f"{tweet_filepath_temp}{tweet_filename}")
                bucket.upload_file(user_filepath, f"{user_filepath_temp}{tweet_filename}")
                if gather_retweets:
                    bucket.upload_file(retweet_filepath, f"{retweet_filepath_temp}{tweet_filename}")
        else:
            print("No tweets were collected, nothing to save")


def main():
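    """Entry point: configure paths and credentials, then fetch the backlog of tweet ids."""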
    # TODO: Change depending on whether you're executing this script locally
    # or on a remote server (possibly with S3 access)
    LOCAL = False
    if LOCAL:
        DIRECTORY = ""
        # Read the bearer token from a local YAML secrets file
        with open("api_key.yaml", 'rt') as file:
            secret = yaml.safe_load(file)
        BEARER_TOKEN = secret['Bearer Token']
        PUSH_TO_REMOTE = False
    else:
        DIRECTORY = "/home/ubuntu/covid_tweets/"
        BEARER_TOKEN = os.environ["BearerToken"]
        PUSH_TO_REMOTE = True
    # Authenticate to Twitter
    client_wrapper = TwitterClientWrapper(BEARER_TOKEN, wait_on_rate_limit=False)
    # Load the backlog of tweet ids and fetch the corresponding tweets
    covid_ids = list(pd.read_parquet(f"{DIRECTORY}{COVID_IDS_PATH}").id)
    if len(covid_ids) != 0:
        run(client_wrapper, DIRECTORY, covid_tweets_ids=covid_ids, gather_retweets=False, push_to_remote=PUSH_TO_REMOTE)


if __name__ == "__main__":
    main()