# File size: 5,985 Bytes
# Revision: cbce622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
from tweepy import TooManyRequests
import os
import pandas as pd
import pickle
import yaml
import boto3

from helper.twitter_client_wrapper import (
    format_tweets_df, format_users_df, format_context_annotations,
    load_topic_domains, load_topic_entities, TwitterClientWrapper
)

# File name of the parquet file holding the tweet ids that still have to be
# fetched; it is always used with the working directory prepended.
COVID_IDS_PATH = "covid_ids.parquet.gzip"
# Number of tweet ids passed to a single retrieve_tweets_by_ids call.
# NOTE(review): presumably matches the API's per-request id cap — confirm.
STEP_SIZE = 100

def run(twitter_client, directory, covid_tweets_ids, gather_retweets=True, push_to_remote=True):
    """Fetch tweets by id in batches of STEP_SIZE and persist what was retrieved.

    Tweets and the users included with them are accumulated in memory. When
    the loop ends (normally, on a rate limit, or on any other error) the
    collected data is written to parquet files under ``directory``, the topic
    pickles are refreshed, the pending-ids file is rewritten without the ids
    of fully processed batches, and the files are optionally uploaded to S3.

    Args:
        twitter_client: Object exposing ``retrieve_tweets_by_ids(ids=...)``;
            the result must offer ``.data`` (tweets, each with a ``.data``
            dict) and ``.includes`` (a mapping that may hold ``'users'``).
        directory: Path prefix (including trailing separator) for all local
            input and output files.
        covid_tweets_ids: Sequence of tweet ids to fetch.
        gather_retweets: If True, output goes under ``covid/`` and the ids of
            retweeted tweets are written to a separate file; otherwise output
            goes under ``covid_retweets/``.
        push_to_remote: If True, upload the written parquet files to S3.

    Raises:
        Exception: Anything other than ``TooManyRequests`` raised while
            fetching or processing is re-raised after progress was saved.
    """
    topic_domains = load_topic_domains(f'{directory}topic_domains.pickle')
    topic_entities = load_topic_entities(f'{directory}topic_entities.pickle')

    # List where we accumulate the tweets retrieved so far
    collected_tweets = []
    # List where we accumulate the users retrieved so far
    collected_users = []
    if gather_retweets:
        # We're gathering retweet ids
        covid_filepath = "covid"
    else:
        # We're gathering retweets themselves
        covid_filepath = "covid_retweets"
    tweet_filepath_temp = f"{covid_filepath}/tweets/"
    user_filepath_temp = f"{covid_filepath}/users/"
    retweet_filepath_temp = f"{covid_filepath}/retweets/"

    total_ids = len(covid_tweets_ids)
    # Number of leading ids whose batch has been fetched AND fully processed.
    processed_count = 0

    try:
        # Stepping over start offsets never produces an empty trailing batch,
        # even when total_ids is an exact multiple of STEP_SIZE.
        for start in range(0, total_ids, STEP_SIZE):
            batch_ids = covid_tweets_ids[start:start + STEP_SIZE]
            tweets = twitter_client.retrieve_tweets_by_ids(ids=batch_ids)
            batch_tweets = []
            # `data` may be None when none of the requested ids could be
            # returned (assumed to follow tweepy's Response — TODO confirm
            # against the wrapper).
            for tweet in tweets.data or []:
                processed_tweet, tweet_topic_domains, tweet_topic_entities = format_context_annotations(tweet.data)
                batch_tweets.append(processed_tweet)
                topic_domains.update(tweet_topic_domains)
                topic_entities.update(tweet_topic_entities)
            # Commit the batch only once it is complete, so a batch that
            # failed half-way is retried as a whole without duplicates.
            collected_tweets.extend(batch_tweets)
            collected_users.extend(tweets.includes.get('users', []))
            processed_count = start + len(batch_ids)
    except TooManyRequests:
        # Reached API limit
        print(f"Hit Rate Limit, processed {processed_count}")
        print(f'tweets left: {total_ids - processed_count}')
    finally:
        # NOTE: there is deliberately no `return` in this block; returning
        # from `finally` would discard any exception that is in flight.
        # (local path, S3 key) pairs written during this run.
        written_files = []

        # Dump all to parquet and keep track at which tweet we stopped.
        if collected_tweets:
            # Name the files after the first and last tweet id of this run
            first_processed_tweet_id = collected_tweets[0]['id']
            last_processed_tweet_id = collected_tweets[-1]['id']
            tweet_filename = f"{first_processed_tweet_id}-to-{last_processed_tweet_id}.parquet.gzip"

            tweet_filepath = directory + tweet_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(tweet_filepath), exist_ok=True)
            format_tweets_df(collected_tweets).to_parquet(tweet_filepath, compression="gzip", index=False)
            written_files.append((tweet_filepath, f"{tweet_filepath_temp}{tweet_filename}"))

            user_filepath = directory + user_filepath_temp + tweet_filename
            os.makedirs(os.path.dirname(user_filepath), exist_ok=True)
            format_users_df([user.data for user in collected_users]).to_parquet(user_filepath, compression="gzip", index=False)
            written_files.append((user_filepath, f"{user_filepath_temp}{tweet_filename}"))

            if gather_retweets:
                # Ids of every tweet that one of the collected tweets retweets.
                # Sorted so the DataFrame gets an ordered, deterministic input.
                referenced_tweets_ids = sorted({
                    referenced_tweet['id']
                    for tweet in collected_tweets
                    for referenced_tweet in (tweet.get('referenced_tweets') or [])
                    if referenced_tweet['type'] == 'retweeted'
                })
                retweet_filepath = directory + retweet_filepath_temp + tweet_filename
                os.makedirs(os.path.dirname(retweet_filepath), exist_ok=True)
                pd.DataFrame(referenced_tweets_ids, columns=['id']).to_parquet(retweet_filepath, compression="gzip", index=False)
                written_files.append((retweet_filepath, f"{retweet_filepath_temp}{tweet_filename}"))

            # Save the topics encountered so far as pickle file
            with open(f'{directory}topic_domains.pickle', 'wb') as handle:
                pickle.dump(topic_domains, handle, protocol=pickle.HIGHEST_PROTOCOL)

            with open(f'{directory}topic_entities.pickle', 'wb') as handle:
                pickle.dump(topic_entities, handle, protocol=pickle.HIGHEST_PROTOCOL)

        # Drop the ids of every completed batch from the pending-ids file.
        # Done after the data files so a failed write cannot lose tweets, and
        # also when a batch yielded no tweets so those ids are not retried.
        if processed_count > 0:
            remaining_ids = covid_tweets_ids[processed_count:]
            pd.DataFrame(remaining_ids, columns=['id']).to_parquet(f"{directory}{COVID_IDS_PATH}", index=False)

        if not collected_tweets:
            print("Finished processing users")
        elif push_to_remote:
            s3 = boto3.resource("s3")
            bucket_name = "semester-project-twitter-storage"
            # Upload to S3
            bucket = s3.Bucket(bucket_name)
            for local_path, remote_key in written_files:
                bucket.upload_file(local_path, remote_key)

def main():
    """Configure the job for a local or remote environment and start collecting.

    Picks the working directory, bearer token and upload flag depending on the
    hard-coded environment switch, builds the Twitter client, loads the
    pending tweet ids and hands them to ``run`` if there are any.
    """
    # TODO: Change depending on whether you're executing this script locally or on a remote server (possibly with s3 access)
    run_locally = False

    if not run_locally:
        # Remote server: token comes from the environment, results go to S3
        base_dir = "/home/ubuntu/covid_tweets/"
        bearer_token = os.environ["BearerToken"]
        upload_results = True
    else:
        # Local run: token comes from a YAML file, nothing is uploaded
        base_dir = ""
        with open("api_key.yaml", 'rt') as file:
            credentials = yaml.safe_load(file)
        bearer_token = credentials['Bearer Token']
        upload_results = False

    # Authenticate to Twitter
    client_wrapper = TwitterClientWrapper(bearer_token, wait_on_rate_limit=False)

    ids_frame = pd.read_parquet(f"{base_dir}{COVID_IDS_PATH}")
    pending_ids = list(ids_frame.id)

    # Nothing left to fetch
    if len(pending_ids) == 0:
        return

    run(
        client_wrapper,
        base_dir,
        covid_tweets_ids=pending_ids,
        gather_retweets=False,
        push_to_remote=upload_results,
    )

# Start the collection job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()