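"""Scrape direct image URLs from a subreddit via the pullpush.io Reddit archive.

The script pages backwards in time through submissions, keeps png/jpg entries
from each post's ``media_metadata``, and writes the results to
``urls/<name>.parquet`` plus a header-less ``urls/<name>.csv`` of URLs.

Example invocation (file name and argument values are illustrative):

    python scrape.py --subreddit pics --flair "OC" --max_num 5000
"""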
import argparse
import os
import time

import polars as pl
import requests


def call_api(param):
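    """Fetch one page of submissions from pullpush.io and collect image metadata.

    Only entries whose media type is png or jpg are kept. Returns a Polars
    DataFrame (one row per image) and the ``created_utc`` of the last
    submission in the page, which the caller uses to paginate backwards.
    """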
    url = "https://api.pullpush.io/reddit/search/submission/"
    response = requests.get(url, params=param)
    response.raise_for_status()
    json_data = response.json()["data"]
    create_utc = []
    media_id = []
    media_type_ls = []
    post_ids = []
    post_titles = []
    cur_utc = 0
    for submission in json_data:
        # Posts without a flair carry no "link_flair_text" key, so use .get().
        cur_flair = submission.get("link_flair_text")
        cur_utc = submission["created_utc"]
        media_ls = submission.get("media_metadata")
        if param["flair"] is not None and cur_flair != param["flair"]:
            continue
        if media_ls is None:
            continue
        for media_key, media_info in media_ls.items():
            if media_info.get("status") != "valid":
                continue
            # Videos and gifs have no "m" (mime type) entry; skip them.
            media_type = media_info.get("m")
            if media_type == "image/png":
                media_type_ls.append("png")
            elif media_type == "image/jpg":
                media_type_ls.append("jpg")
            else:
                continue
            create_utc.append(int(cur_utc))
            post_ids.append(submission["id"])
            post_titles.append(submission["title"])
            media_id.append(media_key)

    df = pl.DataFrame(
        {
            "create_utc": create_utc,
            "media_id": media_id,
            "media_type": media_type_ls,
            "post_id": post_ids,
            "post_title": post_titles,
        },
        schema={
            "create_utc": pl.Int64,
            "media_id": pl.Utf8,
            "media_type": pl.Utf8,
            "post_id": pl.Utf8,
            "post_title": pl.Utf8,
        },
    )
    return df, int(cur_utc)


def scraping_loop(
    subreddit,
    flair,
    max_num=30000,
    output_name=None,
    before=None,
):
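    """Collect up to ``max_num`` image rows by repeatedly calling the API,
    moving the ``before`` timestamp backwards on each iteration, then write
    the results to ``urls/<output_name>.parquet`` and ``urls/<output_name>.csv``.
    """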
    collected_all = []
    collected_len = 0
    last_timestamp = int(time.time()) if before is None else before
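    # These parameters are sent to the API verbatim; call_api additionally
    # applies the flair filter client-side when one is given.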
    param = {
        "subreddit": subreddit,
        "flair": flair,
        "before": last_timestamp,
    }
    while collected_len < max_num:
        collected_df, last_timestamp = call_api(param)
        if collected_df.shape[0] == 0:
            print("No more data, saving current data and exiting...")
            break
        collected_all.append(collected_df)
        collected_len += collected_df.shape[0]
        print(
            f"collected_len: {collected_len}, "
            f"last_timestamp: {last_timestamp}",
        )
        param["before"] = last_timestamp

    if not collected_all:
        print("Nothing was collected; exiting without writing output.")
        return
    df = pl.concat(collected_all)
    # Convert the epoch seconds to a datetime and build the direct
    # i.redd.it URL from the media id and file extension.
    df = df.with_columns(
        pl.from_epoch(pl.col("create_utc"), time_unit="s").alias("time"),
        pl.format(
            "https://i.redd.it/{}.{}",
            pl.col("media_id"),
            pl.col("media_type"),
        ).alias("url"),
    ).select("time", "url", "post_id", "post_title")
    if output_name is None:
        output_name = subreddit
    # Make sure the output directory exists before writing.
    os.makedirs("urls", exist_ok=True)
    df.write_parquet(f"urls/{output_name}.parquet")
    df.select("url").write_csv(f"urls/{output_name}.csv", has_header=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--subreddit", help="subreddit name")
    parser.add_argument("--flair", help="flair filter", default=None, type=str)
    parser.add_argument(
        "--max_num",
        help="max number of posts to scrape",
        default=30000,
        type=int,
    )
    parser.add_argument(
        "--output_name",
        help="custom output name",
        default=None,
    )
    parser.add_argument(
        "--before",
        help="before timestamp",
        default=None,
        type=int,
    )

    args = parser.parse_args()

    scraping_loop(**vars(args))