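"""Scrape image posts from a subreddit via the pullpush.io search API.

Collects direct i.redd.it URLs for valid PNG/JPEG media, optionally filtered
by flair, and writes them to ``urls/<output_name>.parquet`` plus a header-less
``urls/<output_name>.csv`` containing only the URL column.
"""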
import argparse
import time

import polars as pl
import requests


def call_api(param):
    """Fetch one page of submissions and keep the valid PNG/JPEG media.

    Returns a Polars DataFrame of matching media entries together with the
    ``created_utc`` of the last submission in the response, which the caller
    feeds back as the next ``before`` cursor.
    """
    url = "https://api.pullpush.io/reddit/search/submission/"
    response = requests.get(url, params=param)
    json_data = response.json()["data"]
    create_utc = []
    media_id = []
    media_type_ls = []
    post_ids = []
    post_titles = []
    cur_utc = 0
    for submission in json_data:
        cur_flair = submission.get("link_flair_text")
        cur_utc = submission["created_utc"]
        media_ls = submission.get("media_metadata")
        # Skip posts that do not match the requested flair or carry no media.
        if param["flair"] is not None and cur_flair != param["flair"]:
            continue
        if media_ls is None:
            continue
        for media_key, media_item in media_ls.items():
            if media_item["status"] != "valid":
                continue
            try:
                media_type = media_item["m"]
            except KeyError:
                # videos have no "m" (MIME type) field, skip them
                continue
            if media_type == "image/png":
                media_type_ls.append("png")
            elif media_type == "image/jpg":
                media_type_ls.append("jpg")
            else:
                continue
            create_utc.append(int(cur_utc))
            post_ids.append(submission["id"])
            post_titles.append(submission["title"])
            media_id.append(media_key)
    df = pl.DataFrame(
        {
            "create_utc": create_utc,
            "media_id": media_id,
            "media_type": media_type_ls,
            "post_id": post_ids,
            "post_title": post_titles,
        },
        schema={
            "create_utc": pl.Int64,
            "media_id": pl.Utf8,
            "media_type": pl.Utf8,
            "post_id": pl.Utf8,
            "post_title": pl.Utf8,
        },
    )
    return df, int(cur_utc)


def scraping_loop(
    subreddit,
    flair,
    max_num=30000,
    output_name=None,
    before=None,
):
    """Page backwards in time until max_num posts are collected, then write
    the image URLs to ``urls/<output_name>.parquet`` and ``.csv``."""
    collected_all = []
    collected_len = 0
    last_timestamp = int(time.time()) if before is None else before
    param = {
        "subreddit": subreddit,
        "flair": flair,
        "before": last_timestamp,
    }
    while collected_len < max_num:
        collected_df, last_timestamp = call_api(param)
        if collected_df.shape[0] == 0:
            print("No more data, saving current data and exiting...")
            break
        collected_all.append(collected_df)
        collected_len += collected_df.shape[0]
        print(
            f"collected_len: {collected_len}, "
            f"last_timestamp: {last_timestamp}",
        )
        param["before"] = last_timestamp
    # Guard against the edge case where the very first call returns no rows,
    # so pl.concat is never handed an empty list.
    if not collected_all:
        print("Nothing collected, nothing to write.")
        return
    df = pl.concat(collected_all)
    df = (
        df.with_columns(
            # prepend the i.redd.it host to the media id
            pl.col("media_id")
            .str.replace(r"^", "https://i.redd.it/")
            .alias("url1"),
            # parse the epoch seconds into a datetime
            pl.col("create_utc")
            .cast(pl.Int64)
            .cast(pl.Utf8)
            .str.to_datetime("%s")
            .alias("time"),
        )
        .with_columns(
            # turn the media type into a file extension (".png" / ".jpg")
            pl.col("media_type").str.replace(r"^", ".").alias("url2"),
        )
        .with_columns(
            pl.concat_str(
                [pl.col("url1"), pl.col("url2")],
                separator="",
            ).alias("url"),
        )
        .select("time", "url", "post_id", "post_title")
    )
    if output_name is None:
        output_name = subreddit
    df.write_parquet(f"urls/{output_name}.parquet")
    df.select("url").write_csv(f"urls/{output_name}.csv", has_header=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--subreddit", help="subreddit name")
    parser.add_argument("--flair", help="flair filter", default=None, type=str)
    parser.add_argument(
        "--max_num",
        help="max number of posts to scrape",
        default=30000,
        type=int,
    )
    parser.add_argument(
        "--output_name",
        help="custom output name",
        default=None,
    )
    parser.add_argument(
        "--before",
        help="before timestamp",
        default=None,
        type=int,
    )
    args = parser.parse_args()
    scraping_loop(**vars(args))
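

# Example invocation (filename, subreddit, and flair values are placeholders;
# the ``urls/`` output directory must already exist):
#
#   python scrape.py --subreddit SomeSubreddit --flair "Some Flair" --max_num 5000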