|
""" |
|
This is a utility script for use in SageMaker: it converts the arXiv metadata JSON snapshot into a Parquet dataset, one batch at a time. |
|
""" |
|
|
|
import json |
|
import pandas as pd |
|
import pyarrow as pa |
|
import pyarrow.parquet as pq |
|
import os |
|
from tqdm import tqdm |
|
|
|
|
|
# Number of parsed records to accumulate before each write to the dataset.
batch_size = 10000

# Source file that is read line by line, and the destination path that is
# handed to pyarrow as the dataset root.
json_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/arxiv-metadata-oai-snapshot.json"
parquet_file_path = "/home/studio-lab-user/arxiv-paper-recommender-system/data/processed/arxiv_papers_raw.parquet.gzip"

# Create the directory that will contain the output up front; exist_ok makes
# this a no-op when the directory is already there.
parent_dir = os.path.dirname(parquet_file_path)
os.makedirs(parent_dir, exist_ok=True)
|
|
|
|
|
def _write_batch(records, root_path):
    """Append one batch of parsed records to the Parquet dataset.

    Args:
        records: Non-empty list of dicts, one per parsed JSON line.
        root_path: Dataset root path passed to ``pq.write_to_dataset``.
    """
    frame = pd.DataFrame.from_records(records)
    table = pa.Table.from_pandas(frame)
    pq.write_to_dataset(table, root_path=root_path)


# Stream the input one line at a time (each line is parsed as its own JSON
# document) so that at most ``batch_size`` records are held in memory.
# The encoding is pinned so decoding does not depend on the platform locale.
with open(json_file_path, 'r', encoding='utf-8') as file:
    arxiv_data = []
    processed_count = 0

    for line in tqdm(file):
        # A blank line (e.g. a trailing newline at end of file) is not a
        # JSON document; skip it instead of letting json.loads raise.
        if not line.strip():
            continue

        arxiv_data.append(json.loads(line))
        processed_count += 1

        # Flush a full batch and start collecting the next one.
        if processed_count % batch_size == 0:
            _write_batch(arxiv_data, parquet_file_path)
            arxiv_data = []

    # Write whatever is left over after the last full batch. The leftover
    # records must go through the same DataFrame -> Table conversion as the
    # full batches; write_to_dataset expects a table, not a path.
    if arxiv_data:
        _write_batch(arxiv_data, parquet_file_path)
|
|