{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combine the `laion_sampled` parquet files\n",
    "\n",
    "Reads every parquet file under the `laion_sampled` directory with Spark, rewrites the data as a single-partition parquet dataset in `laion_combined`, and checks that the row count is unchanged after the round trip.\n",
    "\n",
    "The notebook is meant to survive **Restart Kernel → Run All**: no cell depends on a file name generated by a previous run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit %pip magic: installs into the kernel's environment and does not\n",
    "# depend on IPython's automagic being enabled.\n",
    "%pip install pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import col\n",
    "\n",
    "# NOTE(review): `pd` and `col` are currently unused in this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show where the kernel is running.\n",
    "current_directory = os.getcwd()\n",
    "print(current_directory)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single place to change the data location. Set the LAION_PROJECT_DIR\n",
    "# environment variable to run on another machine; the default is the\n",
    "# original author's local project folder.\n",
    "PROJECT_DIR = os.environ.get(\n",
    "    'LAION_PROJECT_DIR',\n",
    "    '/Users/fionachow/Documents/NYU/CDS/Fall 2023/CSCI - GA 2271 - Computer Vision/Project',\n",
    ")\n",
    "\n",
    "# Input: directory holding the sampled parquet files.\n",
    "parquet_directory_path = os.path.join(PROJECT_DIR, 'laion_sampled')\n",
    "\n",
    "# Output: Spark writes a *directory* here containing the part file(s).\n",
    "output_parquet_file = os.path.join(PROJECT_DIR, 'laion_combined')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = (\n",
    "    SparkSession.builder\n",
    "    .appName(\"CombineParquetFiles\")\n",
    "    .config(\"spark.executor.memory\", \"8g\")\n",
    "    .config(\"spark.executor.cores\", \"4\")\n",
    "    .config(\"spark.executor.instances\", \"3\")\n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"true\")\n",
    "    .config(\"spark.task.maxFailures\", 10)\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combine into a single partition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.parquet(parquet_directory_path)\n",
    "\n",
    "# coalesce(1) collapses the data to one partition so the write below\n",
    "# produces a single part file.\n",
    "df_coalesced = df.coalesce(1)\n",
    "\n",
    "# 'overwrite' replaces any previous output, so the part file gets a new\n",
    "# generated name on every run.\n",
    "df_coalesced.write.mode('overwrite').parquet(output_parquet_file)\n",
    "\n",
    "# Row count of the source data, used below to validate the combined output.\n",
    "row_count = df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(row_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify the combined output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the output *directory* rather than a specific part-*.parquet file:\n",
    "# the part file name contains an identifier that changes on every write,\n",
    "# so a hard-coded file name breaks as soon as the notebook is re-run.\n",
    "df_combined = spark.read.parquet(output_parquet_file)\n",
    "\n",
    "df_combined.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_row_count = df_combined.count()\n",
    "\n",
    "# The combined dataset must contain exactly the rows that were read in.\n",
    "assert combined_row_count == row_count, (\n",
    "    f'row count changed while combining: {row_count} -> {combined_row_count}'\n",
    ")\n",
    "\n",
    "print(combined_row_count)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bloom",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}