{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "json_tsv.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true, "authorship_tag": "ABX9TyPr+P8hgcPHRnngFr0h4Cko", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "15dd8cf9a1e74381b0ef14a957ca7619": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_ce76a25c3ba64da89f56142162e672ba", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_f6c8f8b2642449ecb8301ee67af73e0c", "IPY_MODEL_1924d24a50e3415b9d9e7573a6df986e" ] } }, "ce76a25c3ba64da89f56142162e672ba": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "f6c8f8b2642449ecb8301ee67af73e0c": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_81c1fcda1a5245a6b85e74b2884c302b", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 10, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 10, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_7f73fd86a243481a8fcfb798fffae3e4" } }, "1924d24a50e3415b9d9e7573a6df986e": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_2e2ea5bc2838434791009676bdee45c4", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 10/10 [00:09<00:00, 1.03it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_d98ab96a83454a998d2c33935ea2fd44" } }, "81c1fcda1a5245a6b85e74b2884c302b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "7f73fd86a243481a8fcfb798fffae3e4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "2e2ea5bc2838434791009676bdee45c4": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "d98ab96a83454a998d2c33935ea2fd44": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "6oaNu3HJtTTo" }, "source": [ "# Parse transcripts to **.tsv** file\n", "\n", "## Parse all transcripts for a more seamless experience\n", "\n", "Using the **.json** files from each season, create a master file that contain all transcripts in a easier to work with format.\n", "\n", "The notebook will create **friends_transcripts.tsv** which contain all seasons and episodes.\n", "\n", "This is a sample of the **.tsv** file:\n", "\n", "
\n", "\n", "|season_id|episode_id|scene_id|utterance_id|speaker|tokens|transcript|\n", "|:-|:-|:-|:-|:-|:-|:-|\n", "|0|\ts01|\te01|\tc01|\tu001|\tMonica Geller|\t[[There, 's, nothing, to, tell, !], [He, 's, j...|\tThere's nothing to tell! He's just some guy I ...|\n", "|1|\ts01|\te01|\tc01|\tu002|\tJoey Tribbiani|\t[[C'mon, ,, you, 're, going, out, with, the, g...|\tC'mon, you're going out with the guy! There's ...|" ] }, { "cell_type": "markdown", "metadata": { "id": "vF3UbENTtz2_" }, "source": [ "# Imports" ] }, { "cell_type": "code", "metadata": { "id": "gjr_J342tOPq" }, "source": [ "import requests\n", "import json\n", "import pandas as pd\n", "from tqdm.notebook import tqdm" ], "execution_count": 2, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "OdnKsoFbt1Wd" }, "source": [ "# Helper Functions" ] }, { "cell_type": "code", "metadata": { "id": "J9pRq4Szz5eD", "outputId": "23f792b1-d7b8-4daf-8d3f-32fddb787ee9", "colab": { "base_uri": "https://localhost:8080/", "height": 283, "referenced_widgets": [ "15dd8cf9a1e74381b0ef14a957ca7619", "ce76a25c3ba64da89f56142162e672ba", "f6c8f8b2642449ecb8301ee67af73e0c", "1924d24a50e3415b9d9e7573a6df986e", "81c1fcda1a5245a6b85e74b2884c302b", "7f73fd86a243481a8fcfb798fffae3e4", "2e2ea5bc2838434791009676bdee45c4", "d98ab96a83454a998d2c33935ea2fd44" ] } }, "source": [ "# define data type\n", "friends_data = dict(season_id=[],\n", " episode_id=[],\n", " scene_id=[],\n", " utterance_id=[],\n", " speaker=[],\n", " tokens=[],\n", " transcript=[]\n", " )\n", "\n", "# loop through each season\n", "print('Loading seasons...')\n", "for season_index in tqdm(range(1, 11)):\n", " season_index = '0%d'%season_index if season_index <10 else str(season_index)\n", " # url of json file\n", " json_url = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/friends_season_%s.json'%season_index\n", " # get request from url\n", " request = requests.get(json_url)\n", " # read seson from json file\n", " season = json.loads(request.text)\n", " # get season id\n", " season_id = season['season_id']\n", "\n", " # read each episode\n", " for episode in season['episodes']:\n", " episode_id = episode['episode_id']\n", "\n", " # read each scene\n", " for scene in episode['scenes']:\n", " scene_id = scene['scene_id']\n", "\n", " # read each utterance\n", " for utterance in scene['utterances']:\n", " utterance_id = utterance['utterance_id']\n", " speaker = utterance['speakers'][0] if utterance['speakers'] else 'unknown'\n", " friends_data['season_id'].append(season_id)\n", " friends_data['episode_id'].append(episode_id.split('_')[-1])\n", " friends_data['scene_id'].append(scene_id.split('_')[-1])\n", " friends_data['utterance_id'].append(utterance_id.split('_')[-1])\n", " friends_data['speaker'].append(speaker)\n", " friends_data['tokens'].append(utterance['tokens'])\n", " friends_data['transcript'].append(utterance['transcript'])\n", "\n", "# save dicitonary to data frame\n", "friends_df = pd.DataFrame(friends_data)\n", "\n", "# save data frame to .tsv\n", "friends_df.to_csv('friends_transcripts.tsv', sep='\\t', index=False)\n", "\n", "print('File saved in `friends_transcripts.tsv` !')\n", "# show sample\n", "friends_df.head()" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "Loading seasons...\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "15dd8cf9a1e74381b0ef14a957ca7619", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "\n", "File saved in `/content/friends_transcripts.tsv` !\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
season_idepisode_idscene_idutterance_idspeakertokenstranscript
0s01e01c01u001Monica Geller[[There, 's, nothing, to, tell, !], [He, 's, j...There's nothing to tell! He's just some guy I ...
1s01e01c01u002Joey Tribbiani[[C'mon, ,, you, 're, going, out, with, the, g...C'mon, you're going out with the guy! There's ...
2s01e01c01u003Chandler Bing[[All, right, Joey, ,, be, nice, .], [So, does...All right Joey, be nice. So does he have a hum...
3s01e01c01u004Phoebe Buffay[[Wait, ,, does, he, eat, chalk, ?]]Wait, does he eat chalk?
4s01e01c01u005unknown[]
\n", "
" ], "text/plain": [ " season_id ... transcript\n", "0 s01 ... There's nothing to tell! He's just some guy I ...\n", "1 s01 ... C'mon, you're going out with the guy! There's ...\n", "2 s01 ... All right Joey, be nice. So does he have a hum...\n", "3 s01 ... Wait, does he eat chalk?\n", "4 s01 ... \n", "\n", "[5 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] } ] }