{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "json_tsv.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"authorship_tag": "ABX9TyPr+P8hgcPHRnngFr0h4Cko",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"15dd8cf9a1e74381b0ef14a957ca7619": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_ce76a25c3ba64da89f56142162e672ba",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_f6c8f8b2642449ecb8301ee67af73e0c",
"IPY_MODEL_1924d24a50e3415b9d9e7573a6df986e"
]
}
},
"ce76a25c3ba64da89f56142162e672ba": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f6c8f8b2642449ecb8301ee67af73e0c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_81c1fcda1a5245a6b85e74b2884c302b",
"_dom_classes": [],
"description": "100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 10,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 10,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_7f73fd86a243481a8fcfb798fffae3e4"
}
},
"1924d24a50e3415b9d9e7573a6df986e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_2e2ea5bc2838434791009676bdee45c4",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 10/10 [00:09<00:00, 1.03it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d98ab96a83454a998d2c33935ea2fd44"
}
},
"81c1fcda1a5245a6b85e74b2884c302b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"7f73fd86a243481a8fcfb798fffae3e4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2e2ea5bc2838434791009676bdee45c4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"d98ab96a83454a998d2c33935ea2fd44": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6oaNu3HJtTTo"
},
"source": [
"# Parse transcripts to **.tsv** file\n",
"\n",
"## Parse all transcripts for a more seamless experience\n",
"\n",
"Using the **.json** files from each season, create a master file that contains all transcripts in an easier-to-work-with format.\n",
"\n",
"The notebook will create **friends_transcripts.tsv**, which contains all seasons and episodes.\n",
"\n",
"This is a sample of the **.tsv** file:\n",
"\n",
"
\n",
"\n",
"| |season_id|episode_id|scene_id|utterance_id|speaker|tokens|transcript|\n",
"|:-|:-|:-|:-|:-|:-|:-|:-|\n",
"|0|\ts01|\te01|\tc01|\tu001|\tMonica Geller|\t[[There, 's, nothing, to, tell, !], [He, 's, j...|\tThere's nothing to tell! He's just some guy I ...|\n",
"|1|\ts01|\te01|\tc01|\tu002|\tJoey Tribbiani|\t[[C'mon, ,, you, 're, going, out, with, the, g...|\tC'mon, you're going out with the guy! There's ...|"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vF3UbENTtz2_"
},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"metadata": {
"id": "gjr_J342tOPq"
},
"source": [
"import requests\n",
"import json\n",
"import pandas as pd\n",
"from tqdm.notebook import tqdm"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "OdnKsoFbt1Wd"
},
"source": [
"# Helper Functions"
]
},
{
"cell_type": "code",
"metadata": {
"id": "J9pRq4Szz5eD",
"outputId": "23f792b1-d7b8-4daf-8d3f-32fddb787ee9",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 283,
"referenced_widgets": [
"15dd8cf9a1e74381b0ef14a957ca7619",
"ce76a25c3ba64da89f56142162e672ba",
"f6c8f8b2642449ecb8301ee67af73e0c",
"1924d24a50e3415b9d9e7573a6df986e",
"81c1fcda1a5245a6b85e74b2884c302b",
"7f73fd86a243481a8fcfb798fffae3e4",
"2e2ea5bc2838434791009676bdee45c4",
"d98ab96a83454a998d2c33935ea2fd44"
]
}
},
"source": [
"# Download every season's json transcript, flatten it to one row per utterance,\n",
"# and save the result as a single .tsv file.\n",
"\n",
"# location of the per-season json files; %s is the zero-padded season number\n",
"JSON_URL_TEMPLATE = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/friends_season_%s.json'\n",
"# name of the output file\n",
"OUTPUT_TSV = 'friends_transcripts.tsv'\n",
"\n",
"# define data type: one list per output column, all lists grow together\n",
"friends_data = dict(\n",
"    season_id=[],\n",
"    episode_id=[],\n",
"    scene_id=[],\n",
"    utterance_id=[],\n",
"    speaker=[],\n",
"    tokens=[],\n",
"    transcript=[],\n",
")\n",
"\n",
"# loop through each season\n",
"print('Loading seasons...')\n",
"for season_number in tqdm(range(1, 11)):\n",
"    # zero-pad to two digits (01, 02, ..., 10) as used in the json file names\n",
"    season_index = '%02d' % season_number\n",
"    # url of json file\n",
"    json_url = JSON_URL_TEMPLATE % season_index\n",
"    # get request from url; time out rather than hang forever on a stalled connection\n",
"    response = requests.get(json_url, timeout=30)\n",
"    # fail loudly on HTTP errors (404, 500, ...) instead of parsing an error page as json\n",
"    response.raise_for_status()\n",
"    # read season from json file\n",
"    season = json.loads(response.text)\n",
"    # get season id\n",
"    season_id = season['season_id']\n",
"\n",
"    # read each episode\n",
"    for episode in season['episodes']:\n",
"        episode_id = episode['episode_id']\n",
"\n",
"        # read each scene\n",
"        for scene in episode['scenes']:\n",
"            scene_id = scene['scene_id']\n",
"\n",
"            # read each utterance\n",
"            for utterance in scene['utterances']:\n",
"                utterance_id = utterance['utterance_id']\n",
"                # use the first listed speaker; fall back to 'unknown' when none is given\n",
"                speaker = utterance['speakers'][0] if utterance['speakers'] else 'unknown'\n",
"                friends_data['season_id'].append(season_id)\n",
"                # keep only the part after the last underscore of each id\n",
"                friends_data['episode_id'].append(episode_id.split('_')[-1])\n",
"                friends_data['scene_id'].append(scene_id.split('_')[-1])\n",
"                friends_data['utterance_id'].append(utterance_id.split('_')[-1])\n",
"                friends_data['speaker'].append(speaker)\n",
"                friends_data['tokens'].append(utterance['tokens'])\n",
"                friends_data['transcript'].append(utterance['transcript'])\n",
"\n",
"# save dictionary to data frame\n",
"friends_df = pd.DataFrame(friends_data)\n",
"\n",
"# save data frame to .tsv\n",
"friends_df.to_csv(OUTPUT_TSV, sep='\\t', index=False)\n",
"\n",
"print('File saved in `%s` !' % OUTPUT_TSV)\n",
"# show sample\n",
"friends_df.head()"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"Loading seasons...\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "15dd8cf9a1e74381b0ef14a957ca7619",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n",
"File saved in `/content/friends_transcripts.tsv` !\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n", " | season_id | \n", "episode_id | \n", "scene_id | \n", "utterance_id | \n", "speaker | \n", "tokens | \n", "transcript | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "s01 | \n", "e01 | \n", "c01 | \n", "u001 | \n", "Monica Geller | \n", "[[There, 's, nothing, to, tell, !], [He, 's, j... | \n", "There's nothing to tell! He's just some guy I ... | \n", "
1 | \n", "s01 | \n", "e01 | \n", "c01 | \n", "u002 | \n", "Joey Tribbiani | \n", "[[C'mon, ,, you, 're, going, out, with, the, g... | \n", "C'mon, you're going out with the guy! There's ... | \n", "
2 | \n", "s01 | \n", "e01 | \n", "c01 | \n", "u003 | \n", "Chandler Bing | \n", "[[All, right, Joey, ,, be, nice, .], [So, does... | \n", "All right Joey, be nice. So does he have a hum... | \n", "
3 | \n", "s01 | \n", "e01 | \n", "c01 | \n", "u004 | \n", "Phoebe Buffay | \n", "[[Wait, ,, does, he, eat, chalk, ?]] | \n", "Wait, does he eat chalk? | \n", "
4 | \n", "s01 | \n", "e01 | \n", "c01 | \n", "u005 | \n", "unknown | \n", "[] | \n", "\n", " |