### Named Entity Recognition

In [1]:
import os
import sys
import pathlib
import spacy
from nltk.tokenize import sent_tokenize

folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, "../"))

from utils import load_subs

In [4]:
# !python -m spacy download en_core_web_trf

__Load Model__


In [2]:
def load_model():
    nlp = spacy.load("en_core_web_trf")
    return nlp

nlp_model = load_model()

  from .autonotebook import tqdm as notebook_tqdm
  model.load_state_dict(torch.load(filelike, map_location=device))


__Load Dataset__

In [3]:
dataset_path = "../data/subs/"
df = load_subs(dataset_path)

In [4]:
df.head()

Unnamed: 0,episode,script
0,1,"﻿1 Kids, I'm gonna tell you an incredible stor..."
1,2,"by - Okay, where was I? - You were telling ..."
2,3,S Sy Syn b by by v by vN by vNa ...
3,4,"by Kids, when you're single, all you're loo..."
4,5,"by So, kids, would you like to hear the sto..."


In [8]:
sample_script = df.iloc[0]["script"]
sample_script

'\ufeff1 Kids, I\'m gonna tell you an incredible story ;  the story of how I met your mother.  Are we being punished for something?  No.  Yeah, is this gonna take a while?  Yes. 25 years ago, before I was Dad,  I had this whole other life.  It was way back in 2005.  I was 27, just starting to make it as an architect  and living in New York with Marshall, my best friend from college.  My life was good.  And then Uncle Marshall went and screwed the whole thing up.  Will you marry me?  Yes. Perfect!  And then you\'re engaged. You pop the champagne.  You drink a toast.  You have sex on the kitchen floor.  Don\'t have sex on our kitchen floor.  Got it.  Thanks for helping me plan this out, Ted.  Dude, are you kidding ? It\'s you and Lilly.  I\'ve been there for all the big moments of you and Lilly:  night you met, your first date, other first things.  Yeah, sorry. We thought you were asleep.  It\'s physics, Marshall.  If the bottom bunk moves, the top bunk moves, too.  My God.  You\'re gett

In [9]:
sentences = sent_tokenize(sample_script)

In [19]:
sample_sents = sentences[10:50]
sample_sents[:10]

['Will you marry me?',
 'Yes.',
 'Perfect!',
 "And then you're engaged.",
 'You pop the champagne.',
 'You drink a toast.',
 'You have sex on the kitchen floor.',
 "Don't have sex on our kitchen floor.",
 'Got it.',
 'Thanks for helping me plan this out, Ted.']

In [20]:
sents = ". ".join(sample_sents)
sents

"Will you marry me?. Yes.. Perfect!. And then you're engaged.. You pop the champagne.. You drink a toast.. You have sex on the kitchen floor.. Don't have sex on our kitchen floor.. Got it.. Thanks for helping me plan this out, Ted.. Dude, are you kidding ?. It's you and Lilly.. I've been there for all the big moments of you and Lilly:  night you met, your first date, other first things.. Yeah, sorry.. We thought you were asleep.. It's physics, Marshall.. If the bottom bunk moves, the top bunk moves, too.. My God.. You're getting engaged tonight.. Yeah.. What are you doin' tonight?. What was I doing?. Here Uncle Marshall was taking the biggest step of his life.. And me?. I'm calling up your Uncle Barney.. Hey, so you know how I've always had a thing for half-Asian girls?. Well, now I've got a new favorite... Lebanese girls.. Lebanese girls are the new half-Asians.. Hey, you want to do somethin' tonight?. Okay, meet me at the bar in 15 minutes.. And suit up!. Where's your suit?. Just onc

__Run Model__

In [21]:
doc = nlp_model(sents)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [27]:
doc.ents

(Ted,
 Lilly,
 Lilly,
 night,
 first,
 first,
 Marshall,
 tonight,
 tonight,
 Marshall,
 Barney,
 half-Asian,
 Lebanese,
 Lebanese,
 half-Asians,
 tonight,
 15 minutes,
 one,
 Marshall,
 Lilly,
 Marshall,
 Lilly,
 Ted)

In [34]:
for entity in doc.ents:
    print(entity.text, " > ",entity.label_)

Ted  >  PERSON
Lilly  >  PERSON
Lilly  >  PERSON
night  >  TIME
first  >  ORDINAL
first  >  ORDINAL
Marshall  >  PERSON
tonight  >  TIME
tonight  >  TIME
Marshall  >  PERSON
Barney  >  PERSON
half-Asian  >  NORP
Lebanese  >  NORP
Lebanese  >  NORP
half-Asians  >  NORP
tonight  >  TIME
15 minutes  >  TIME
one  >  CARDINAL
Marshall  >  PERSON
Lilly  >  PERSON
Marshall  >  PERSON
Lilly  >  PERSON
Ted  >  PERSON


In [37]:
def get_chars_inference(script):
    script_sents = sent_tokenize(script)
    chars = [] 

    for sent in script_sents:
        doc = nlp_model(sent)
        char = set()

        for entity in doc.ents:
            if entity.label_ == "PERSON":
                name  = entity.text.strip().split(" ")[0]
                char.add(name)

        chars.append(char)

    return chars

In [38]:
df["chars"] = df["script"].apply(get_chars_inference)

  with torch.cuda.amp.autocast(self._mixed_precision):


In [40]:
df.head()

Unnamed: 0,episode,script,chars
0,1,"﻿1 Kids, I'm gonna tell you an incredible stor...","[{}, {}, {}, {}, {}, {}, {}, {Marshall}, {}, {..."
1,2,"by - Okay, where was I? - You were telling ...","[{}, {Mom}, {}, {}, {}, {Robin}, {}, {Ted, Bar..."
2,3,S Sy Syn b by by v by vN by vNa ...,"[{}, {}, {}, {}, {}, {Marshall, Barney, Lily},..."
3,4,"by Kids, when you're single, all you're loo...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
4,5,"by So, kids, would you like to hear the sto...","[{}, {}, {}, {}, {Robin}, {}, {}, {}, {}, {}, ..."


In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [43]:
def generate_char_network(df):

    windows = 10
    entity_relationship = []

    for row in df["chars"]:
        prev_entity_window = []

        for sentence in row:
            
            # each sentence = ["Ted", "Lilly"]
            prev_entity_window.append(list(sentence))

            # We keep only the last 10 entities as previous.
            prev_entity_window = prev_entity_window[-windows:]

            # Flatten 2D list into 1D list
            prev_entity_flattened = sum(prev_entity_window, [])

            # Build relationship for each entity.
            for entity in sentence:
                # Check each entity with all previous 10 entities.
                for entity_in_window in prev_entity_flattened:

                    # if they aren't same, append them because they are related.
                    if entity != entity_in_window:

                        # Sort them because (ted, lilly is same as lilly, ted.)
                        entity_relationship.append(sorted([entity, entity_in_window]))

    relationship_df = pd.DataFrame({"value": entity_relationship})
    relationship_df["source"] = relationship_df["value"].apply(lambda x: x[0])
    relationship_df["target"] = relationship_df["value"].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(["source", "target"]).count().reset_index()
    relationship_df = relationship_df.sort_values("value", ascending=False)

    return relationship_df

In [145]:
relationship_df = generate_char_network(df)

In [94]:
# with open("char.txt", "w") as f:
#     a = ""
#     for i in sorted(set(relationship_df["source"].unique().tolist()).union(set(relationship_df["target"].unique().tolist()))):
#         a = a + i + "\n"
    
#     f.write(a)

In [131]:
relationship_df.head()

Unnamed: 0,source,target,value
557,Robin,Ted,192
400,Lily,Marshall,108
129,Barney,Ted,107
422,Lily,Ted,86
475,Marshall,Ted,82


In [132]:
relationship_df.tail()

Unnamed: 0,source,target,value
179,Butterfield,Tracy,1
180,C,Marshall,1
379,Laura,Lisa,1
378,Laura,Lily,1
609,Vicky,Victoria,1


In [133]:
relationship_df.shape

(610, 3)

__Correct Dataframe__

In [146]:
corrections = {
    "Lilly" : "Lily",
    "-" : None,
    "Barn" : "Barney",
    "Lil" : "Lily",
    "Lilypad" : "Lily",
    "C" : None,
    "Dad" : None,
    "Daddy" : None,
    "Ding-dong." : None,
    "Dr." : None,
    "Eriksens" : "Marshall",
    "Eriksen" : "Marshall",
    "Guy" : None,
    "Funny" : None,
    "Funyuns" : None,
    "Happy" : None,
    "Lindsay" : "Lily",
    "Lindsey" : "Lily",
    "Love" : None,
    "Marsh" : "Marshall",
    "Marshmallow" : "Marshall",
    "Mom" : None,
    "Moby" : "Ted",
    "Mosby" : "Ted",
    "Mu" : None,
    "Natalya" : "Natalie",
    "Newbie" : None,
    "P.S." : None,
    "Paralegally" : None,
    "Paris" : None,
    "Penelope" : None,
    "Pete" : None,
    "Red" : None,
    "Scherbatsky" : "Robin",
    "Sebastian" : "Robin",
    "Stinson" : "Barney",
    "Aldrin" : "Lily",
    "Stu" : "Stuart",
    "Teddy" : "Ted",
    "Tedder" : "Ted",
    "The" : None,
    "Todd" : "Ted",
    "Tracy" : "Tracey",
    "the" : None
}


relationship_df = relationship_df.replace(corrections).dropna()

In [151]:
relationship_df = relationship_df[relationship_df["source"] != relationship_df["target"]]

In [152]:
with open("char2.txt", "w") as f:
    a = ""
    for i in sorted(set(relationship_df["source"].unique().tolist()).union(set(relationship_df["target"].unique().tolist()))):
        a = a + i + "\n"
    
    f.write(a)

In [153]:
relationship_df = relationship_df.head(200)

In [154]:
G = nx.from_pandas_edgelist(
    relationship_df,
    source="source",
    target="target",
    edge_attr="value",
    create_using=nx.Graph()
)

net = Network(
    notebook=True, width="1000px", 
    height="700px", 
    bgcolor="#222222", 
    font_color="white", 
    cdn_resources="remote"
)

node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, "size")
net.from_nx(G)

net.show("himym.html")

himym.html
