In [None]:
import os
import re
from glob import glob

import nltk
import numpy as np
import pandas as pd
import torch
from nltk import sent_tokenize
from transformers import pipeline

In [None]:
# Fetch the NLTK resources used by this notebook. `sent_tokenize` needs the
# Punkt sentence model.
# NOTE(review): newer NLTK releases are understood to look this model up under
# the id "punkt_tab" instead of "punkt"; both are requested so the notebook
# runs on either. Confirm against the installed NLTK version.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

In [None]:
# Run on the first CUDA GPU (index 0) when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

__Load Model__

In [None]:
# Build the zero-shot classification pipeline from the
# facebook/bart-large-mnli checkpoint and place it on `device`.
model = "facebook/bart-large-mnli"
clf = pipeline(task="zero-shot-classification", model=model, device=device)

In [None]:
# Sample sentence and candidate labels used to smoke-test the classifier.
test = "I like your phone, does it even work?"
classes = [
    "Love",
    "Appreciation",
    "Sarcasm",
    "Anger",
    "Hunger",
    "Dialogue",
]

In [None]:
# Smoke test: score the sample sentence against every candidate label.
# multi_label=True lets more than one label apply to the same text.
clf(test, classes, multi_label=True)

__Load Dataset__

In [None]:
# Collect every subtitle file. glob() returns matches in arbitrary,
# OS-dependent order, so sort them to keep `subs[0]` (used by the cells
# below) stable across runs and machines.
subs = sorted(glob("../data/subs/*.srt"))
subs[:5]

In [None]:
# Understanding Data: read the first subtitle file in full and show its
# first 150 characters to see how the format is laid out.
with open(subs[0], mode="r", encoding="utf-8") as f:
    con = f.read()
print(con[:150])

In [None]:
# Strip the SRT scaffolding from the first file and keep only the spoken text.
# "utf-8-sig" reads plain UTF-8 as well, but drops a leading byte-order mark;
# with plain "utf-8" a BOM would turn the first cue number into "\ufeff1",
# which fails isnumeric() and would be kept as dialogue.
with open(subs[0], "r", encoding="utf-8-sig") as f:
    lines = f.readlines()

cnt = 0
con = []
for line in lines:
    line = line.strip()
    # Cue counters, timestamp rows (they contain "-->") and the blank
    # separator lines between cues carry no dialogue.
    if not line or line.isnumeric() or "-->" in line:
        cnt += 1
    else:
        con.append(line)

print(f"Ignored {cnt} lines out of {len(lines)}. Total lines {len(con)} now.")

In [None]:
# Episode: the number sits at the end of the second "-"-separated part of
# the file name.
print(subs[0])
# Take every trailing digit rather than only the last character, so that
# two-digit episodes (10, 12, ...) are not truncated to 0, 2, ...
# basename() keeps a "-" in the directory path from shifting the split.
re.search(r"\d+$", os.path.basename(subs[0]).split("-")[1].strip()).group()

In [None]:
def _read_dialogue(path):
    """Return the dialogue lines of one .srt file, without cue numbers, timestamps or blanks."""
    # "utf-8-sig" reads plain UTF-8 too, but drops a leading BOM that would
    # otherwise make the first cue number ("\ufeff1") fail isnumeric().
    with open(path, "r", encoding="utf-8-sig") as f:
        stripped = (raw.strip() for raw in f)
        return [
            line
            for line in stripped
            if line and not line.isnumeric() and "-->" not in line
        ]


def _episode_number(path):
    """Parse the episode number from the second "-"-separated part of the file name."""
    # Work on the file name only so a "-" in a directory name cannot shift the split.
    parts = os.path.basename(path).split("-")
    # All trailing digits, not just the last character: "S01E12" -> 12, not 2.
    match = re.search(r"\d+$", parts[1].strip()) if len(parts) > 1 else None
    if match is None:
        raise ValueError(f"Cannot find an episode number in file name: {path!r}")
    return int(match.group())


def load_subs(pattern="../data/subs/*.srt"):
    """Load subtitle files into a DataFrame with one row per file.

    Args:
        pattern: Glob pattern selecting the .srt files. Defaults to the
            notebook's data directory, so ``load_subs()`` behaves as before.

    Returns:
        pandas.DataFrame with two columns:
            ``episode`` - int parsed from the trailing digits of the second
            "-"-separated part of the file name;
            ``script``  - the file's dialogue lines joined by single spaces
            (cue counters, timestamp rows and blank lines removed).
        Rows follow the sorted order of the matched paths.

    Raises:
        ValueError: if a matched file name has no parsable episode number.
    """
    episodes = []
    scripts = []

    # glob() order is arbitrary; sorting makes the row order reproducible.
    for sub in sorted(glob(pattern)):
        episodes.append(_episode_number(sub))
        scripts.append(" ".join(_read_dialogue(sub)))

    return pd.DataFrame({"episode": episodes, "script": scripts})

In [None]:
# Build the episode/script table from the subtitle files on disk.
df = load_subs()

In [None]:
# Preview the first rows of the loaded table.
df.head()

__Model Testing__

In [None]:
# Use the script from the first row of the table as the sample text for the
# experiments below.
script = df["script"].iloc[0]
script

In [None]:
# Split the sample script into sentences with NLTK and show the first three.
script_sentences = sent_tokenize(script)
script_sentences[:3]

In [None]:
# Batch sentences: group consecutive sentences into chunks of
# `sentence_batch_size` so each classifier input is a passage, not one line.
sentence_batch_size = 20
# Join with a space: concatenating with "" glues the last word of one
# sentence onto the first word of the next ("...there.How are...").
script_batches = [
    " ".join(script_sentences[index:index + sentence_batch_size])
    for index in range(0, len(script_sentences), sentence_batch_size)
]

In [None]:
# Number of sentence batches built from the sample script.
len(script_batches)

In [None]:
# Classify only the first two batches as a quick check of the output format.
sample_batches = script_batches[:2]
theme_output = clf(sample_batches, classes, multi_label=True)

theme_output

In [None]:
# Gather, for every label, the list of scores it received across the
# classified batches.
themes = {}
for output in theme_output:
    for label, score in zip(output["labels"], output["scores"]):
        themes.setdefault(label, []).append(score)

In [None]:
# Show the collected label -> list-of-scores mapping.
themes

In [None]:
def get_theme_inference(script, classes=None, sentence_batch_size=20):
    """Score a script against a set of themes with the zero-shot classifier.

    The script is split into sentences, consecutive sentences are grouped
    into batches of ``sentence_batch_size``, every batch is classified with
    the notebook-level ``clf`` pipeline (``multi_label=True``), and the
    per-batch scores are averaged for each label.

    Args:
        script: Text to analyse.
        classes: Candidate labels. When omitted, the nine default themes
            (Sarcasm, Happy, Friendship, Vulgar, Anger, Dialogue, Sad, Love,
            Narration) are used.
        sentence_batch_size: Number of sentences joined into one classifier
            input. Defaults to 20.

    Returns:
        dict mapping each label to its mean score over all batches. An empty
        dict is returned when ``script`` contains no text.

    Raises:
        ValueError: if ``sentence_batch_size`` is smaller than 1.
    """
    if sentence_batch_size < 1:
        raise ValueError("sentence_batch_size must be a positive integer")

    # Nothing to classify: return early instead of handing the classifier an
    # empty list of sequences.
    if not script or not script.strip():
        return {}

    if classes is None:
        classes = ["Sarcasm", "Happy", "Friendship", "Vulgar", "Anger", "Dialogue", "Sad", "Love", "Narration"]

    script_sentences = sent_tokenize(script)
    # Join with a space: "" would glue the last word of one sentence onto the
    # first word of the next and distort the text the model sees.
    script_batches = [
        " ".join(script_sentences[index:index + sentence_batch_size])
        for index in range(0, len(script_sentences), sentence_batch_size)
    ]
    if not script_batches:
        return {}

    theme_output = clf(
        script_batches,
        classes,
        multi_label=True
    )
    # NOTE(review): guard for the single-batch case in case the pipeline
    # hands back one result dict rather than a list of them.
    if isinstance(theme_output, dict):
        theme_output = [theme_output]

    themes = {}
    for output in theme_output:
        for label, score in zip(output["labels"], output["scores"]):
            themes.setdefault(label, []).append(score)

    # Average each label's scores across batches.
    return {key: np.mean(np.array(value)) for key, value in themes.items()}

In [None]:
# Trial run on the first 500 characters of the sample script only.
opdf = get_theme_inference(script[:500])

In [None]:
# Turn the label -> mean-score dict into a Series (labels become the index).
opdf = pd.Series(opdf)
opdf

In [None]:
# Attach one column per theme to the first episode's row.
# assign() returns a new frame, so nothing is written into a slice of `df`
# (no SettingWithCopyWarning), and each score is matched to its column by
# label rather than by position in the Series.
newdf = df.head(1).assign(**opdf.to_dict())

In [None]:
# Show the first episode's row with the theme score columns added.
newdf

In [None]:
# Re-display the original table (presumably to check that the cell above left it unchanged).
df.head()