himym-analysis / utils /data_loader.py
akshatsanghvi's picture
Update data_loader.py
5ba7c45
raw
history blame contribute delete
756 Bytes
import os
import re
from glob import glob

import pandas as pd
# Matches the run of digits that ends a filename segment, e.g. "01" in "1x01".
_TRAILING_DIGITS = re.compile(r"(\d+)$")


def _episode_number(srt_path: str) -> int:
    """Return the episode number encoded in the name of *srt_path*.

    The number is the trailing run of digits of the second ``-``-separated
    segment of the file name (directory and extension removed), so
    ``"Show - 1x10 - Title.srt"`` yields ``10``.

    Raises:
        ValueError: If the file name has no second segment or that segment
            does not end in digits.
    """
    # Work on the bare file name: hyphens in parent directories must not
    # shift the segment index, and the extension must not hide the digits.
    stem = os.path.splitext(os.path.basename(srt_path))[0]
    segments = stem.split("-")
    if len(segments) < 2:
        raise ValueError(f"no '-' separated episode segment in {srt_path!r}")
    match = _TRAILING_DIGITS.search(segments[1].strip())
    if match is None:
        raise ValueError(f"no trailing episode number in {srt_path!r}")
    return int(match.group(1))


def load_subs(path: str) -> pd.DataFrame:
    """Load every ``.srt`` file directly inside *path* into a DataFrame.

    For each subtitle file, lines that are purely numeric (cue indices) or
    contain ``-->`` (cue timings) are dropped. The substrings ``"Sync"`` and
    ``"vNaru"`` (presumably subtitle-author credits) are removed from the
    remaining lines, which are then joined with single spaces into one
    script string.

    Args:
        path: Directory containing the ``.srt`` files.

    Returns:
        A DataFrame with one row per file and two columns: ``episode``
        (int parsed from the file name) and ``script`` (str). Rows follow
        the sorted order of the file paths.

    Raises:
        ValueError: If a file name does not contain a parsable episode number.
    """
    # Sort so the row order does not depend on the filesystem's listing order.
    srt_files = sorted(glob(os.path.join(path, "*.srt")))

    episodes = []
    scripts = []
    for srt_file in srt_files:
        # "utf-8-sig" drops a leading BOM if present; otherwise the first cue
        # index would read as "\ufeff1", fail isnumeric(), and leak into the
        # script text.
        with open(srt_file, "r", encoding="utf-8-sig") as f:
            kept = []
            for raw in f:
                text = raw.strip().replace("Sync", "").replace("vNaru", "")
                if text.isnumeric() or "-->" in text:
                    continue
                # Blank separator lines are kept as empty strings on purpose,
                # so the spacing of the joined script is unchanged.
                kept.append(text)

        episodes.append(_episode_number(srt_file))
        scripts.append(" ".join(kept))

    return pd.DataFrame({"episode": episodes, "script": scripts})