# VTVTB / app.py
# lcjln's picture
# Update app.py
# 1dae269 verified
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import datetime
import re
import requests
import yt_dlp
import os
import time
# Use the "wide" page layout so the content has minimal side margins.
st.set_page_config(layout="wide")

# Page heading followed by a one-line description of the tool.
st.title("VOD μ±„νŒ… 크둀러")
st.write("VOD URL을 μž…λ ₯ν•˜κ³  μ±„νŒ… 데이터λ₯Ό ν¬λ‘€λ§ν•©λ‹ˆλ‹€.")

# Text box in which the user enters the VOD URL.
vod_url = st.text_input(label="VOD URL μž…λ ₯")

# Session-state slot that holds the timestamps the user has selected;
# it is created empty only when it does not exist yet.
st.session_state.setdefault('selected_times', [])
# Chat crawling helpers and entry point
def _format_player_time(message_time_ms):
    """Render a millisecond playback offset as "MM:SS", or "HH:MM:SS" from one hour up."""
    hours, remainder = divmod(message_time_ms // 1000, 3600)
    minutes, seconds = divmod(remainder, 60)
    if hours > 0:
        return f"{hours:02}:{minutes:02}:{seconds:02}"
    return f"{minutes:02}:{seconds:02}"


def _minute_bucket(message_time_ms):
    """Return the per-minute aggregation key for a millisecond playback offset.

    The key is a datetime on 1900-01-01 with seconds set to zero, which is
    what parsing the formatted time with strptime produced before. Building
    it arithmetically also works for offsets of 24 hours or more, where
    strptime's %H would raise ValueError.
    """
    whole_minutes = message_time_ms // 60000
    return datetime.datetime(1900, 1, 1) + datetime.timedelta(minutes=whole_minutes)


def crawl_chats(vod_url, timeout=10):
    """Fetch all chat messages of a VOD and count them per minute.

    Requests ``<vod_url>/chats`` repeatedly, feeding each response's
    ``nextPlayerMessageTime`` back in as ``playerMessageTime``, until a page
    has no chats or no further cursor. Progress is written to a Streamlit
    placeholder after every page.

    Args:
        vod_url: Base URL of the VOD resource; "/chats" is appended to it.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A 3-tuple. On success: (log_text, chat_counts, laugh_counts), where
        log_text is the newline-joined "<time> - <message>" lines and both
        counts map a per-minute datetime key to a number of messages.
        On a failed request: (error_message, None, None).
    """
    # Tolerate surrounding whitespace and a trailing slash in the pasted URL
    url = vod_url.strip().rstrip("/") + "/chats"

    # Request headers (add cookies or other header values when required).
    # NOTE(review): the Referer is fixed to one specific video page - confirm
    # whether it should follow vod_url instead.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Referer": "https://chzzk.naver.com/video/3646597",
        "Origin": "https://chzzk.naver.com",
        # After logging in, copy the cookie value from the Request Headers
        # tab of the browser developer tools
        "Cookie": "your-cookie-string-here",
    }

    # Initial request parameters
    params = {
        "playerMessageTime": 0,  # start offset (0 = beginning of the VOD)
        "previousVideoChatSize": 50,  # number of chat messages to fetch
    }

    # Messages carrying an emoticon token of the form {:name:} are skipped
    emoticon_pattern = re.compile(r'{:[^}]*:}')

    chat_logs = []
    chat_counts = defaultdict(int)
    laugh_counts = defaultdict(int)
    total_chats_collected = 0  # every fetched message, including skipped ones
    start_time = time.time()  # crawl start, for the elapsed-time display
    status_text = st.empty()  # placeholder for the progress message

    while True:
        # Without a timeout a stalled connection would block the app forever;
        # network failures are reported the same way as a non-200 status.
        try:
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            return f"API μš”μ²­ μ‹€νŒ¨: {exc}", None, None

        if response.status_code != 200:
            return f"API μš”μ²­ μ‹€νŒ¨: {response.status_code}", None, None

        try:
            data = response.json()
        except ValueError as exc:
            # The body was not valid JSON
            return f"API μš”μ²­ μ‹€νŒ¨: {exc}", None, None

        # "content" may be missing or null; treat both as an empty page
        content = data.get("content") or {}
        chats = content.get("videoChats") or []
        if not chats:
            break

        for chat in chats:
            chat_content = chat.get("content")  # message text
            message_time = chat.get("playerMessageTime")  # offset in milliseconds

            # Skip malformed entries instead of crashing on None
            if not isinstance(chat_content, str) or message_time is None:
                continue
            # Keep plain-text messages only
            if emoticon_pattern.search(chat_content):
                continue

            chat_logs.append(f"{_format_player_time(message_time)} - {chat_content}")

            minute_key = _minute_bucket(message_time)
            chat_counts[minute_key] += 1

            # Count messages containing the laugh character at least four times.
            # NOTE(review): this literal looks like mis-decoded UTF-8 (probably
            # the Korean laugh character) - confirm the file's encoding,
            # otherwise it will not match real chat text.
            if chat_content.count('γ…‹') >= 4:
                laugh_counts[minute_key] += 1

        total_chats_collected += len(chats)
        elapsed_minutes, elapsed_seconds = divmod(int(time.time() - start_time), 60)
        status_text.text(f"ν˜„μž¬κΉŒμ§€ μˆ˜μ§‘λœ μ±„νŒ… λ©”μ‹œμ§€ 개수: {total_chats_collected} | κ²½κ³Ό μ‹œκ°„: {elapsed_minutes}λΆ„ {elapsed_seconds}초")

        # Advance the cursor; stop when there is none or it does not move,
        # since repeating the same request would never terminate.
        next_time = content.get("nextPlayerMessageTime")
        if next_time is None or next_time == params["playerMessageTime"]:
            break
        params["playerMessageTime"] = next_time

    return "\n".join(chat_logs), chat_counts, laugh_counts
# Record a clicked time slot in the selection
def add_selected_time(time):
    """Append *time* to the session's selected timestamps unless it is already listed."""
    already_selected = st.session_state['selected_times']
    if time in already_selected:
        return
    already_selected.append(time)
# Show the selected time slots, each with a delete button
def display_selected_times():
    """Render every selected timestamp next to an "X" button that removes it.

    Does nothing when no timestamp is selected. A removed entry has already
    been drawn in the current run; it is gone from session state afterwards.
    """
    if not st.session_state['selected_times']:
        return

    st.write("### μ„ νƒλœ μ‹œκ°„λŒ€")
    # Iterate over a snapshot: removing from the live list while looping over
    # it would make the loop skip the entry that follows the removed one.
    for selected_time in list(st.session_state['selected_times']):
        label_col, button_col = st.columns([9, 1])
        label_col.write(f"{selected_time}")
        if button_col.button("X", key=f"remove_{selected_time}"):
            st.session_state['selected_times'].remove(selected_time)
# Download the video for each selected time slot with yt-dlp
def download_clips():
    """Download a one-minute clip of the VOD for every selected start time.

    Each entry of ``st.session_state['selected_times']`` is parsed as
    "HH:MM:SS"; the clip covers that instant plus 60 seconds and is written
    to ``clip_<n>.mp4`` (n counted from 1). The video is taken from the
    module-level ``vod_url``.

    Raises:
        ValueError: if a selected time is not in "HH:MM:SS" format.
    """
    if not st.session_state['selected_times']:
        return

    st.write("### μ˜μƒ λ‹€μš΄λ‘œλ“œ")
    for idx, start_time in enumerate(st.session_state['selected_times'], start=1):
        start_time_obj = datetime.datetime.strptime(start_time, '%H:%M:%S')

        # Work in seconds so that a clip starting in the last minute before
        # 24:00:00 does not wrap around to an end time before its start.
        start_seconds = (
            start_time_obj.hour * 3600
            + start_time_obj.minute * 60
            + start_time_obj.second
        )
        end_seconds = start_seconds + 60

        # HH:MM:SS strings for the progress message
        start_time_str = start_time_obj.strftime('%H:%M:%S')
        end_hours, end_remainder = divmod(end_seconds, 3600)
        end_minutes, end_secs = divmod(end_remainder, 60)
        end_time_str = f"{end_hours:02}:{end_minutes:02}:{end_secs:02}"

        output_filename = f"clip_{idx}.mp4"
        ydl_opts = {
            'outtmpl': output_filename,
            # "download_sections" is only the command-line spelling; the
            # Python API selects time ranges through "download_ranges".
            'download_ranges': yt_dlp.utils.download_range_func(
                None, [(start_seconds, end_seconds)]
            ),
        }

        st.write(f"λ‹€μš΄λ‘œλ“œ 쀑: {start_time_str} ~ {end_time_str} | 파일λͺ…: {output_filename}")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([vod_url])
# Make sure the slots for the crawl results exist in session state
for result_key in ('chat_logs', 'chat_counts', 'laugh_counts'):
    if result_key not in st.session_state:
        st.session_state[result_key] = None

# Start crawling when the button is pressed
if st.button("크둀링 μ‹œμž‘"):
    if vod_url:
        chat_logs, chat_counts, laugh_counts = crawl_chats(vod_url)
        if chat_counts is None:
            # crawl_chats reports a failed request as (message, None, None).
            # Show the message instead of storing it as a result, which would
            # make the chart code below fail on the missing counts.
            st.error(chat_logs)
        else:
            st.session_state['chat_logs'] = chat_logs
            st.session_state['chat_counts'] = chat_counts
            st.session_state['laugh_counts'] = laugh_counts

            # Save the log to a file; the encoding is explicit so the write
            # does not depend on the platform's default text encoding.
            file_name = "chat_logs.txt"
            with open(file_name, "w", encoding="utf-8") as file:
                file.write(chat_logs)

            # Offer the saved file for download
            with open(file_name, "rb") as file:
                st.download_button(
                    label="μ±„νŒ… 둜그 λ‹€μš΄λ‘œλ“œ",
                    data=file,
                    file_name=file_name,
                    mime="text/plain"
                )
    else:
        st.warning("URL을 μž…λ ₯ν•˜μ„Έμš”.")
# 이전에 ν¬λ‘€λ§ν•œ κ²°κ³Όκ°€ 있으면 κ·Έλž˜ν”„μ™€ λ‹€μš΄λ‘œλ“œ λ²„νŠΌ ν‘œμ‹œ
if st.session_state['chat_logs']:
# λ°μ΄ν„°ν”„λ ˆμž„ 생성
times = [time.strftime('%H:%M:%S') for time in st.session_state['chat_counts'].keys()]
chat_numbers = list(st.session_state['chat_counts'].values())
laugh_numbers = [st.session_state['laugh_counts'].get(time, 0) for time in st.session_state['chat_counts'].keys()]
df = pd.DataFrame({'μ‹œκ°„': times, '전체 μ±„νŒ… 개수': chat_numbers, 'γ…‹γ…‹γ…‹γ…‹ μ±„νŒ… 개수': laugh_numbers})
# Plotly μ„  κ·Έλž˜ν”„ 그리기
fig = go.Figure()
# 전체 μ±„νŒ… 개수 μ„  κ·Έλž˜ν”„ μΆ”κ°€
fig.add_trace(go.Scatter(
x=df['μ‹œκ°„'],
y=df['전체 μ±„νŒ… 개수'],
mode='lines',
name='전체 μ±„νŒ… 개수',
line=dict(color='blue'),
hovertemplate='%{x} - 전체 μ±„νŒ… 개수: %{y}<extra></extra>'
))
# γ…‹γ…‹γ…‹γ…‹ μ±„νŒ… 개수 μ„  κ·Έλž˜ν”„ μΆ”κ°€
fig.add_trace(go.Scatter(
x=df['μ‹œκ°„'],
y=df['γ…‹γ…‹γ…‹γ…‹ μ±„νŒ… 개수'],
mode='lines',
name='γ…‹γ…‹γ…‹γ…‹ μ±„νŒ… 개수',
line=dict(color='red'),
hovertemplate='%{x} - γ…‹γ…‹γ…‹γ…‹ μ±„νŒ… 개수: %{y}<extra></extra>'
))
# κ·Έλž˜ν”„ λ ˆμ΄μ•„μ›ƒ μ„€μ •
fig.update_layout(
title="λΆ„λ‹Ή μ±„νŒ… 및 γ…‹γ…‹γ…‹γ…‹ μ±„νŒ… 개수",
xaxis_title="μ‹œκ°„",
yaxis_title="μ±„νŒ… 개수",
height=600, # κ·Έλž˜ν”„ 높이 μ„€μ •
xaxis=dict(showticklabels=False), # xμΆ• μ‹œκ°„ λ ˆμ΄λΈ” μˆ¨κΉ€
hovermode="x unified", # 마우슀λ₯Ό μ˜¬λ Έμ„ λ•Œ ν•΄λ‹Ή xμΆ•μ—μ„œ 툴팁 ν‘œμ‹œ
showlegend=True,
margin=dict(l=50, r=50, t=100, b=100)
)
# κ·Έλž˜ν”„ 좜λ ₯
st.plotly_chart(fig, use_container_width=True)
# κ·Έλž˜ν”„ 클릭 μ‹œ μ‹œκ°„λŒ€ μΆ”κ°€
click_data = st.session_state.get('click_data')
if click_data:
time_selected = click_data['points'][0]['x']
add_selected_time(time_selected)
# μ„ νƒλœ μ‹œκ°„λŒ€ ν‘œμ‹œ
display_selected_times()
# λ‹€μš΄λ‘œλ“œ λ²„νŠΌ ν‘œμ‹œ
if st.button("μ„ νƒλœ μ‹œκ°„λŒ€μ˜ μ˜μƒ λ‹€μš΄λ‘œλ“œ"):
download_clips()