File size: 4,201 Bytes
fe42b4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import streamlit as st
import requests
import re

# Page header: the app title followed by a one-line usage hint.
st.title(body="VOD μ±„νŒ… 크둀러")
st.write("VOD URL을 μž…λ ₯ν•˜κ³  μ±„νŒ… 데이터λ₯Ό ν¬λ‘€λ§ν•©λ‹ˆλ‹€.")

# Text input whose current value is the VOD URL handed to the crawler.
vod_url = st.text_input(label="VOD URL μž…λ ₯")


def _format_player_time(milliseconds):
    """Render a playback offset given in milliseconds as "MM:SS" or "HH:MM:SS".

    The hour field is only emitted when the offset is one hour or more.
    """
    hours, remainder = divmod(int(milliseconds) // 1000, 3600)
    minutes, seconds = divmod(remainder, 60)
    if hours > 0:
        return f"{hours:02}:{minutes:02}:{seconds:02}"
    return f"{minutes:02}:{seconds:02}"


def crawl_chats(vod_url, *, session=None, timeout=10):
    """Download the text chat of a VOD and return it as a timestamped log.

    Pages through ``<vod_url>/chats`` starting at playback time 0 and follows
    the ``nextPlayerMessageTime`` cursor returned with each page. Paging stops
    when a page is empty, when a page equals the previous one, or when no
    cursor is returned.

    Args:
        vod_url: Base URL of the VOD endpoint. Surrounding whitespace and
            trailing slashes are removed before "/chats" is appended.
        session: Optional object exposing a requests-compatible ``get``
            method (for example a ``requests.Session``). When omitted, the
            ``requests`` module itself is used.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        One line per collected message, formatted "<time> - <message>" and
        joined with newlines. Messages containing a ``{:...:}`` emoticon token
        are left out, as are entries without text or without a timestamp.
        If any request answers with a status other than 200, an error message
        string is returned instead of the log.

    Raises:
        Exceptions raised by the HTTP client (connection errors, timeouts) and
        by ``response.json()`` on a non-JSON body are not caught here.
    """
    client = requests if session is None else session

    # Tolerate a pasted URL that ends with "/" or carries stray whitespace.
    url = vod_url.strip().rstrip("/") + "/chats"

    # Request headers. Referer and Cookie are fixed placeholder values; add a
    # real cookie (copied from the browser's request headers after logging in)
    # or other headers here if the endpoint requires them.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Referer": "https://chzzk.naver.com/video/3646597",
        "Origin": "https://chzzk.naver.com",
        "Cookie": "your-cookie-string-here"
    }

    # Initial query: start at playback time 0 and ask for 50 messages.
    params = {
        "playerMessageTime": 0,
        "previousVideoChatSize": 50
    }

    # Compiled once; matches emoticon tokens of the form {:...:}.
    emoticon_pattern = re.compile(r'{:[^}]*:}')

    previous_page = None
    chat_logs = []

    while True:
        response = client.get(url, params=params, headers=headers, timeout=timeout)

        # Any non-200 answer aborts the crawl with an error message.
        if response.status_code != 200:
            return f"API μš”μ²­ μ‹€νŒ¨: {response.status_code}"

        # "content" may be absent or null; treat both as an empty payload.
        content = response.json().get("content") or {}
        chats = content.get("videoChats") or []

        # Stop on an empty page, or when the same page comes back again
        # (which would otherwise loop forever).
        if not chats or chats == previous_page:
            break

        for chat in chats:
            chat_content = chat.get("content")
            message_time = chat.get("playerMessageTime")

            # Skip entries without text or without a usable timestamp instead
            # of crashing on them.
            if not isinstance(chat_content, str):
                continue
            if not isinstance(message_time, (int, float)):
                continue

            # Keep plain-text messages only.
            if emoticon_pattern.search(chat_content):
                continue

            chat_logs.append(f"{_format_player_time(message_time)} - {chat_content}")

        previous_page = chats

        # Advance the cursor; no cursor means there is nothing more to fetch.
        next_time = content.get("nextPlayerMessageTime")
        if next_time is None:
            break
        params["playerMessageTime"] = next_time

    return "\n".join(chat_logs)


# When the button is clicked: crawl the chat for the entered URL, show the
# result, save it to disk and offer it as a download.
if st.button("크둀링 μ‹œμž‘"):
    if not vod_url:
        st.warning("URL을 μž…λ ₯ν•˜μ„Έμš”.")
    else:
        chat_logs = crawl_chats(vod_url)
        st.text_area("μ±„νŒ… 둜그", value=chat_logs, height=400)

        # Keep a copy on disk. The encoding is explicit because the platform
        # default may be unable to encode the chat text.
        file_name = "chat_logs.txt"
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(chat_logs)

        # Serve the download from memory rather than re-reading the fixed-name
        # file, so the downloaded bytes are always this run's UTF-8 log.
        st.download_button(
            label="μ±„νŒ… 둜그 λ‹€μš΄λ‘œλ“œ",
            data=chat_logs.encode("utf-8"),
            file_name=file_name,
            mime="text/plain",
        )