"""Fetch YouTube comments and send them to sentiment / summarization endpoints."""
import urllib.parse as urlparse
from typing import Dict, Iterable, Optional, Tuple

import pandas as pd
import requests

# Seconds to wait for any single HTTP call. Without an explicit timeout
# ``requests`` may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 60

_YOUTUBE_HOSTS = frozenset(
    {"www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com"}
)
# Path layouts of the form ``/<kind>/<video id>``.
_ID_PATH_PREFIXES = ("/embed/", "/v/", "/shorts/", "/live/")

_COMMENT_COLUMNS = ["text_comment", "publish_data"]
_SENTIMENT_COLUMNS = ["sentiment", "score"]


def get_video_id(url_video: str) -> Optional[str]:
    """Extract the video id from a YouTube URL.

    Recognised forms are ``youtu.be/<id>``, ``youtube.com/watch?v=<id>`` and
    ``youtube.com/{embed,v,shorts,live}/<id>`` (also on the ``www.``, ``m.``
    and ``music.`` hosts).

    Args:
        url_video: Full URL of the video, including the scheme.

    Returns:
        The video id, or ``None`` when the URL is not recognised or carries
        no id.

    >>> get_video_id("https://youtu.be/abc123")
    'abc123'
    >>> get_video_id("https://www.youtube.com/watch?v=abc123&t=10")
    'abc123'
    >>> get_video_id("https://www.youtube.com/watch") is None
    True
    """
    parsed = urlparse.urlparse(url_video)
    host = parsed.hostname

    if host == "youtu.be":
        return parsed.path[1:] or None

    if host in _YOUTUBE_HOSTS:
        if parsed.path == "/watch":
            # ``parse_qs`` omits absent/blank parameters, so use ``.get``
            # instead of indexing to avoid a KeyError on ``/watch`` alone.
            values = urlparse.parse_qs(parsed.query).get("v")
            return values[0] if values else None
        if parsed.path.startswith(_ID_PATH_PREFIXES):
            # Every prefix ends with "/", so element 2 always exists.
            return parsed.path.split("/")[2] or None

    return None


def get_comments(api_key: str, video_id: str) -> Optional[Dict[int, Dict[str, str]]]:
    """Fetch top-level comments of a video from the YouTube Data API v3.

    Only a single request with ``maxResults=100`` is made; further pages are
    not followed.

    Args:
        api_key: API key sent as the ``key`` query parameter.
        video_id: Id of the video whose comment threads are requested.

    Returns:
        A dict keyed by the comment's position in the response. Each value
        holds ``"text_comment"`` (the original text with line breaks replaced
        by single spaces) and ``"publish_data"`` (the ``publishedAt`` value).
        ``None`` is returned when the response has no ``"items"`` entry,
        e.g. an API error payload.
    """
    endpoint = "https://www.googleapis.com/youtube/v3/commentThreads"
    params = {
        "part": "snippet",
        "videoId": video_id,
        "maxResults": 100,
        "key": api_key,
    }
    response = requests.get(endpoint, params=params, timeout=REQUEST_TIMEOUT)
    payload = response.json()

    if not isinstance(payload, dict) or "items" not in payload:
        return None

    comments = {}
    for num, item in enumerate(payload["items"]):
        snippet = item["snippet"]["topLevelComment"]["snippet"]
        comments[num] = {
            "text_comment": " ".join(snippet["textOriginal"].splitlines()),
            # NOTE: the key is spelled "publish_data" (not "..._date"); it is
            # kept as-is because it becomes a column name callers may use.
            "publish_data": snippet["publishedAt"],
        }
    return comments


def get_sentim(data, headers: dict, url: str) -> Tuple[str, float]:
    """Send ``data`` to a sentiment endpoint and return its first result.

    Args:
        data: JSON-serialisable request body.
        headers: HTTP headers for the request.
        url: Endpoint that receives the POST request.

    Returns:
        ``(label, score)`` taken from element ``[0][0]`` of the JSON response.

    Raises:
        KeyError, IndexError, TypeError: If the response does not have the
            nested-list shape described above (e.g. an error payload).
    """
    # NOTE(review): callers in this module pass the raw comment text, so the
    # body is a bare JSON string, whereas ``pipeline_summarize`` wraps its
    # text in {"inputs": ...} - confirm the endpoint accepts both forms.
    response = requests.post(
        url, headers=headers, json=data, timeout=REQUEST_TIMEOUT
    )
    first = response.json()[0][0]
    return first["label"], first["score"]


def pipeline_sentiment(url_video: str, api_key: str, headers: dict, url: str) -> pd.DataFrame:
    """Download a video's comments and attach a sentiment to each of them.

    Args:
        url_video: URL of the YouTube video.
        api_key: API key forwarded to ``get_comments``.
        headers: HTTP headers forwarded to ``get_sentim``.
        url: Sentiment endpoint forwarded to ``get_sentim``.

    Returns:
        A DataFrame with one row per comment and the columns
        ``text_comment``, ``publish_data``, ``sentiment`` and ``score``.
        The frame is empty (but has those columns) when no comments could
        be fetched.

    Raises:
        ValueError: If no video id can be extracted from ``url_video``.
    """
    video_id = get_video_id(url_video)
    if video_id is None:
        raise ValueError(
            "Could not extract a YouTube video id from URL: {!r}".format(url_video)
        )

    comments = get_comments(api_key, video_id)
    if not comments:
        # No "items" in the API response or an empty list: return a frame
        # with the expected schema instead of failing on a missing column.
        return pd.DataFrame(columns=_COMMENT_COLUMNS + _SENTIMENT_COLUMNS)

    comments_df = pd.DataFrame(comments).T
    results = [get_sentim(text, headers, url) for text in comments_df["text_comment"]]
    # Plain lists are assigned positionally, so no index alignment is involved.
    comments_df["sentiment"] = [label for label, _ in results]
    comments_df["score"] = [score for _, score in results]
    return comments_df


def pipeline_stats(data: pd.DataFrame) -> pd.Series:
    """Return the share of each sentiment label as a percentage.

    Args:
        data: Frame with a ``sentiment`` column.

    Returns:
        A Series indexed by label whose values are percentages rounded to
        two decimals.
    """
    shares = data["sentiment"].value_counts(normalize=True)
    return shares.mul(100).round(2)


def pipeline_summarize(
    data: Iterable[str],
    headers: dict,
    url: str,
    length: int = 2000,
    max_length: int = 35,
) -> str:
    """Summarise the given texts chunk by chunk via a text-generation endpoint.

    The texts are joined with spaces and cut into pieces of ``length``
    characters; each piece is posted separately and the returned
    ``generated_text`` values are joined with ``". "``.

    Args:
        data: Iterable of strings to summarise.
        headers: HTTP headers for every request.
        url: Endpoint that receives the POST requests.
        length: Chunk size in characters; must be positive.
        max_length: Value sent as ``parameters.max_length`` with each chunk.

    Returns:
        The concatenated summaries, or ``""`` when there is no text.

    Raises:
        ValueError: If ``length`` is not positive.
    """
    if length <= 0:
        raise ValueError("length must be a positive number of characters")

    text = " ".join(data)
    summaries = []
    # Chunks are cut at fixed character offsets, so a word at a boundary may
    # be split between two requests.
    for start in range(0, len(text), length):
        payload = {
            "inputs": text[start:start + length],
            "parameters": {"max_length": max_length},
        }
        response = requests.post(
            url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )
        summaries.append(response.json()[0]["generated_text"])
    return ". ".join(summaries)