Anuj02003 committed
Commit 88e1fc4 · verified · 1 Parent(s): 91d0831

Update app.py

Files changed (1)
app.py +158 -149
app.py CHANGED
@@ -1,149 +1,158 @@
- import streamlit as st
- from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
- from nltk.tokenize import word_tokenize
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.decomposition import LatentDirichletAllocation
- from transformers import pipeline
- from textblob import TextBlob
- import re
- import nltk
-
- # Ensure that necessary NLTK data is downloaded
- nltk.download('punkt')
- nltk.download('stopwords')
- nltk.download('wordnet')
-
- # Function to summarize text
- def summarize_text(text, max_length=80000):  # Increased max_length to 80,000
-     summarization_pipeline = pipeline("summarization")
-     summary = summarization_pipeline(text, max_length=max_length, min_length=100, do_sample=False)
-     return summary[0]['summary_text']
-
- # Function to extract keywords
- def extract_keywords(text):
-     stop_words = set(stopwords.words('english'))
-     lemmatizer = WordNetLemmatizer()
-
-     words = word_tokenize(text)
-     words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalnum()]
-     keywords = [word for word in words if word not in stop_words and len(word) > 1]
-
-     counter = CountVectorizer().fit_transform([' '.join(keywords)])
-     vocabulary = CountVectorizer().fit([' '.join(keywords)]).vocabulary_
-     top_keywords = sorted(vocabulary, key=vocabulary.get, reverse=True)[:5]
-
-     return top_keywords
-
- # Function to perform topic modeling
- def topic_modeling(text):
-     vectorizer = CountVectorizer(max_df=2, min_df=0.95, stop_words='english')
-     tf = vectorizer.fit_transform([text])
-     lda_model = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online', random_state=42)
-     lda_model.fit(tf)
-     feature_names = vectorizer.get_feature_names_out()
-     topics = []
-     for topic_idx, topic in enumerate(lda_model.components_):
-         topics.append([feature_names[i] for i in topic.argsort()[:-6:-1]])
-     return topics
-
- # Function to extract YouTube video ID from URL
- def extract_video_id(url):
-     video_id = None
-     patterns = [
-         r'v=([^&]+)',  # Pattern for URLs with 'v=' parameter
-         r'youtu.be/([^?]+)',  # Pattern for shortened URLs
-         r'youtube.com/embed/([^?]+)'  # Pattern for embed URLs
-     ]
-     for pattern in patterns:
-         match = re.search(pattern, url)
-         if match:
-             video_id = match.group(1)
-             break
-     return video_id
-
- # Main Streamlit app
- def main():
-     st.title("YouTube Video Summarizer")
-
-     # Sidebar
-     st.sidebar.title("App Features and Description")
-     st.sidebar.subheader("What does this app do?")
-     st.sidebar.write("""
-     - Extracts the transcript of a YouTube video.
-     - Summarizes the video content.
-     - Extracts keywords from the transcript.
-     - Performs topic modeling.
-     - Conducts sentiment analysis to show how positive or negative the content is.
-     """)
-
-     st.sidebar.subheader("Sentiment Analysis Explained")
-     st.sidebar.write("""
-     - **Polarity**: Measures how positive or negative the content is. Ranges from -1 to 1.
-       - Negative polarity (< 0) indicates negative sentiment.
-       - Positive polarity (> 0) indicates positive sentiment.
-       - Neutral polarity (0) indicates neutral sentiment.
-     - **Subjectivity**: Measures how subjective or objective the content is. Ranges from 0 to 1.
-       - A subjectivity score closer to 1 indicates personal opinions or beliefs.
-       - A score closer to 0 indicates factual information.
-     """)
-
-     # User input for YouTube video URL
-     video_url = st.text_input("Enter YouTube Video URL:", "")
-
-     # User customization options
-     max_summary_length = st.slider("Max Summary Length:", 1000, 80000, 50000)  # Increased max length to 80,000
-
-     if st.button("Summarize"):
-         try:
-             # Extract video ID from URL
-             video_id = extract_video_id(video_url)
-             if not video_id:
-                 st.error("Invalid YouTube URL. Please enter a valid URL.")
-                 return
-
-             # Get transcript of the video
-             transcript = YouTubeTranscriptApi.get_transcript(video_id)
-             if not transcript:
-                 st.error("Transcript not available for this video. Please try a different video.")
-                 return
-
-             video_text = ' '.join([line['text'] for line in transcript])
-
-             # Summarize the transcript
-             summary = summarize_text(video_text, max_length=max_summary_length)
-
-             # Extract keywords from the transcript
-             keywords = extract_keywords(video_text)
-
-             # Perform topic modeling
-             topics = topic_modeling(video_text)
-
-             # Perform sentiment analysis
-             sentiment = TextBlob(video_text).sentiment
-
-             # Display summarized text, keywords, topics, and sentiment
-             st.subheader("Video Summary:")
-             st.write(summary)
-
-             st.subheader("Keywords:")
-             st.write(keywords)
-
-             st.subheader("Topics:")
-             for idx, topic in enumerate(topics):
-                 st.write(f"Topic {idx+1}: {', '.join(topic)}")
-
-             st.subheader("Sentiment Analysis:")
-             st.write(f"Polarity: {sentiment.polarity}")
-             st.write(f"Subjectivity: {sentiment.subjectivity}")
-
-         except TranscriptsDisabled:
-             st.error("Transcripts are disabled for this video. Please try a different video.")
-         except NoTranscriptFound:
-             st.error("No transcript found for this video. Please try a different video.")
-         except Exception as e:
-             st.error(f"Error: {str(e)}")
-
- if __name__ == "__main__":
-     main()
+ import streamlit as st
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation
+ from transformers import pipeline
+ from textblob import TextBlob
+ import re
+ import nltk
+
+ # Ensure that necessary NLTK data is downloaded
+ nltk.download('punkt')
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+
+ # Function to summarize text
+ def summarize_text(text, max_length=80000):  # Increased max_length to 80,000
+     summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
+     summary = summarization_pipeline(text, max_length=max_length, min_length=100, do_sample=False)
+     return summary[0]['summary_text']
+
+ # Function to extract keywords
+ def extract_keywords(text):
+     stop_words = set(stopwords.words('english'))
+     lemmatizer = WordNetLemmatizer()
+
+     words = word_tokenize(text)
+     words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalnum()]
+     keywords = [word for word in words if word not in stop_words and len(word) > 1]
+
+     counter = CountVectorizer().fit_transform([' '.join(keywords)])
+     vocabulary = CountVectorizer().fit([' '.join(keywords)]).vocabulary_
+     top_keywords = sorted(vocabulary, key=vocabulary.get, reverse=True)[:5]
+
+     return top_keywords
+
+ # Function to perform topic modeling
+ def topic_modeling(text):
+     vectorizer = CountVectorizer(max_df=2, min_df=0.95, stop_words='english')
+     tf = vectorizer.fit_transform([text])
+     lda_model = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online', random_state=42)
+     lda_model.fit(tf)
+     feature_names = vectorizer.get_feature_names_out()
+     topics = []
+     for topic_idx, topic in enumerate(lda_model.components_):
+         topics.append([feature_names[i] for i in topic.argsort()[:-6:-1]])
+     return topics
+
+ # Function to extract YouTube video ID from URL
+ def extract_video_id(url):
+     video_id = None
+     patterns = [
+         r'v=([^&]+)',  # Pattern for URLs with 'v=' parameter
+         r'youtu.be/([^?]+)',  # Pattern for shortened URLs
+         r'youtube.com/embed/([^?]+)'  # Pattern for embed URLs
+     ]
+     for pattern in patterns:
+         match = re.search(pattern, url)
+         if match:
+             video_id = match.group(1)
+             break
+     return video_id
+
+ # Function to fetch transcript with retries
+ def get_transcript(video_id):
+     try:
+         # Attempt to fetch the transcript with language preference
+         transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
+         return transcript
+     except TranscriptsDisabled:
+         st.error("Transcripts are disabled for this video. Try a different one.")
+     except NoTranscriptFound:
+         st.error("No transcript found for this video. Ensure it has captions enabled.")
+     except Exception as e:
+         st.error(f"An unexpected error occurred while fetching the transcript: {str(e)}")
+     return None
+
+ # Main Streamlit app
+ def main():
+     st.title("YouTube Video Summarizer")
+
+     # Sidebar
+     st.sidebar.title("App Features and Description")
+     st.sidebar.subheader("What does this app do?")
+     st.sidebar.write("""
+     - Extracts the transcript of a YouTube video.
+     - Summarizes the video content.
+     - Extracts keywords from the transcript.
+     - Performs topic modeling.
+     - Conducts sentiment analysis to show how positive or negative the content is.
+     """)
+
+     st.sidebar.subheader("Sentiment Analysis Explained")
+     st.sidebar.write("""
+     - **Polarity**: Measures how positive or negative the content is. Ranges from -1 to 1.
+       - Negative polarity (< 0) indicates negative sentiment.
+       - Positive polarity (> 0) indicates positive sentiment.
+       - Neutral polarity (0) indicates neutral sentiment.
+     - **Subjectivity**: Measures how subjective or objective the content is. Ranges from 0 to 1.
+       - A subjectivity score closer to 1 indicates personal opinions or beliefs.
+       - A score closer to 0 indicates factual information.
+     """)
+
+     # User input for YouTube video URL
+     video_url = st.text_input("Enter YouTube Video URL:", "")
+
+     # User customization options
+     max_summary_length = st.slider("Max Summary Length:", 1000, 80000, 50000)  # Increased max length to 80,000
+
+     if st.button("Summarize"):
+         try:
+             # Extract video ID from URL
+             video_id = extract_video_id(video_url)
+             if not video_id:
+                 st.error("Invalid YouTube URL. Please enter a valid URL.")
+                 return
+
+             # Get transcript of the video
+             transcript = get_transcript(video_id)
+             if not transcript:
+                 return
+
+             video_text = ' '.join([line['text'] for line in transcript])
+
+             # Summarize the transcript
+             summary = summarize_text(video_text, max_length=max_summary_length)
+
+             # Extract keywords from the transcript
+             keywords = extract_keywords(video_text)
+
+             # Perform topic modeling
+             topics = topic_modeling(video_text)
+
+             # Perform sentiment analysis
+             sentiment = TextBlob(video_text).sentiment
+
+             # Display summarized text, keywords, topics, and sentiment
+             st.subheader("Video Summary:")
+             st.write(summary)
+
+             st.subheader("Keywords:")
+             st.write(keywords)
+
+             st.subheader("Topics:")
+             for idx, topic in enumerate(topics):
+                 st.write(f"Topic {idx+1}: {', '.join(topic)}")
+
+             st.subheader("Sentiment Analysis:")
+             st.write(f"Polarity: {sentiment.polarity}")
+             st.write(f"Subjectivity: {sentiment.subjectivity}")
+
+         except Exception as e:
+             st.error(f"Error: {str(e)}")
+
+ if __name__ == "__main__":
+     main()
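
A note on the summarization step: facebook/bart-large-cnn generates on the order of a thousand tokens, so `max_length` values like 80,000 (and the slider's 1,000–80,000 range) exceed what the pipeline can honor, and long transcripts are truncated at the model's input limit. A minimal sketch of one common workaround, chunking the transcript and summarizing each piece; the chunk size and generation lengths here are illustrative assumptions, not values from the commit:

```python
from transformers import pipeline

# Same checkpoint the commit pins; loaded once at module level.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_long_text(text, chunk_chars=3000):
    # Split the transcript into roughly chunk_chars-sized pieces on word
    # boundaries, summarize each piece, then join the partial summaries.
    words = text.split()
    chunks, current, length = [], [], 0
    for word in words:
        current.append(word)
        length += len(word) + 1
        if length >= chunk_chars:
            chunks.append(' '.join(current))
            current, length = [], 0
    if current:
        chunks.append(' '.join(current))
    partial = [
        summarizer(chunk, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]
    return ' '.join(partial)
```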
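Separately, because `summarize_text` builds the pipeline inside the function, the model reloads on every click of the Summarize button. If the deployed Streamlit version provides `st.cache_resource`, a cached loader is one way to pay that cost once per process; a sketch, not part of this commit:

```python
import streamlit as st
from transformers import pipeline

# Cache the summarization model so Streamlit loads it once per process
# instead of on every rerun triggered by the Summarize button.
@st.cache_resource
def get_summarizer():
    return pipeline("summarization", model="facebook/bart-large-cnn")
```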
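In `extract_keywords`, `vocabulary_` maps each term to its column index in the document-term matrix, so `sorted(vocabulary, key=vocabulary.get, reverse=True)[:5]` returns the five terms with the highest indices, not the five most frequent ones (and the `counter` matrix is computed but never read). A frequency-based ranking could look like this sketch, where `keywords` is the filtered token list the function already builds:

```python
from collections import Counter

# Rank by actual term frequency rather than by CountVectorizer's
# vocabulary_ indices, which encode column positions, not counts.
def top_keywords_by_frequency(keywords, k=5):
    return [word for word, _ in Counter(keywords).most_common(k)]
```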
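In `topic_modeling`, LDA is fit on a single document, so the five topics it returns are largely indistinguishable, and `max_df=2, min_df=0.95` look swapped relative to the usual convention (`max_df` as a 0–1 proportion, `min_df` as an absolute document count). One hedged sketch that splits the transcript into pseudo-documents first; the 200-word chunk size is an assumption, and very short transcripts may still leave too few terms after pruning:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def topic_modeling_chunked(text, n_topics=5, chunk_words=200):
    # Split the transcript into fixed-size pseudo-documents so LDA has
    # more than one row to fit on.
    words = text.split()
    docs = [' '.join(words[i:i + chunk_words]) for i in range(0, len(words), chunk_words)]
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf = vectorizer.fit_transform(docs)
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online', random_state=42)
    lda.fit(tf)
    feature_names = vectorizer.get_feature_names_out()
    # Top five terms per topic, same slicing as the committed code.
    return [[feature_names[i] for i in topic.argsort()[:-6:-1]]
            for topic in lda.components_]
```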
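The dots in `extract_video_id`'s patterns are unescaped, so `r'youtu.be/'` also matches strings where any character sits between `youtu` and `be`. Escaping them is a small tightening; behavior on well-formed URLs is otherwise the same:

```python
import re

# Escaped dots match a literal '.' only.
VIDEO_ID_PATTERNS = [
    r'v=([^&]+)',                    # URLs with a 'v=' query parameter
    r'youtu\.be/([^?]+)',            # shortened youtu.be URLs
    r'youtube\.com/embed/([^?]+)',   # embed URLs
]

def extract_video_id(url):
    for pattern in VIDEO_ID_PATTERNS:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
```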
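Finally, the polarity and subjectivity ranges described in the sidebar come straight from TextBlob's `sentiment` property, which is easy to sanity-check in isolation:

```python
from textblob import TextBlob

# Quick check of the ranges the sidebar documents.
sentiment = TextBlob("I really enjoyed this clear, helpful tutorial.").sentiment
print(sentiment.polarity)      # in [-1, 1]; > 0 indicates positive sentiment
print(sentiment.subjectivity)  # in [0, 1]; closer to 1 indicates opinionated text
```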