Update app.py
app.py CHANGED
@@ -1,149 +1,158 @@
 import streamlit as st
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from transformers import pipeline
 from textblob import TextBlob
 import re
 import nltk

 # Ensure that necessary NLTK data is downloaded
 nltk.download('punkt')
 nltk.download('stopwords')
 nltk.download('wordnet')

 # Function to summarize text
 def summarize_text(text, max_length=80000):  # Increased max_length to 80,000
-    summarization_pipeline = pipeline("summarization")
+    summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
     summary = summarization_pipeline(text, max_length=max_length, min_length=100, do_sample=False)
     return summary[0]['summary_text']

 # Function to extract keywords
 def extract_keywords(text):
     stop_words = set(stopwords.words('english'))
     lemmatizer = WordNetLemmatizer()

     words = word_tokenize(text)
     words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalnum()]
     keywords = [word for word in words if word not in stop_words and len(word) > 1]

     counter = CountVectorizer().fit_transform([' '.join(keywords)])
     vocabulary = CountVectorizer().fit([' '.join(keywords)]).vocabulary_
     top_keywords = sorted(vocabulary, key=vocabulary.get, reverse=True)[:5]

     return top_keywords

 # Function to perform topic modeling
 def topic_modeling(text):
     vectorizer = CountVectorizer(max_df=2, min_df=0.95, stop_words='english')
     tf = vectorizer.fit_transform([text])
     lda_model = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online', random_state=42)
     lda_model.fit(tf)
     feature_names = vectorizer.get_feature_names_out()
     topics = []
     for topic_idx, topic in enumerate(lda_model.components_):
         topics.append([feature_names[i] for i in topic.argsort()[:-6:-1]])
     return topics

 # Function to extract YouTube video ID from URL
 def extract_video_id(url):
     video_id = None
     patterns = [
         r'v=([^&]+)',                 # Pattern for URLs with 'v=' parameter
         r'youtu.be/([^?]+)',          # Pattern for shortened URLs
         r'youtube.com/embed/([^?]+)'  # Pattern for embed URLs
     ]
     for pattern in patterns:
         match = re.search(pattern, url)
         if match:
             video_id = match.group(1)
             break
     return video_id

-#
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-st.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-st.
-
-st.
-
-
-
+# Function to fetch transcript with retries
+def get_transcript(video_id):
+    try:
+        # Attempt to fetch the transcript with language preference
+        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
+        return transcript
+    except TranscriptsDisabled:
+        st.error("Transcripts are disabled for this video. Try a different one.")
+    except NoTranscriptFound:
+        st.error("No transcript found for this video. Ensure it has captions enabled.")
+    except Exception as e:
+        st.error(f"An unexpected error occurred while fetching the transcript: {str(e)}")
+    return None
+
+# Main Streamlit app
+def main():
+    st.title("YouTube Video Summarizer")
+
+    # Sidebar
+    st.sidebar.title("App Features and Description")
+    st.sidebar.subheader("What does this app do?")
+    st.sidebar.write("""
+    - Extracts the transcript of a YouTube video.
+    - Summarizes the video content.
+    - Extracts keywords from the transcript.
+    - Performs topic modeling.
+    - Conducts sentiment analysis to show how positive or negative the content is.
+    """)
+
+    st.sidebar.subheader("Sentiment Analysis Explained")
+    st.sidebar.write("""
+    - **Polarity**: Measures how positive or negative the content is. Ranges from -1 to 1.
+      - Negative polarity (< 0) indicates negative sentiment.
+      - Positive polarity (> 0) indicates positive sentiment.
+      - Neutral polarity (0) indicates neutral sentiment.
+    - **Subjectivity**: Measures how subjective or objective the content is. Ranges from 0 to 1.
+      - A subjectivity score closer to 1 indicates personal opinions or beliefs.
+      - A score closer to 0 indicates factual information.
+    """)
+
+    # User input for YouTube video URL
+    video_url = st.text_input("Enter YouTube Video URL:", "")
+
+    # User customization options
+    max_summary_length = st.slider("Max Summary Length:", 1000, 80000, 50000)  # Increased max length to 80,000
+
+    if st.button("Summarize"):
+        try:
+            # Extract video ID from URL
+            video_id = extract_video_id(video_url)
+            if not video_id:
+                st.error("Invalid YouTube URL. Please enter a valid URL.")
+                return
+
+            # Get transcript of the video
+            transcript = get_transcript(video_id)
+            if not transcript:
+                return
+
+            video_text = ' '.join([line['text'] for line in transcript])
+
+            # Summarize the transcript
+            summary = summarize_text(video_text, max_length=max_summary_length)
+
+            # Extract keywords from the transcript
+            keywords = extract_keywords(video_text)
+
+            # Perform topic modeling
+            topics = topic_modeling(video_text)
+
+            # Perform sentiment analysis
+            sentiment = TextBlob(video_text).sentiment
+
+            # Display summarized text, keywords, topics, and sentiment
+            st.subheader("Video Summary:")
+            st.write(summary)
+
+            st.subheader("Keywords:")
+            st.write(keywords)
+
+            st.subheader("Topics:")
+            for idx, topic in enumerate(topics):
+                st.write(f"Topic {idx+1}: {', '.join(topic)}")
+
+            st.subheader("Sentiment Analysis:")
+            st.write(f"Polarity: {sentiment.polarity}")
+            st.write(f"Subjectivity: {sentiment.subjectivity}")
+
+        except Exception as e:
+            st.error(f"Error: {str(e)}")
+
+if __name__ == "__main__":
+    main()
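As a standalone illustration of the polarity and subjectivity ranges described in the sidebar text above, here is a minimal sketch that runs TextBlob directly on two sample sentences. The sentences and printed values are purely illustrative and the snippet is not part of app.py; it only assumes the textblob package is installed.

from textblob import TextBlob

# Polarity lies in [-1, 1]; subjectivity lies in [0, 1]
for text in [
    "I absolutely loved this tutorial, it was fantastic.",
    "The video is twelve minutes long.",
]:
    sentiment = TextBlob(text).sentiment
    print(f"{text!r} -> polarity={sentiment.polarity:.2f}, subjectivity={sentiment.subjectivity:.2f}")

The first sentence should come out with clearly positive polarity and high subjectivity, while the second should land near neutral polarity and low subjectivity, matching the interpretation given in the sidebar.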