fixed getting download clip
- ttv_web_scraper.py  +200 -237
- video_utils.py      +6 -4

ttv_web_scraper.py
CHANGED
@@ -5,33 +5,52 @@ import os
 import traceback
 from pyppeteer import launch
 from bs4 import BeautifulSoup, NavigableString
-import hashlib
 from ai_config_faiss import get_ai_assistant
 from video_utils import generate_clips
+from typing import Dict, List, Set, Optional
+from dataclasses import dataclass, asdict
+import logging

+# Set the TOKENIZERS_PARALLELISM environment variable
+os.environ["TOKENIZERS_PARALLELISM"] = "false"

+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)

+CACHE_DIR = "cache/"
 DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
 SUBJECTS = [
-    " 5G ",
-    " TechCo ",
+    " 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
+    " TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
 ]

+os.makedirs(CACHE_DIR, exist_ok=True)
+
+
+@dataclass
+class TranscriptSegment:
+    metadata: Dict[str, Optional[str]]
+    text: str
+
+
+@dataclass
+class VideoInfo:
+    metadata: Dict[str, Optional[str]]
+    transcript: List[TranscriptSegment]
+
+
+async def get_client_rendered_content(url: str) -> str:
     browser = None
     try:
         browser = await launch()
         page = await browser.newPage()
         await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
         await asyncio.sleep(5)
-        return content
+        return await page.content()
     except Exception as e:
+        logger.error(f"Error fetching content for {url}: {str(e)}")
+        raise
     finally:
         if browser:
             await browser.close()
@@ -47,7 +66,7 @@ def extract_text_with_br(element):
     return ''.join(result).strip()


-def extract_info(html_content):
+def extract_info(html_content: str) -> VideoInfo:
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         title = soup.title.string.strip() if soup.title else None
@@ -55,289 +74,233 @@ def extract_info(html_content):
         date = date_elem.find('span', class_='ng-binding').text.strip() if date_elem else None
         youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
         youtube_url = youtube_iframe['src'] if youtube_iframe else None
-        youtube_id = None
-        if youtube_url:
-            match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
-            if match:
-                youtube_id = match.group(1)
+        youtube_id = re.search(r'youtube\.com/embed/([^?]+)', youtube_url).group(1) if youtube_url else None
         transcript_elem = soup.find(id='transcript0')
         transcript = extract_text_with_br(transcript_elem) if transcript_elem else None
-            },
-            'transcript': transcript
-        }
+
+        return VideoInfo(
+            metadata={'title': title, 'date': date, 'youtube_id': youtube_id},
+            transcript=parse_transcript(transcript) if transcript else []
+        )
     except Exception as e:
+        logger.error(f"Error extracting information: {str(e)}")
+        raise


+def read_file(filename: str) -> Optional[str]:
     try:
         if os.path.exists(filename):
             with open(filename, 'r', encoding='utf-8') as f:
                 return f.read()
         return None
     except Exception as e:
+        logger.error(f"Error reading file {filename}: {str(e)}")
+        raise


-def extract_subject_info(text):
-    # Convert text to lowercase for case-insensitive matching
-    lower_text = text.lower()
-        subject for subject in SUBJECTS if subject.lower() in lower_text]
-    return found_subjects
+def extract_subject_info(text: str) -> List[str]:
+    return [subject for subject in SUBJECTS if subject.lower() in text.lower()]


-    except Exception as e:
-        raise Exception(f"Error extracting speaker info: {str(e)}")
+def extract_speaker_info(segment: str) -> Optional[Dict[str, Optional[str]]]:
+    pattern = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
+
+    match = re.match(pattern, segment)
+    return {key: value.strip() if value else None for key, value in match.groupdict().items()} if match else None


-        saved_info = None
-
-        pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'
-        segments = re.split(pattern, content)
-        segments = [segment.strip() for segment in segments if segment.strip()]
-
-        for i, segment in enumerate(segments):
-            speaker_info = extract_speaker_info(segment)
-            if speaker_info:
-                if speaker_info['speaker']:
-                    # Full speaker, company, timestamp format
-                    if saved_info:
-                        text = segments[i-1] if i > 0 else ""
-                        subjects = extract_subject_info(text)
-                        parsed_segments.append({
-                            'metadata': {
-                                'speaker': saved_info['speaker'],
-                                'company': saved_info['company'],
-                                'start_timestamp': saved_info['timestamp'],
-                                'end_timestamp': speaker_info['timestamp'],
-                                'subjects': subjects
-                            },
-                            'text': text
-                        })
-                    saved_info = speaker_info
-                else:
-                    # Standalone timestamp format
-                    if saved_info:
-                        text = segments[i-1] if i > 0 else ""
-                        subjects = extract_subject_info(text)
-                        parsed_segments.append({
-                            'metadata': {
-                                'speaker': saved_info['speaker'],
-                                'company': saved_info['company'],
-                                'start_timestamp': saved_info['timestamp'],
-                                'end_timestamp': speaker_info['timestamp'],
-                                'subjects': subjects
-                            },
-                            'text': text
-                        })
-                        saved_info['timestamp'] = speaker_info['timestamp']
-            elif saved_info:
-                # Text segment
-                continue
-
-        # Add final entry
-        if saved_info:
-            text = segments[-1]
-            subjects = extract_subject_info(text)
-            parsed_segments.append({
-                'metadata': {
-                    'speaker': saved_info['speaker'],
-                    'company': saved_info['company'],
-                    'start_timestamp': saved_info['timestamp'],
-                    'end_timestamp': "00:00:00",
-                    'subjects': subjects
-                },
-                'text': text
-            })
-
-        return parsed_segments
-    except Exception as e:
-        raise Exception(f"Error parsing transcript: {str(e)}")
+def parse_transcript(content: str) -> List[TranscriptSegment]:
+    parsed_segments = []
+    saved_info = None
+
+    segments = [segment.strip() for segment in re.split(r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)', content) if segment.strip()]
+
+    for i, segment in enumerate(segments):
+        speaker_info = extract_speaker_info(segment)
+        if speaker_info:
+            if speaker_info['speaker']:
+                if saved_info:
+                    text = segments[i-1] if i > 0 else ""
+                    parsed_segments.append(TranscriptSegment(
+                        metadata={
+                            'speaker': saved_info['speaker'],
+                            'company': saved_info['company'],
+                            'start_timestamp': saved_info['timestamp'],
+                            'end_timestamp': speaker_info['timestamp'],
+                            'subjects': extract_subject_info(text)
+                        },
+                        text=text
+                    ))
+                saved_info = speaker_info
+                if not saved_info['company']:
+                    saved_info['company'] = "Unknown"
+            else:
+                if saved_info:
+                    text = segments[i-1] if i > 0 else ""
+                    parsed_segments.append(TranscriptSegment(
+                        metadata={
+                            'speaker': saved_info['speaker'],
+                            'company': saved_info['company'],
+                            'start_timestamp': saved_info['timestamp'],
+                            'end_timestamp': speaker_info['timestamp'],
+                            'subjects': extract_subject_info(text)
+                        },
+                        text=text
+                    ))
+                    saved_info['timestamp'] = speaker_info['timestamp']
+        elif saved_info:
+            continue
+
+    if saved_info:
+        text = segments[-1]
+        parsed_segments.append(TranscriptSegment(
+            metadata={
+                'speaker': saved_info['speaker'],
+                'company': saved_info['company'],
+                'start_timestamp': saved_info['timestamp'],
+                'end_timestamp': "00:00:00",
+                'subjects': extract_subject_info(text)
+            },
+            text=text
+        ))
+
+    return parsed_segments


+def get_cached_filename(url: str) -> str:
+    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}"
+
+
-    cached_filename = get_cached_filename(url)
-    json_filename = f"{cached_filename}.json"
-    info = read_json_from_file(json_filename)
-
-    if info:
-        return info
-
-        content = await get_client_rendered_content(url)
-        with open(cached_filename, 'w', encoding='utf-8') as f:
-            f.write(content)
-    else:
-        print(f"Using cached content from file for {url}...")
-
-    info = extract_info(content)
-    transcript = info['transcript']
-    if (transcript):
-        info['transcript'] = parse_transcript(transcript)
-        generate_clips(CACHE_DIR, info)
-        with open(json_filename, 'w', encoding='utf-8') as f:
-            json.dump(info, f, ensure_ascii=False, indent=4)
-        print(f"Information extracted and saved to {json_filename}")
-    else:
-        print(f"No transcript found for {url}")
-    return info
-
-        return None
+async def process_url(url: str) -> Optional[VideoInfo]:
+    try:
+        cached_filename = get_cached_filename(url)
+        html_filename = f"{cached_filename}.html"
+        json_filename = f"{cached_filename}.json"
+
+        if os.path.exists(json_filename):
+            logger.info(f"Using cached JSON for {url}")
+            with open(json_filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            return VideoInfo(
+                metadata=data['metadata'],
+                transcript=[TranscriptSegment(**segment) for segment in data['transcript']]
+            )
+
+        if os.path.exists(html_filename):
+            logger.info(f"Using cached HTML for {url}")
+            content = read_file(html_filename)
+        else:
+            logger.info(f"Fetching content from web for {url}")
+            content = await get_client_rendered_content(url)
+            with open(html_filename, 'w', encoding='utf-8') as f:
+                f.write(content)
+
+        info = extract_info(content)
+
+        if info.transcript:
+            logger.info(f"Generating clips for {url}")
+            info_dict = asdict(info)
+            info_dict['transcript'] = generate_clips(CACHE_DIR, info_dict)
+            info = VideoInfo(
+                metadata=info_dict['metadata'],
+                transcript=[TranscriptSegment(**segment) for segment in info_dict['transcript']]
+            )
+
+            with open(json_filename, 'w', encoding='utf-8') as f:
+                json.dump(asdict(info), f, ensure_ascii=False, indent=4)
+
+            logger.info(f"Information extracted and saved to {json_filename}")
+        else:
+            logger.warning(f"No transcript found for {url}")
+
+        return info
+
+    except Exception:
+        logger.error(f"Error processing URL {url}:\n{traceback.format_exc()}")
+        return None


-    tasks = [process_url(url) for url in urls]
-    return await asyncio.gather(*tasks)
+async def process_urls(urls: List[str]) -> List[Optional[VideoInfo]]:
+    return await asyncio.gather(*[process_url(url) for url in urls])


-def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
-    metadata = {
-        'content_hashes': list(content_hashes),
-        'speakers': list(speakers),
-        'companies': {company: list(speakers) for company, speakers in companies.items()},
-        'sentiments': list(sentiments),
-        'subjects': list(subjects)
-    }
-
-    with open(DB_METADATA_FILE, 'w') as f:
-        json.dump(metadata, f, indent=2)
+def save_metadata_sets(processed_urls: Set[str], speakers: Set[str], companies: Dict[str, Set[str]], sentiments: Set[str], subjects: Set[str]):
+    metadata = {
+        'processed_urls': list(processed_urls),
+        'speakers': list(speakers),
+        'companies': {company: list(speakers) for company, speakers in companies.items()},
+        'sentiments': list(sentiments),
+        'subjects': list(subjects)
+    }
+
+    with open(DB_METADATA_FILE, 'w') as f:
+        json.dump(metadata, f, indent=2)


-def db_load_metadata_sets():
-    content_hashes = set()
-    speakers = set()
-    companies = {}
-    sentiments = set()
-    subjects = set()
-
-    if os.path.exists(DB_METADATA_FILE):
-        with open(DB_METADATA_FILE, 'r') as f:
-            metadata = json.load(f)
-
-        content_hashes = set(metadata.get('content_hashes', []))
-        speakers = set(metadata.get('speakers', []))
-        companies = {company: set(speakers) for company, speakers in metadata.get(
-            'companies', {}).items()}
-        sentiments = set(metadata.get('sentiments', []))
-        subjects = set(metadata.get('subjects', SUBJECTS))
-
-    return content_hashes, speakers, companies, sentiments, subjects
+def db_load_metadata_sets() -> tuple:
+    if os.path.exists(DB_METADATA_FILE):
+        with open(DB_METADATA_FILE, 'r') as f:
+            metadata = json.load(f)
+
+        return (
+            set(metadata.get('processed_urls', [])),
+            set(metadata.get('speakers', [])),
+            {company: set(speakers) for company, speakers in metadata.get('companies', {}).items()},
+            set(metadata.get('sentiments', [])),
+            set(metadata.get('subjects', SUBJECTS))
+        )
+
+    return set(), set(), {}, set(), set(SUBJECTS)


-def main():
-    global assistant
+async def main():
     assistant = get_ai_assistant()
-    url_file = "dsp-urls-one.txt"  # File containing list of URLs
+    url_file = "dsp-urls-one.txt"

     if not os.path.exists(url_file):
+        logger.error(f"Error: {url_file} not found.")
         return

-    # Convert companies to a dictionary of speaker sets if it's not already
-    if not isinstance(companies, dict):
-        companies = {company: set() for company in companies}
+    processed_urls, speakers, companies, sentiments, subjects = db_load_metadata_sets()

     with open(url_file, 'r') as f:
         urls = [line.strip() for line in f if line.strip()]

-        if filename_hash in content_hashes:
-            print(f"{url} already added")
+    total_urls = len(urls)
+    for i, url in enumerate(urls, 1):
+        if url in processed_urls:
+            logger.info(f"[{i}/{total_urls}] {url} already processed")
             continue

+        logger.info(f"[{i}/{total_urls}] Processing {url}")
+        info = await process_url(url)
         if info is None:
+            logger.warning(f"[{i}/{total_urls}] Failed to process {url}")
             continue

-            company = metadata['company']
-            speaker = metadata['speaker']
-            entry_subjects = metadata['subjects']
-
-            speakers.add(speaker)
-            # Add new subjects to the master set
+        for entry in info.transcript:
+            metadata = {**info.metadata, **entry.metadata}
+            company = metadata.get('company')
+            speaker = metadata.get('speaker')
+            entry_subjects = metadata.get('subjects', [])
+
+            if speaker:
+                speakers.add(speaker)
             subjects.update(entry_subjects)

-            assistant.add_to_knowledge_base(
-                text, data_type='text', metadata=metadata.copy())
+            assistant.add_to_knowledge_base(entry.text, data_type='text', metadata=metadata.copy())

-                companies[company].add(speaker)
+            if company and speaker:
+                companies.setdefault(company, set()).add(speaker)

-            # Save updated hashes and metadata
-    save_metadata_sets(content_hashes, speakers,
-                       companies, sentiments, subjects)
+        processed_urls.add(url)
+        logger.info(f"[{i}/{total_urls}] Added new url: {url}")
+
+    save_metadata_sets(processed_urls, speakers, companies, sentiments, subjects)
     assistant.save()

+    logger.info("Processing complete. Check logs for any errors.")
+

 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
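
A minimal driver sketch for exercising the refactored entry points outside of main(). It assumes only what the diff above introduces (process_urls returning VideoInfo dataclasses with TranscriptSegment entries) plus that this file is importable as ttv_web_scraper; the URL is a placeholder, not a real page.

import asyncio

from ttv_web_scraper import process_urls  # assumes ttv_web_scraper.py is on the import path


async def demo():
    # Placeholder URL; substitute a page that embeds a YouTube iframe and a #transcript0 element.
    urls = ["https://example.com/some-video-page"]
    results = await process_urls(urls)
    for info in results:
        if info is None:
            continue  # process_url logs the traceback and returns None on failure
        print(info.metadata.get('title'), info.metadata.get('youtube_id'))
        for segment in info.transcript:
            print(segment.metadata.get('speaker'), segment.metadata.get('start_timestamp'))


if __name__ == "__main__":
    asyncio.run(demo())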
video_utils.py
CHANGED
@@ -111,9 +111,9 @@ def main():
 def generate_clips(cache_dir, info):
     yt_id = info['metadata']['youtube_id']
     download_file = get_youtube_video(cache_dir, yt_id)
+    transcript = info['transcript']

     if download_file:
-        transcript = info['transcript']
         video = VideoFileClip(download_file)

         for entry in transcript:
@@ -127,9 +127,6 @@ def generate_clips(cache_dir, info):
             end_time = min(video.duration, end_time +
                            1) if end_time != 0 else video.duration

-            # Create clip
-            clip = video.subclip(start_time, end_time)
-
             # Generate output filename
             output_filename = (
                 f"{CLIP_DIR}{yt_id}-"
@@ -140,6 +137,9 @@ def generate_clips(cache_dir, info):

             if os.path.exists(output_filename):
                 continue
+
+            # Create clip
+            clip = video.subclip(start_time, end_time)

             # Write the clip to a file
             clip.write_videofile(
@@ -151,6 +151,8 @@ def generate_clips(cache_dir, info):
         video.close()
     else:
         print(f"Failed to download video for YouTube ID: {yt_id}")
+
+    return transcript


 if __name__ == "__main__":
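
The behavioural change here: the transcript is read before the download check, the subclip is only created after confirming the output file does not already exist, and generate_clips now returns the transcript so process_url can assign it back into info_dict['transcript']. A standalone sketch of that contract, assuming a previously cached JSON file; the path below is a placeholder, not a file this commit creates.

import json

from video_utils import generate_clips

# Placeholder path; real cache files are written by process_url() as "<cached name>.json".
with open("cache/cached_example.json", "r", encoding="utf-8") as f:
    info = json.load(f)  # expected shape: {'metadata': {'youtube_id': ...}, 'transcript': [...]}

# Returns the transcript list whether or not the video download succeeds.
transcript = generate_clips("cache/", info)
print(f"{len(transcript)} transcript segments handled")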