speech_recognize

Runtime error

App Files Files Community

speech_recognize / app.py

mr2along

Update app.py

e45342e verified 9 months ago

raw

history blame

7.06 kB

	import requests
	from bs4 import BeautifulSoup
	from concurrent.futures import ThreadPoolExecutor, as_completed
	import re
	import pypub
	import os
	import time # Thư viện để tính thời gian
	import gradio as gr # Thêm thư viện Gradio

	# Parse a story URL into the API URL and the chapter base URL.
	def parse_story_url(story_url):
	    """Split a truyenfull.tv story URL into its name, ID and derived URLs.

	    The expected shape is ``https://truyenfull.tv/<name>[-f<digits>].<id>/``.
	    The story name is captured lazily so that the optional ``-f<digits>``
	    suffix is actually excluded from it; with a greedy capture the optional
	    group could never match and the suffix leaked into the chapter base URL.
	    Surrounding whitespace and a missing trailing slash are tolerated.

	    Args:
	        story_url: URL of the story's main page.

	    Returns:
	        A tuple ``(story_name, story_id, api_url, base_url)`` where
	        ``api_url`` is the chapter-list endpoint for ``story_id`` and
	        ``base_url`` is the prefix to which a chapter number and ``.html``
	        are appended.

	    Raises:
	        ValueError: If the URL does not match the expected format.
	    """
	    match = re.search(
	        r"https://truyenfull\.tv/([^/]+?)(?:-f\d+)?\.(\d+)(?:/|$)",
	        story_url.strip(),
	    )
	    if match is None:
	        raise ValueError("URL không hợp lệ")

	    story_name, story_id = match.groups()
	    api_url = f"https://truyenfull.tv/api/chapters/{story_id}/"
	    base_url = f"https://truyenfull.tv/{story_name}/chuong-"
	    return story_name, story_id, api_url, base_url

	# Fetch the chapter list from the API.
	def get_chapter_info(api_url, timeout=30):
	    """Return the chapter entries listed by the chapter API.

	    Args:
	        api_url: Chapter-list endpoint, as built by ``parse_story_url``.
	        timeout: Seconds to wait for the server before giving up. Without
	            a timeout ``requests`` may block indefinitely on a stalled
	            connection.

	    Returns:
	        The value of the ``items`` field of the JSON response, or an empty
	        list when the field is missing or empty/null.

	    Raises:
	        requests.RequestException: On connection errors, timeouts, or a
	            non-success HTTP status (via ``raise_for_status``).
	    """
	    response = requests.get(api_url, timeout=timeout)
	    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
	    # `or []` also covers an explicit JSON null, so callers always get a list.
	    return response.json().get('items') or []

	# Fetch the text of a single chapter by its number.
	def get_chapter_content(chapter_index, base_url, timeout=30):
	    """Download one chapter page and return its text content.

	    This is deliberately best-effort: any failure is printed and turned
	    into a placeholder string so that one bad chapter does not abort a
	    whole download.

	    Args:
	        chapter_index: Chapter number appended to ``base_url``.
	        base_url: URL prefix; the page fetched is
	            ``base_url + str(chapter_index) + ".html"``.
	        timeout: Seconds to wait for the server before giving up, so a
	            stalled request cannot block a worker forever.

	    Returns:
	        The text of the ``div#chapter-c.chapter-c`` element with newlines
	        between text nodes, or a placeholder message when the element is
	        missing or the request fails.
	    """
	    chapter_url = f"{base_url}{chapter_index}.html"
	    try:
	        response = requests.get(chapter_url, timeout=timeout)
	        response.raise_for_status()
	        soup = BeautifulSoup(response.content, 'html.parser')
	        content_div = soup.find('div', id='chapter-c', class_='chapter-c')
	        if not content_div:
	            return "Không tìm thấy nội dung chương."
	        return content_div.get_text(separator='\n').strip()
	    except Exception as e:  # best-effort boundary: report and keep going
	        print(f"Lỗi khi lấy nội dung chương {chapter_index}: {e}")
	        return "Không thể lấy nội dung."

	# Helper: fetch one chapter and measure how long the fetch itself took.
	def _fetch_chapter_timed(chapter_number, base_url):
	    """Fetch one chapter; return ``(content, seconds_spent_fetching)``."""
	    started = time.perf_counter()
	    content = get_chapter_content(chapter_number, base_url)
	    return content, time.perf_counter() - started


	# Download a range of chapters and save them to a TXT file and an EPUB.
	def get_all_chapters_content(story_url, start_chapter, max_chapters):
	    """Download chapters of a story in parallel and write them to disk.

	    Chapters ``start_chapter`` .. ``start_chapter + max_chapters - 1``
	    (1-based, clipped to what the API lists) are fetched with a thread
	    pool, written in order to ``<story_name>.txt`` and passed to
	    ``create_epub_from_chapters``.

	    Args:
	        story_url: URL of the story's main page.
	        start_chapter: 1-based number of the first chapter to download;
	            values below 1 are treated as 1.
	        max_chapters: Maximum number of chapters to download; negative
	            values are treated as 0.

	    Returns:
	        The string ``"Không tìm thấy chương nào."`` when there is nothing
	        to download, otherwise a two-item list
	        ``[status_message, epub_result]`` where ``epub_result`` is whatever
	        ``create_epub_from_chapters`` returned.

	    Raises:
	        ValueError: If ``story_url`` is not a valid story URL.
	    """
	    story_name, _story_id, api_url, base_url = parse_story_url(story_url)

	    chapters = get_chapter_info(api_url)
	    if not chapters:
	        return "Không tìm thấy chương nào."

	    # Clamp the range so a zero/negative start cannot turn into a
	    # from-the-end slice index.
	    first_number = max(start_chapter, 1)
	    wanted = max(max_chapters, 0)
	    chapters_to_load = chapters[first_number - 1:first_number - 1 + wanted]
	    if not chapters_to_load:
	        return "Không tìm thấy chương nào."

	    chapter_contents = []  # (chapter number, text, title) tuples
	    download_started = time.perf_counter()

	    # The work is network-bound, so threads overlap the waiting time.
	    with ThreadPoolExecutor(max_workers=10) as executor:
	        # Number chapters from `first_number` so that the page fetched and
	        # the title used both belong to the requested chapter.
	        future_to_chapter = {
	            executor.submit(_fetch_chapter_timed, number, base_url): (number, chapter)
	            for number, chapter in enumerate(chapters_to_load, start=first_number)
	        }
	        for future in as_completed(future_to_chapter):
	            chapter_index, chapter = future_to_chapter[future]
	            try:
	                content, chapter_time = future.result()
	                chapter_contents.append((chapter_index, content, chapter['chapter_name']))
	            except Exception as e:
	                # Skip this chapter but keep downloading the others.
	                print(f"Lỗi khi lấy nội dung chương {chapter_index}: {e}")
	                continue
	            print(f"Đã lưu chương {chapter_index}")
	            print(f"Thời gian tải chương {chapter_index}: {chapter_time:.2f} giây")

	    # Wall-clock time of the whole parallel download; the average is taken
	    # over the chapters that were actually stored.
	    total_time = time.perf_counter() - download_started
	    downloaded = len(chapter_contents)
	    avg_time_per_chapter = total_time / downloaded if downloaded else 0
	    print(f"Tổng thời gian tải {downloaded} chương: {total_time:.2f} giây")
	    print(f"Thời gian trung bình cho mỗi chương: {avg_time_per_chapter:.2f} giây")

	    # Futures finish in arbitrary order; restore chapter order before writing.
	    chapter_contents.sort(key=lambda item: item[0])
	    output_file = f"{story_name}.txt"
	    with open(output_file, 'w', encoding='utf-8') as f:
	        for _number, content, chapter_title in chapter_contents:
	            f.write(f"{chapter_title}\n\n")
	            f.write(f"{content}\n")
	            f.write("-" * 50 + "\n")

	    epubfile = create_epub_from_chapters(chapter_contents, story_name)

	    return [
	        f"Đã tải thành công {downloaded} chương. Tổng thời gian: {total_time:.2f} giây, Thời gian trung bình: {avg_time_per_chapter:.2f} giây. File TXT: {output_file}",
	        epubfile,
	    ]

	# Build an EPUB file from the downloaded chapters.
	def create_epub_from_chapters(chapter_contents, story_name):
	    """Create ``./<story_name>.epub`` from the given chapters.

	    EPUB creation is best-effort: any error is printed and ``None`` is
	    returned, so the caller still gets its TXT output. Previously the
	    result variable was left unbound on failure and the final ``return``
	    raised ``UnboundLocalError``.

	    Args:
	        chapter_contents: Iterable of ``(chapter_index, content,
	            chapter_title)`` tuples, already in the desired order.
	        story_name: Used as the book title and as the output file stem.

	    Returns:
	        Whatever ``Epub.create`` returns (presumably the path of the
	        written file - confirm against the installed pypub version), or
	        ``None`` when creation failed.
	    """
	    epubfile = None  # stays None if anything below fails
	    try:
	        my_epub = pypub.Epub(story_name)

	        # One EPUB chapter per downloaded chapter, titled like the source.
	        for _chapter_index, content, chapter_title in chapter_contents:
	            my_chapter = pypub.create_chapter_from_text(content, chapter_title)
	            my_epub.add_chapter(my_chapter)

	        output_directory = f"./{story_name}.epub"
	        epubfile = my_epub.create(output_directory)
	        print(f"Đã tạo file EPUB: {output_directory}")
	    except Exception as e:
	        print(f"Lỗi khi tạo file EPUB: {e}")
	    return epubfile
	# Gradio callback.
	def gradio_interface(story_url, start_chapter, max_chapters):
	    """Run a download for the Gradio UI and report the total elapsed time.

	    ``get_all_chapters_content`` returns either a plain status string or a
	    two-item ``[message, epub_file]`` list. Both shapes are normalised here
	    so that exactly one text value and one file value are returned, which
	    matches the two output components of the interface. (Appending a string
	    to the list with ``+=`` would instead extend it character by character.)

	    Args:
	        story_url: URL of the story's main page.
	        start_chapter: Number of the first chapter; converted with ``int``.
	        max_chapters: Number of chapters to download; converted with ``int``.

	    Returns:
	        A ``(message, epub_file)`` tuple; ``epub_file`` is ``None`` when no
	        EPUB was produced.

	    Raises:
	        ValueError: If a chapter field is not an integer or the URL is
	            rejected by the downloader.
	    """
	    start_total_time = time.time()
	    result = get_all_chapters_content(story_url, int(start_chapter), int(max_chapters))
	    total_process_time = time.time() - start_total_time

	    if isinstance(result, str):
	        # Nothing was downloaded: only a status message, no file.
	        message, epub_file = result, None
	    else:
	        message, epub_file = result

	    message += f"\nTổng thời gian hoàn thành tất cả các chức năng: {total_process_time:.2f} giây"
	    return message, epub_file

	# Build the Gradio UI and start serving it as soon as the module runs.
	gr.Interface(
	    title="Truyện Full Downloader",
	    description="Công cụ tải truyện từ truyenfull.tv và tạo file EPUB.",
	    fn=gradio_interface,
	    # Three free-text fields: story URL, first chapter, chapter count.
	    inputs=[
	        gr.Textbox(
	            label="URL Truyện",
	            placeholder="Nhập URL của truyện từ truyenfull.tv",
	        ),
	        gr.Textbox(
	            label="Số chương bắt đầu",
	            placeholder="Nhập số chương bắt đầu",
	        ),
	        gr.Textbox(
	            label="Số chương muốn tải",
	            placeholder="Nhập số chương muốn tải",
	        ),
	    ],
	    # A status message plus a downloadable file.
	    outputs=["text", "file"],
	).launch()