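"""All-in-One News Generator.

Gradio app that drafts news articles with OpenAI from a mix of sources:
uploaded audio/video (transcribed with Whisper), social media URLs,
documents (PDF, DOCX, XLSX, CSV) and web pages.
"""
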
import os
import openai
import whisper
import tempfile
import gradio as gr
from pydub import AudioSegment
import fitz # PyMuPDF for handling PDFs
import docx # For handling .docx files
import pandas as pd # For handling .xlsx and .csv files
import requests
from bs4 import BeautifulSoup
from moviepy.editor import VideoFileClip
import yt_dlp
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Configure your OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")
# Load the highest quality Whisper model once
model = whisper.load_model("large")
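
# Note: the "large" checkpoint loaded above gives the best transcription quality
# but needs several GB of memory; a smaller Whisper model (e.g. "medium" or
# "base") trades accuracy for speed and RAM if resources are tight.
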
def download_social_media_video(url):
    """Downloads a video from a social media URL and extracts its audio track as MP3."""
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': '%(id)s.%(ext)s',
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            audio_file = f"{info_dict['id']}.mp3"
        logger.info(f"Video successfully downloaded: {audio_file}")
        return audio_file
    except Exception as e:
        logger.error(f"Error downloading video: {str(e)}")
        raise

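# Note: the FFmpegExtractAudio postprocessor above requires ffmpeg on the host;
# the extracted MP3 is written to the working directory as "<video id>.mp3".
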
def convert_video_to_audio(video_file):
    """Converts a video file to audio."""
    try:
        video = VideoFileClip(video_file)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
            video.audio.write_audiofile(temp_file.name)
        logger.info(f"Video converted to audio: {temp_file.name}")
        return temp_file.name
    except Exception as e:
        logger.error(f"Error converting video to audio: {str(e)}")
        raise

def preprocess_audio(audio_file):
    """Preprocesses the audio file: normalizes loudness to about -20 dBFS and re-exports it as MP3."""
    try:
        audio = AudioSegment.from_file(audio_file)
        # audio.dBFS is the clip's current average loudness, so this gain
        # shifts it to roughly -20 dBFS (e.g. a clip at -35 dBFS gets +15 dB).
        audio = audio.apply_gain(-audio.dBFS + (-20))
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
            audio.export(temp_file.name, format="mp3")
        logger.info(f"Audio preprocessed: {temp_file.name}")
        return temp_file.name
    except Exception as e:
        logger.error(f"Error preprocessing audio file: {str(e)}")
        raise

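# Illustrative call shapes handled by transcribe_audio() below (the paths and
# URL are hypothetical):
#
#   transcribe_audio("https://www.tiktok.com/@user/video/123")  # remote video URL
#   transcribe_audio("interview.mp4")                           # local video file
#   transcribe_audio("statement.mp3")                           # local audio file
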
def transcribe_audio(file):
    """Transcribes an audio file, a local video file, or a video URL."""
    try:
        if isinstance(file, str) and file.startswith('http'):
            logger.info(f"Downloading social media video: {file}")
            file_path = download_social_media_video(file)
        elif isinstance(file, str) and file.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
            logger.info(f"Converting local video to audio: {file}")
            file_path = convert_video_to_audio(file)
        else:
            logger.info(f"Preprocessing audio file: {file}")
            file_path = preprocess_audio(file)

        logger.info(f"Transcribing audio: {file_path}")
        result = model.transcribe(file_path)
        transcription = result.get("text", "Error in transcription")
        logger.info(f"Transcription completed: {transcription[:50]}...")
        return transcription
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        return f"Error processing file: {str(e)}"

def read_document(document_path):
    """Reads content from PDF, DOCX, XLSX or CSV documents."""
    try:
        if document_path.endswith(".pdf"):
            doc = fitz.open(document_path)
            return "\n".join([page.get_text() for page in doc])
        elif document_path.endswith(".docx"):
            doc = docx.Document(document_path)
            return "\n".join([paragraph.text for paragraph in doc.paragraphs])
        elif document_path.endswith(".xlsx"):
            return pd.read_excel(document_path).to_string()
        elif document_path.endswith(".csv"):
            return pd.read_csv(document_path).to_string()
        else:
            return "Unsupported file type. Please upload a PDF, DOCX, XLSX or CSV document."
    except Exception as e:
        return f"Error reading document: {str(e)}"

def read_url(url):
    """Reads content from a URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()
    except Exception as e:
        return f"Error reading URL: {str(e)}"

def process_social_content(url):
    """Processes content from a social media URL, handling both text and video."""
    try:
        # First, try to read content as text
        text_content = read_url(url)

        # Then, try to process as video
        try:
            video_content = transcribe_audio(url)
        except Exception:
            video_content = None

        return {
            "text": text_content,
            "video": video_content
        }
    except Exception as e:
        logger.error(f"Error processing social content: {str(e)}")
        return None

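# process_social_content() returns a dict of the form
#   {"text": <page text or error string>, "video": <audio transcription or None>}
# or None if the URL could not be processed at all.
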
def generate_news(instructions, facts, size, tone, *args):
    """Generates a news article from instructions, facts, URLs, documents, transcriptions, and social media content."""
    knowledge_base = {
        "instructions": instructions,
        "facts": facts,
        "document_content": [],
        "audio_data": [],
        "url_content": [],
        "social_content": []
    }

    num_audios = 5 * 3  # 5 audios/videos * 3 fields (file, name, position)
    num_social_urls = 3 * 3  # 3 social media URLs * 3 fields (URL, name, context)
    num_urls = 5  # 5 general URLs

    audios = args[:num_audios]
    social_urls = args[num_audios:num_audios + num_social_urls]
    urls = args[num_audios + num_social_urls:num_audios + num_social_urls + num_urls]
    documents = args[num_audios + num_social_urls + num_urls:]

    for url in urls:
        if url:
            knowledge_base["url_content"].append(read_url(url))

    for document in documents:
        if document is not None:
            # gr.File(type="filepath") yields a plain path string; older Gradio
            # versions return a temp-file wrapper that exposes the path via .name.
            document_path = document if isinstance(document, str) else document.name
            knowledge_base["document_content"].append(read_document(document_path))

    for i in range(0, len(audios), 3):
        audio_file, name, position = audios[i:i + 3]
        if audio_file is not None:
            knowledge_base["audio_data"].append({"audio": audio_file, "name": name, "position": position})

    for i in range(0, len(social_urls), 3):
        social_url, social_name, social_context = social_urls[i:i + 3]
        if social_url:
            social_content = process_social_content(social_url)
            if social_content:
                knowledge_base["social_content"].append({
                    "url": social_url,
                    "name": social_name,
                    "context": social_context,
                    "text": social_content["text"],
                    "video": social_content["video"]
                })
                logger.info(f"Social media content processed: {social_url}")

    transcriptions_text, raw_transcriptions = "", ""

    for idx, data in enumerate(knowledge_base["audio_data"]):
        if data["audio"] is not None:
            transcription = transcribe_audio(data["audio"])
            transcription_text = f'"{transcription}" - {data["name"]}, {data["position"]}'
            raw_transcription = f'[Audio/Video {idx + 1}]: "{transcription}" - {data["name"]}, {data["position"]}'
            transcriptions_text += transcription_text + "\n"
            raw_transcriptions += raw_transcription + "\n\n"

    for data in knowledge_base["social_content"]:
        if data["text"]:
            transcription_text = f'[Social media text]: "{data["text"][:200]}..." - {data["name"]}, {data["context"]}'
            transcriptions_text += transcription_text + "\n"
            raw_transcriptions += transcription_text + "\n\n"
        if data["video"]:
            transcription_video = f'[Social media video]: "{data["video"]}" - {data["name"]}, {data["context"]}'
            transcriptions_text += transcription_video + "\n"
            raw_transcriptions += transcription_video + "\n\n"

    document_content = "\n\n".join(knowledge_base["document_content"])
    url_content = "\n\n".join(knowledge_base["url_content"])

    internal_prompt = """
Instructions for the model:
- Follow news article principles: answer the 5 Ws in the first paragraph (Who?, What?, When?, Where?, Why?).
- Ensure at least 80% of quotes are direct and in quotation marks.
- The remaining 20% can be indirect quotes.
- Don't invent new information.
- Be rigorous with provided facts.
- When processing uploaded documents, extract and highlight important quotes and testimonials from sources.
- When processing uploaded documents, extract and highlight key figures.
- Avoid using the date at the beginning of the news body. Start directly with the 5 Ws.
- Include social media content relevantly, citing the source and providing proper context.
- Make sure to relate the provided context for social media content with its corresponding transcription or text.
"""

    prompt = f"""
{internal_prompt}
Write a news article with the following information, including a title, a 15-word hook (additional information that complements the title), and the content body with {size} words. The tone should be {tone}.
Instructions: {knowledge_base["instructions"]}
Facts: {knowledge_base["facts"]}
Additional content from documents: {document_content}
Additional content from URLs: {url_content}
Use the following transcriptions as direct and indirect quotes (without changing or inventing content):
{transcriptions_text}
"""

    try:
        # Note: this uses the legacy (pre-1.0) openai client interface.
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )
        news = response['choices'][0]['message']['content']
        return news, raw_transcriptions
    except Exception as e:
        logger.error(f"Error generating news article: {str(e)}")
        return f"Error generating news article: {str(e)}", ""

with gr.Blocks() as demo:
    gr.Markdown("## All-in-One News Generator")

    # Add tool description and attribution
    gr.Markdown("""
### About this tool

This AI-powered news generator helps journalists and content creators produce news articles by processing multiple types of input:

- Audio and video files with automatic transcription
- Social media content
- Documents (PDF, DOCX, XLSX, CSV)
- Web URLs

The tool uses advanced AI to generate well-structured news articles following journalistic principles and maintaining the integrity of source quotes.

Created by [Camilo Vega](https://www.linkedin.com/in/camilo-vega-169084b1/), AI Consultant
""")

    with gr.Row():
        with gr.Column(scale=2):
            instructions = gr.Textbox(label="News article instructions", lines=2)
            facts = gr.Textbox(label="Describe the news facts", lines=4)
            size = gr.Number(label="Content body size (in words)", value=100)
            tone = gr.Dropdown(label="News tone", choices=["serious", "neutral", "lighthearted"], value="neutral")
        with gr.Column(scale=3):
            inputs_list = [instructions, facts, size, tone]
            with gr.Tabs():
                for i in range(1, 6):
                    with gr.TabItem(f"Audio/Video {i}"):
                        file = gr.File(label=f"Audio/Video {i}", type="filepath", file_types=["audio", "video"])
                        name = gr.Textbox(label="Name", scale=1)
                        position = gr.Textbox(label="Position", scale=1)
                        inputs_list.extend([file, name, position])
                for i in range(1, 4):
                    with gr.TabItem(f"Social Media {i}"):
                        social_url = gr.Textbox(label=f"Social media URL {i}", lines=1)
                        social_name = gr.Textbox(label=f"Person/account name {i}", scale=1)
                        social_context = gr.Textbox(label=f"Content context {i}", lines=2)
                        inputs_list.extend([social_url, social_name, social_context])
                for i in range(1, 6):
                    with gr.TabItem(f"URL {i}"):
                        url = gr.Textbox(label=f"URL {i}", lines=1)
                        inputs_list.append(url)
                for i in range(1, 6):
                    with gr.TabItem(f"Document {i}"):
                        document = gr.File(label=f"Document {i}", type="filepath", file_count="single")
                        inputs_list.append(document)

    gr.Markdown("---")  # Visual separator

    with gr.Row():
        transcriptions_output = gr.Textbox(label="Transcriptions", lines=10)

    gr.Markdown("---")  # Visual separator

    with gr.Row():
        generate = gr.Button("Generate Draft")

    with gr.Row():
        news_output = gr.Textbox(label="Generated Draft", lines=20)

    generate.click(fn=generate_news, inputs=inputs_list, outputs=[news_output, transcriptions_output])

# share=True requests a temporary public Gradio link; it is generally unnecessary
# when the app is hosted on Hugging Face Spaces.
demo.launch(share=True)