import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO
def extract_article_info(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract meta title
        meta_title = soup.find('title').get_text(strip=True) if soup.find('title') else None

        # Extract meta description
        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']

        # Extract heading
        heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None

        # Extract subheadings
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Extract all text from <p> tags and add two breaks between paragraphs
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)

        # Combine heading and subheadings with article text
        full_article_text = f"{heading}\n\n" if heading else ""
        for subheading in subheadings:
            full_article_text += f"{subheading}\n\n"
        full_article_text += article_text

        return full_article_text
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"
def process_excel(file):
    # Read the uploaded Excel file
    df = pd.read_excel(file)
    if 'URL' in df.columns:
        # Apply extract_article_info to each URL and store the result in a new column
        df['Article Text'] = df['URL'].apply(extract_article_info)

        # Save the updated DataFrame to a BytesIO object to prepare it for download
        output = BytesIO()
        df.to_excel(output, index=False)
        output.seek(0)
        return output
    else:
        return None
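
# A sketch of using process_excel without the Streamlit UI, assuming a local
# workbook named "urls.xlsx" with a 'URL' column (the filenames here are just
# illustrative placeholders):
#
#   with open("urls.xlsx", "rb") as f:
#       result = process_excel(f)
#   if result is not None:
#       with open("updated_file.xlsx", "wb") as out:
#           out.write(result.getvalue())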
def main():
    st.title("Excel URL Processor")
    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")

    # Upload Excel file
    uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])

    if uploaded_file:
        # Process the file
        processed_file = process_excel(uploaded_file)
        if processed_file:
            st.success("File processed successfully!")
            st.download_button(
                label="Download Modified Excel File",
                data=processed_file,
                file_name="updated_file.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.error("The uploaded file does not contain a column named 'URL'.")


if __name__ == "__main__":
    main()
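
# To run locally (assuming this file is saved as app.py; the filename is an
# assumption, not given by the source):
#
#   pip install streamlit pandas requests beautifulsoup4 openpyxl
#   streamlit run app.py
#
# openpyxl is listed because pandas relies on it to read and write .xlsx files.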