articles / app.py
abdulllah01's picture
Create app.py
174c7c2 verified
raw
history blame
2.89 kB
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO
def extract_article_info(url, *, timeout=10):
    """Fetch a web page and return its headings and paragraph text as one string.

    The result is the text of the first <h1> (if it has any), followed by
    the text of every <h2>, followed by the text of every <p>.  Each heading
    is followed by a blank line and paragraphs are separated by blank lines.

    Failures are reported in the returned string instead of being raised,
    so a single bad URL does not abort a batch that applies this function
    to many URLs.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.  Without
            a limit, one unresponsive host would block the caller forever.

    Returns:
        The combined article text, or a message starting with
        "Error fetching the URL:" (network/HTTP failure, including a
        timeout) or "Error processing the content:" (any other failure).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look the <h1> up once instead of searching the tree twice.
        h1_tag = soup.find('h1')
        heading = h1_tag.get_text(strip=True) if h1_tag else None

        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]

        # An empty or missing <h1> contributes nothing; every <h2> is kept
        # (even when empty) so the output layout matches the page structure.
        lead_lines = ([heading] if heading else []) + subheadings
        prefix = "".join(f"{line}\n\n" for line in lead_lines)
        return prefix + "\n\n".join(paragraphs)
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        # Deliberate catch-all: the caller stores the message in the output
        # sheet rather than aborting the whole run.
        return f"Error processing the content: {e}"
def process_excel(file):
    """Add an 'Article Text' column to an uploaded workbook and return it.

    The workbook is read with pandas.  When it has a 'URL' column, each
    value in that column is passed to ``extract_article_info`` and the
    result is stored in a new 'Article Text' column.

    Args:
        file: Path or file-like object accepted by ``pd.read_excel``.

    Returns:
        A ``BytesIO`` positioned at the start and holding the updated
        workbook (written without the index), or ``None`` when the
        workbook has no 'URL' column.
    """
    frame = pd.read_excel(file)

    # Nothing to do without the expected column; the caller reports this.
    if 'URL' not in frame.columns:
        return None

    frame['Article Text'] = frame['URL'].apply(extract_article_info)

    # Serialise into memory so the result can be offered as a download.
    buffer = BytesIO()
    frame.to_excel(buffer, index=False)
    buffer.seek(0)  # rewind so readers start at the first byte
    return buffer
def main():
    """Render the page: upload a workbook, process it, offer the result.

    Shows a title and instructions, then a file uploader restricted to
    ``.xlsx``.  Once a file is present it is handed to ``process_excel``;
    a download button is shown on success and an error message when the
    workbook has no 'URL' column.
    """
    st.title("Excel URL Processor")
    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")

    workbook_upload = st.file_uploader("Choose an Excel file", type=["xlsx"])
    if not workbook_upload:
        # Nothing uploaded yet; leave only the uploader on screen.
        return

    result = process_excel(workbook_upload)
    if not result:
        st.error("The uploaded file does not contain a column named 'URL'.")
        return

    st.success("File processed successfully!")
    st.download_button(
        label="Download Modified Excel File",
        data=result,
        file_name="updated_file.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
# Build the UI only when this file is executed as a script, so that
# importing the module does not call main().
if __name__ == "__main__":
    main()