import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO
def extract_article_info(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract meta title
        meta_title = soup.find('title').get_text(strip=True) if soup.find('title') else None

        # Extract meta description
        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']

        # Extract heading
        heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None

        # Extract subheadings
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Extract all text from <p> tags and add two breaks between paragraphs
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)

        # Combine heading and subheadings with article text
        full_article_text = f"{heading}\n\n" if heading else ""
        for subheading in subheadings:
            full_article_text += f"{subheading}\n\n"
        full_article_text += article_text

        return full_article_text
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"
def process_excel(file):
    # Read the uploaded Excel file
    df = pd.read_excel(file)
    if 'URL' in df.columns:
        # Apply extract_article_info to each URL and store the result in a new column
        df['Article Text'] = df['URL'].apply(extract_article_info)

        # Save the updated DataFrame to a BytesIO object to prepare it for download
        output = BytesIO()
        df.to_excel(output, index=False)
        output.seek(0)
        return output
    else:
        return None
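
# A sketch of using process_excel without the Streamlit UI, assuming a local
# workbook named "urls.xlsx" with a 'URL' column (the filenames here are just
# illustrative placeholders):
#
#   with open("urls.xlsx", "rb") as f:
#       result = process_excel(f)
#   if result is not None:
#       with open("updated_file.xlsx", "wb") as out:
#           out.write(result.getvalue())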
def main():
    st.title("Excel URL Processor")
    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")

    # Upload Excel file
    uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])

    if uploaded_file:
        # Process the file
        processed_file = process_excel(uploaded_file)
        if processed_file:
            st.success("File processed successfully!")
            st.download_button(
                label="Download Modified Excel File",
                data=processed_file,
                file_name="updated_file.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.error("The uploaded file does not contain a column named 'URL'.")


if __name__ == "__main__":
    main()
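
# To run locally (assuming this file is saved as app.py; the filename is an
# assumption, not given by the source):
#
#   pip install streamlit pandas requests beautifulsoup4 openpyxl
#   streamlit run app.py
#
# openpyxl is listed because pandas relies on it to read and write .xlsx files.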