"""Streamlit app that adds scraped article text to an uploaded Excel sheet.

The user uploads an ``.xlsx`` file containing a ``URL`` column. Each URL is
fetched, its ``<h1>``, ``<h2>`` and ``<p>`` text is extracted, and the result
is written to a new ``Article Text`` column of a downloadable workbook.
"""

from io import BytesIO
from typing import Optional

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup

# Seconds to wait on a server before giving up. Without an explicit timeout a
# single unresponsive host would block the whole spreadsheet run.
REQUEST_TIMEOUT_SECONDS = 30


def _compose_article_text(soup) -> str:
    """Build the article text from a parsed HTML document.

    The result is the first ``<h1>`` (when it has text), followed by every
    ``<h2>``, followed by every ``<p>``, all separated by blank lines. Note
    that headings are grouped ahead of the paragraphs rather than interleaved
    in document order.
    """
    first_h1 = soup.find('h1')
    heading = first_h1.get_text(strip=True) if first_h1 is not None else None
    subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

    # Extract all text from <p> tags, with a blank line between paragraphs.
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
    article_text = "\n\n".join(paragraphs)

    # An empty or missing heading is omitted; subheadings are always kept.
    # The paragraph text is joined as ONE trailing piece so that a page with
    # headings but no paragraphs still ends with the "\n\n" separator.
    pieces = [heading] if heading else []
    pieces.extend(subheadings)
    pieces.append(article_text)
    return "\n\n".join(pieces)


def extract_article_info(url: str) -> str:
    """Fetch ``url`` and return its heading, subheadings and paragraph text.

    Args:
        url: Address of the page to download. Values that are not non-empty
            strings (for example blank spreadsheet cells) are reported as an
            error instead of being requested.

    Returns:
        The extracted text, or a message starting with
        ``"Error fetching the URL:"`` / ``"Error processing the content:"``
        when the page could not be downloaded or parsed. The function never
        raises, so it is safe to use with ``Series.apply``.
    """
    if not isinstance(url, str) or not url.strip():
        return f"Error fetching the URL: invalid or empty URL value {url!r}"

    try:
        response = requests.get(url.strip(), timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return _compose_article_text(soup)
    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        # Deliberately broad: one malformed page must not abort the whole
        # batch, so the failure is recorded in that row's cell instead.
        return f"Error processing the content: {e}"


def process_excel(file) -> Optional[BytesIO]:
    """Add an ``Article Text`` column to the uploaded workbook.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.

    Returns:
        An in-memory ``.xlsx`` file positioned at its start, or ``None`` when
        the sheet has no column named ``URL``.
    """
    df = pd.read_excel(file)
    if 'URL' not in df.columns:
        return None

    # One HTTP request per row; failures become error strings in the cell.
    df['Article Text'] = df['URL'].apply(extract_article_info)

    # Serialize to memory so the result can be handed to a download button.
    output = BytesIO()
    df.to_excel(output, index=False)
    output.seek(0)
    return output


def main() -> None:
    """Render the upload form and offer the processed workbook for download."""
    st.title("Excel URL Processor")
    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")

    uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])
    if uploaded_file is None:
        return

    # NOTE(review): Streamlit re-executes the script on widget interaction, so
    # the URLs are presumably fetched again when the download button is
    # clicked - consider caching the processed result; confirm before changing.
    processed_file = process_excel(uploaded_file)
    if processed_file is None:
        st.error("The uploaded file does not contain a column named 'URL'.")
        return

    st.success("File processed successfully!")
    st.download_button(
        label="Download Modified Excel File",
        data=processed_file,
        file_name="updated_file.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )


if __name__ == "__main__":
    main()