File size: 2,892 Bytes
174c7c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO

def extract_article_info(url):
    """Fetch *url* and return its readable article text.

    The result is the <h1> heading, then every <h2> subheading, then all
    <p> paragraph text, each section separated by a blank line.  Failures
    are returned as short "Error ..." strings rather than raised, because
    callers write the result straight into a spreadsheet cell.

    Args:
        url: Address of the article page to scrape.

    Returns:
        str: The extracted article text, or an error message.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole
        # batch run — requests.get() has NO default timeout.
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Main heading: first <h1>, if any.  (Find once, reuse the tag.)
        h1_tag = soup.find('h1')
        heading = h1_tag.get_text(strip=True) if h1_tag else None

        # All <h2> subheadings, in document order.
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # All paragraph text, with a blank line between paragraphs.
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)

        # Combine heading and subheadings with the article text.
        full_article_text = f"{heading}\n\n" if heading else ""
        for subheading in subheadings:
            full_article_text += f"{subheading}\n\n"
        full_article_text += article_text

        return full_article_text

    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"

def process_excel(file):
    """Scrape every URL in an uploaded Excel sheet and return the result.

    Expects a column named 'URL'.  Each URL is passed through
    extract_article_info and the text is stored in a new 'Article Text'
    column.

    Args:
        file: An uploaded file object readable by pandas.read_excel.

    Returns:
        BytesIO: The updated workbook, rewound and ready for download,
        or None when the 'URL' column is missing.
    """
    frame = pd.read_excel(file)

    # Bail out early if the sheet lacks the required column.
    if 'URL' not in frame.columns:
        return None

    # Fetch article text for every row's URL.
    frame['Article Text'] = frame['URL'].apply(extract_article_info)

    # Serialize back to xlsx entirely in memory for the download button.
    buffer = BytesIO()
    frame.to_excel(buffer, index=False)
    buffer.seek(0)
    return buffer

def main():
    """Streamlit entry point: upload an Excel file, scrape its URLs, offer the result for download."""
    st.title("Excel URL Processor")
    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")

    # Nothing to do until the user provides a workbook.
    uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])
    if not uploaded_file:
        return

    processed = process_excel(uploaded_file)

    # process_excel returns None when the required 'URL' column is absent.
    if not processed:
        st.error("The uploaded file does not contain a column named 'URL'.")
        return

    st.success("File processed successfully!")
    st.download_button(
        label="Download Modified Excel File",
        data=processed,
        file_name="updated_file.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

# Run the app only when this file is executed directly (e.g. `streamlit run <file>`).
if __name__ == "__main__":
    main()