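"""
Web Article Extractor: a Streamlit app that reads an Excel file with a
'URL' column, scrapes each page's title, meta description, headings, and
paragraph text in parallel, and returns the workbook with a new
'Article Text' column for download.
"""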
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import streamlit as st
from io import BytesIO

def extract_article_info(url):
    """
    Extracts meta title, meta description, heading, subheadings, and all text in <p> tags from a blog post URL.
    Args:
        url (str): The URL of the blog post.
    Returns:
        str: A string containing the extracted information.
    """
    try:
        # Fetch the HTML content of the URL
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract meta title
        title_tag = soup.find('title')
        meta_title = title_tag.get_text(strip=True) if title_tag else None

        # Extract meta description
        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']

        # Extract heading (Assuming <h1> is used for the main heading)
        h1_tag = soup.find('h1')
        heading = h1_tag.get_text(strip=True) if h1_tag else None

        # Extract subheadings (Assuming <h2> tags are used for subheadings)
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Extract all text from <p> tags and add two breaks between paragraphs
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)  # Add two breaks between paragraphs

        # Combine meta title, meta description, heading, subheadings, and
        # body text, skipping any piece that was not found on the page
        parts = [meta_title, meta_description, heading, *subheadings, article_text]
        return "\n\n".join(part for part in parts if part)

    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"

def process_file(uploaded_file):
    # Load the Excel file
    df = pd.read_excel(uploaded_file)

    # Check if 'URL' column exists
    if 'URL' not in df.columns:
        return None, "The 'URL' column is missing from the Excel file."

    # Pre-size the results list so each article text lands on its own row;
    # as_completed yields futures in completion order, not submission order
    results = [None] * len(df)

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor() as executor:
        # Key each future by row index so results map back to the right
        # row, even when the same URL appears more than once
        future_to_index = {
            executor.submit(extract_article_info, url): i
            for i, url in enumerate(df['URL'])
        }

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                # Store the result at the row it belongs to
                results[index] = future.result()
            except Exception as e:
                # Handle exceptions raised during execution
                results[index] = f"Error processing the URL {df['URL'].iloc[index]}: {e}"

    # Add the results to a new column in the DataFrame
    df['Article Text'] = results

    # Save the updated DataFrame to a BytesIO object
    output = BytesIO()
    df.to_excel(output, index=False, engine='openpyxl')
    output.seek(0)

    return output, None
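
# Note: process_file also works outside Streamlit, since pd.read_excel
# accepts a filesystem path as well as a file-like object. A minimal
# sketch, assuming a local workbook named urls.xlsx:
#
#   output, error = process_file("urls.xlsx")
#   if error is None:
#       with open("processed_file.xlsx", "wb") as f:
#           f.write(output.getvalue())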

# Streamlit App
st.title("Web Article Extractor")
st.markdown("Upload an Excel file with a column named 'URL' containing the links to process.")

# File upload
uploaded_file = st.file_uploader("Upload Excel file", type=["xlsx"])

if uploaded_file is not None:
    with st.spinner("Processing your file..."):
        output, error = process_file(uploaded_file)

    if error:
        st.error(error)
    else:
        st.success("File processed successfully!")
        st.download_button(
            label="Download Processed File",
            data=output,
            file_name="processed_file.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
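
# To run the app locally (assuming this script is saved as app.py):
#
#   streamlit run app.py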