Create app.py
app.py
ADDED
@@ -0,0 +1,84 @@
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO

def extract_article_info(url):
    try:
        # A timeout keeps one unresponsive site from hanging the whole run
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract meta title (currently unused in the returned text)
        meta_title = soup.find('title').get_text(strip=True) if soup.find('title') else None

        # Extract meta description (currently unused in the returned text)
        meta_description = None
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag and meta_tag.get('content'):
            meta_description = meta_tag['content']

        # Extract the main heading
        heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None

        # Extract subheadings
        subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]

        # Extract all text from <p> tags and add two breaks between paragraphs
        all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
        article_text = "\n\n".join(all_paragraphs)

        # Combine heading and subheadings with the article text
        full_article_text = f"{heading}\n\n" if heading else ""
        for subheading in subheadings:
            full_article_text += f"{subheading}\n\n"
        full_article_text += article_text

        return full_article_text

    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        return f"Error processing the content: {e}"

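# Illustrative call (hypothetical URL, not part of the app's flow):
# extract_article_info("https://example.com/post") returns the page's <h1>,
# then each <h2>, then all paragraph text, separated by blank lines,
# or an error string if the request or parsing fails.
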
def process_excel(file):
    # Read the uploaded Excel file
    df = pd.read_excel(file)

    if 'URL' in df.columns:
        # Apply extract_article_info to each URL and store the result in a new column
        df['Article Text'] = df['URL'].apply(extract_article_info)

        # Save the updated DataFrame to a BytesIO object to prepare it for download
        output = BytesIO()
        df.to_excel(output, index=False)
        output.seek(0)
        return output
    else:
        return None

def main():
    st.title("Excel URL Processor")
    st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")

    # Upload Excel file
    uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])

    if uploaded_file:
        # Process the file
        processed_file = process_excel(uploaded_file)

        if processed_file:
            st.success("File processed successfully!")
            st.download_button(
                label="Download Modified Excel File",
                data=processed_file,
                file_name="updated_file.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.error("The uploaded file does not contain a column named 'URL'.")

if __name__ == "__main__":
    main()
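To try the app locally (assuming the usual toolchain; none of these commands appear in the commit itself): install the dependencies with pip install streamlit pandas requests beautifulsoup4 openpyxl (openpyxl is the engine pandas uses for .xlsx files), then start it with streamlit run app.py. The input workbook needs a column literally named URL; the app writes the scraped text into a new Article Text column alongside it.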