abdulllah01 committed
Commit 174c7c2 · verified · 1 Parent(s): 8c54afac

Create app.py

Files changed (1): app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
+ import streamlit as st
+ import pandas as pd
+ import requests
+ from bs4 import BeautifulSoup
+ from io import BytesIO
+
+ def extract_article_info(url):
+     try:
+         response = requests.get(url)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Extract meta title
+         meta_title = soup.find('title').get_text(strip=True) if soup.find('title') else None
+
+         # Extract meta description
+         meta_description = None
+         meta_tag = soup.find('meta', attrs={'name': 'description'})
+         if meta_tag and meta_tag.get('content'):
+             meta_description = meta_tag['content']
+
+         # Extract heading
+         heading = soup.find('h1').get_text(strip=True) if soup.find('h1') else None
+
+         # Extract subheadings
+         subheadings = [h2.get_text(strip=True) for h2 in soup.find_all('h2')]
+
+         # Extract all text from <p> tags and add two breaks between paragraphs
+         all_paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
+         article_text = "\n\n".join(all_paragraphs)
+
+         # Combine heading and subheadings with article text
+         full_article_text = f"{heading}\n\n" if heading else ""
+         for subheading in subheadings:
+             full_article_text += f"{subheading}\n\n"
+         full_article_text += article_text
+
+         return full_article_text
+
+     except requests.exceptions.RequestException as e:
+         return f"Error fetching the URL: {e}"
+     except Exception as e:
+         return f"Error processing the content: {e}"
+
+ def process_excel(file):
+     # Read the uploaded Excel file
+     df = pd.read_excel(file)
+
+     if 'URL' in df.columns:
+         # Apply extract_article_info to each URL and store the result in a new column
+         df['Article Text'] = df['URL'].apply(extract_article_info)
+
+         # Save the updated DataFrame to a BytesIO object to prepare it for download
+         output = BytesIO()
+         df.to_excel(output, index=False)
+         output.seek(0)
+         return output
+     else:
+         return None
+
+ def main():
+     st.title("Excel URL Processor")
+     st.markdown("Upload an Excel file with a column named 'URL' to extract article information.")
+
+     # Upload Excel file
+     uploaded_file = st.file_uploader("Choose an Excel file", type=["xlsx"])
+
+     if uploaded_file:
+         # Process the file
+         processed_file = process_excel(uploaded_file)
+
+         if processed_file:
+             st.success("File processed successfully!")
+             st.download_button(
+                 label="Download Modified Excel File",
+                 data=processed_file,
+                 file_name="updated_file.xlsx",
+                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+             )
+         else:
+             st.error("The uploaded file does not contain a column named 'URL'.")
+
+ if __name__ == "__main__":
+     main()
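
The app itself would be launched locally with streamlit run app.py, with streamlit, pandas, requests, beautifulsoup4, and openpyxl installed (pandas needs openpyxl to read and write .xlsx files). A minimal sketch of exercising the scraping helper outside the Streamlit UI, assuming app.py is on the import path and that https://example.com is only a placeholder URL, not one used by the repo:

from app import extract_article_info

# Placeholder target; swap in a real article URL before running
text = extract_article_info("https://example.com")
# Prints either the combined heading/subheading/paragraph text or an error message string
print(text[:500])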