import os
import streamlit as st
import requests
from streamlit_lottie import st_lottie
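
# Streamlit front-end for the Article Scraper API. An app like this is typically
# launched with `streamlit run <this_file>.py` (file name not given in the source).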
def main() -> None:
    # ----- Loading Assets ----
    def load_lottieurl(lottie_url: str):
        # Download a Lottie animation JSON; return None if the request fails
        r = requests.get(url=lottie_url)
        return r.json() if r.status_code == 200 else None

    def fetch(url: str) -> dict:
        # Forward the article URL to the scraper API (endpoint is read from the
        # 'scraper-api-endpoint' environment variable); return {} on any failure
        try:
            result = requests.post(url=os.environ.get('scraper-api-endpoint'), json={'url': url})
            return result.json()
        except Exception:
            return {}
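
    # The fields rendered in the results section below suggest the scraper API
    # returns JSON shaped roughly like this (a sketch, not a confirmed contract):
    # {
    #     "scraped_content": {
    #         "article_title": "...", "author": "...", "publish_date": "...",
    #         "description": "...", "article_content": "...", "article_url": "...",
    #         "canonical_url": "...", "publisher_name": "...", "image": "...",
    #         "keywords": "...", "video_url": "...", "audio_url": "..."
    #     }
    # }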
    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")
    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")
    # ----- Introduction --------
    with st.container():
        st.subheader("Article Scraper")
        st.title("A Digital News / Article Information Extraction Application")
        st.write("A portfolio project developed to showcase my ability to develop information extraction services")
        st.write("This service can be utilised in the data collection / curation stage of a data science workflow")
        st.write("[My Website >](https://www.rahulnenavath.co.in/)")
        st.subheader('About Article Scraper API:')
        st.write(
            """
            - The Article Scraper API is deployed on AWS Lambda using an AWS ECR container deployment
            - A CI/CD workflow implemented via GitHub Actions pushes the latest Docker build to AWS ECR
            - Check out the API codebase on [my GitHub >](https://github.com/RahulNenavath/Article-Scraper)

            API Tech Stack: Python, BeautifulSoup, AWS Lambda, AWS ECR, Docker, GitHub Actions (CI/CD)
            """
        )
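
    # ----- How It Works --------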
    with st.container():
        st.write("---")
        left_col, right_col = st.columns(2)
        with left_col:
            st.header("How does it work?")
            st.write("##")
            st.write('**Input**: Article URL')
            st.write('**Output**: Extracted Article Information')
            st.write(
                """
                **Working**:
                - Downloads the HTML content from the given article URL
                - Uses BeautifulSoup to extract content from different HTML tags and class names
                - Arranges the extracted information appropriately
                - Applies regex-based text cleaning to remove artefacts such as extra spaces, unicode characters, tabs, and newline characters
                """
            )
            st.warning('Note: Web scraping is highly dependent on the article HTML structure, so the scraped content may need further cleaning')
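            # Illustrative sketch of the regex-based clean-up described above; the
            # actual cleaning happens inside the API, not in this front-end:
            # import re
            # def clean_text(text: str) -> str:
            #     text = text.encode("ascii", "ignore").decode()  # drop stray unicode characters
            #     return re.sub(r"\s+", " ", text).strip()        # collapse spaces, tabs, newlines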
        with right_col:
            st_lottie(lottie_animation, height=500)
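
    # ----- Scrape Request Form --------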
    with st.form("my_form"):
        article_url = st.text_input("Article URL", value="", key="article_url")
        submitted = st.form_submit_button("Submit")
        if submitted:
            with st.spinner('Scraping Information ...'):
                data = fetch(url=article_url)
            if data:
                st.success("Request Successful")
                # Default to an empty dict so missing keys render as None instead of raising
                content = data.get("scraped_content", {})
                st.write("---")
                st.subheader('Extracted Article Information')
                st.write(f"**Article Title:** {content.get('article_title')}")
                st.write(f"**Author:** {content.get('author')}")
                st.write(f"**Published Date:** {content.get('publish_date')}")
                st.write(f"**Description:** {content.get('description')}")
                st.write(f"**Content:** {content.get('article_content')}")
                st.write(f"**Article URL:** {content.get('article_url')}")
                st.write(f"**Canonical URL:** {content.get('canonical_url')}")
                st.write(f"**Publisher Name:** {content.get('publisher_name')}")
                st.write(f"**Article Image:** {content.get('image')}")
                st.write(f"**Article Keywords:** {content.get('keywords')}")
                st.write(f"**Video URL:** {content.get('video_url')}")
                st.write(f"**Audio URL:** {content.get('audio_url')}")
            else:
                st.error("Request Failed")
if __name__ == "__main__":
    main()