import os
import streamlit as st
import requests
from streamlit_lottie import st_lottie
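
# Streamlit front end for the Article Scraper portfolio project: posts an
# article URL to a scraper API deployed on AWS Lambda and renders the
# extracted fields. Set the 'scraper-api-endpoint' environment variable to the
# deployed API URL, then launch with `streamlit run <this file>`.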

def main() -> None:

    # ----- Loading Assets ----

    def load_lottieurl(lottie_url: str):
        # Fetch the Lottie animation JSON; return None on a non-200 response
        r = requests.get(url=lottie_url, timeout=10)
        return r.json() if r.status_code == 200 else None

    def fetch(url: str) -> dict:
        # POST the article URL to the scraper API; the endpoint URL is read
        # from the 'scraper-api-endpoint' environment variable
        try:
            result = requests.post(url=os.environ.get('scraper-api-endpoint'), json={'url': url}, timeout=60)
            return result.json()
        except Exception:
            return {}
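
    # The scraper API response is assumed (from the fields rendered below) to
    # look roughly like:
    #   {"scraped_content": {"article_title": ..., "author": ..., "publish_date": ...,
    #    "description": ..., "article_content": ..., "article_url": ..., "canonical_url": ...,
    #    "publisher_name": ..., "image": ..., "keywords": ..., "video_url": ..., "audio_url": ...}}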
    
    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")

    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")

    # ----- Introduction --------
    with st.container():
        st.subheader("Article Scraper")
        st.title("A Digital News / Article Information Extraction Application")
        st.write("A portfolio project developed to showcase my ability in developing Information Extraction Services")
        st.write("This service can be utilised in the data collection / curation process of data science workflow")
        st.write("[My Website >](https://www.rahulnenavath.co.in/)")
        st.subheader('About Article Scraper API:')
        st.write(
            """
                - The Article Scraper API is deployed on AWS Lambda using an AWS ECR container image
                - A CI/CD workflow implemented via GitHub Actions pushes the latest Docker build to AWS ECR
                - Check out the API codebase on [my GitHub >](https://github.com/RahulNenavath/Article-Scraper)

                API Tech Stack: Python, BeautifulSoup, AWS Lambda, AWS ECR, Docker, GitHub Actions (CI/CD)
            """
        )
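
        # The CI/CD pipeline described above, in sketch form (illustrative only;
        # the real workflow lives in the Article-Scraper repo): a GitHub Actions
        # job that builds and pushes the Docker image, then updates the Lambda:
        #   - uses: aws-actions/amazon-ecr-login@v1
        #   - run: docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:latest .
        #   - run: docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest
        #   - run: aws lambda update-function-code --function-name <fn> --image-uri $ECR_REGISTRY/$ECR_REPOSITORY:latest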

    with st.container():
        st.write("---")
        left_col, right_col = st.columns(2)

        with left_col:
            st.header("How does it work?")
            st.write("##")
            st.write('**Input**: Article URL')
            st.write('**Output**: Extracted Article Information')
            st.write(
                """
                **Working**:
                - Downloads the HTML content from the given article URL
                - Uses BeautifulSoup to extract content from different HTML tags and class names
                - Arranges the extracted information appropriately
                - Applies regex-based text cleaning to remove artifacts such as extra spaces, stray Unicode characters, tabs, and newlines
                """
            )
            st.warning("Note: Web scraping is highly dependent on the article's HTML structure, so the scraped content may need further cleaning")
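
            # A minimal sketch of the regex-based cleaning step described above
            # (illustrative only; the deployed API's actual implementation lives
            # in the Article-Scraper repo):
            #   import re
            #   def clean_text(text: str) -> str:
            #       # collapse tabs, newlines, and repeated whitespace to single spaces
            #       return re.sub(r'\s+', ' ', text).strip()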
        
        with right_col:
            if lottie_animation:  # skip rendering if the animation failed to load
                st_lottie(lottie_animation, height=500)
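
    # ----- Scraper Form --------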

    with st.form("my_form"):
        article_url = st.text_input("Article URL", value="", key="article_url")

        submitted = st.form_submit_button("Submit")

        if submitted:
            with st.spinner('Scraping Information ...'):
                data = fetch(url=article_url)
        
            if data:
                st.success("Request Successful")
                # Guard against a missing 'scraped_content' key in the response
                content = data.get("scraped_content") or {}
                st.write("---")
                st.subheader('Extracted Article Information')
                # Render each extracted field as a labelled line
                fields = {
                    "Article Title": "article_title",
                    "Author": "author",
                    "Published Date": "publish_date",
                    "Description": "description",
                    "Content": "article_content",
                    "Article URL": "article_url",
                    "Canonical URL": "canonical_url",
                    "Publisher Name": "publisher_name",
                    "Article Image": "image",
                    "Article Keywords": "keywords",
                    "Video URL": "video_url",
                    "Audio URL": "audio_url",
                }
                for label, key in fields.items():
                    st.write(f"**{label}:** {content.get(key)}")
            else:
                st.error("Request Failed")


if __name__ == "__main__":
    main()