trhacknon ismot committed
Commit 0b3a3af · 0 Parent(s)

Duplicate from ismot/7testi7


Co-authored-by: Ismo Talka <[email protected]>

Files changed (4)
  1. .gitattributes +27 -0
  2. README.md +14 -0
  3. app.py +96 -0
  4. requirements.txt +3 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
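These rules route large binary artifacts (model weights, archives, TensorBoard event files) through Git LFS instead of storing them directly in the repository. Entries of this form are typically generated with `git lfs track` — for example, `git lfs track "*.pt"` appends the matching filter line to .gitattributes.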
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Article Scraper
+ emoji: 🦀
+ colorFrom: gray
+ colorTo: yellow
+ sdk: streamlit
+ sdk_version: 1.10.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ duplicated_from: ismot/7testi7
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
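This YAML front matter configures the Hugging Face Space: `sdk: streamlit` with `sdk_version: 1.10.0` pins the runtime, and `app_file: app.py` names the entry point the Space executes. The same entry point can be served locally with `streamlit run app.py`.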
app.py ADDED
@@ -0,0 +1,96 @@
+ import os
+ import streamlit as st
+ import requests
+ from streamlit_lottie import st_lottie
+
+ def main() -> None:
+
+     # ----- Loading Assets ----
+
+     def load_lottieurl(lottie_url: str):
+         r = requests.get(url=lottie_url)
+         return r.json() if r.status_code == 200 else None  # None when the animation cannot be fetched
+
+     def fetch(url):
+         try:
+             result = requests.post(url=os.environ.get('scraper-api-endpoint'), json={'url': url})  # endpoint is read from the environment
+             return result.json()
+         except Exception:
+             return {}  # treat any network or decoding failure as an empty response
+
+     st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")
+
+     lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")
+
+     # ----- Introduction --------
+     with st.container():
+         st.subheader("Article Scraper")
+         st.title("A Digital News / Article Information Extraction Application")
+         st.write("A portfolio project developed to showcase my ability to build information extraction services")
+         st.write("This service can be used in the data collection and curation stages of a data science workflow")
+         st.write("[My Website >](https://www.rahulnenavath.co.in/)")
+         st.subheader('About Article Scraper API:')
+         st.write(
+             """
+             - The Article Scraper API is deployed on AWS Lambda as a container image hosted on AWS ECR
+             - A CI/CD workflow implemented with GitHub Actions pushes the latest Docker build to AWS ECR
+             - Check out the API codebase on [my GitHub >](https://github.com/RahulNenavath/Article-Scraper)
+             - API tech stack: Python, BeautifulSoup, AWS Lambda, AWS ECR, Docker, GitHub Actions (CI/CD)
+             """
+         )
+
+     with st.container():
+         st.write("---")
+         left_col, right_col = st.columns(2)
+
+         with left_col:
+             st.header("How it works")
+             st.write("##")
+             st.write('**Input**: Article URL')
+             st.write('**Output**: Extracted Article Information')
+             st.write(
+                 """
+                 **Working**:
+                 - Downloads the HTML content from the given article URL
+                 - Uses BeautifulSoup to extract content from different HTML tags and class names
+                 - Arranges the information appropriately
+                 - Applies regex-based text cleaning to remove artifacts such as extra spaces, stray Unicode characters, tabs, and newlines
+                 """
+             )
+             st.warning('Note: Web scraping is highly dependent on the article HTML structure, so the scraped content may need further cleaning.')
+
+         with right_col:
+             st_lottie(lottie_animation, height=500)
+
+     with st.form("my_form"):
+         article_url = st.text_input("Article URL", value="", key="article_url")
+
+         submitted = st.form_submit_button("Submit")
+
+         if submitted:
+             with st.spinner('Scraping Information ...'):
+                 data = fetch(url=article_url)
+
+             if data:
+                 st.success("Request successful")
+                 content = data.get("scraped_content") or {}  # guard against a missing payload
+                 st.write("---")
+                 st.subheader('Extracted Article Information')
+                 st.write(f"**Article Title:** {content.get('article_title')}")
+                 st.write(f"**Author:** {content.get('author')}")
+                 st.write(f"**Published Date:** {content.get('publish_date')}")
+                 st.write(f"**Description:** {content.get('description')}")
+                 st.write(f"**Content:** {content.get('article_content')}")
+                 st.write(f"**Article URL:** {content.get('article_url')}")
+                 st.write(f"**Canonical URL:** {content.get('canonical_url')}")
+                 st.write(f"**Publisher Name:** {content.get('publisher_name')}")
+                 st.write(f"**Article Image:** {content.get('image')}")
+                 st.write(f"**Article Keywords:** {content.get('keywords')}")
+                 st.write(f"**Video URL:** {content.get('video_url')}")
+                 st.write(f"**Audio URL:** {content.get('audio_url')}")
+             else:
+                 st.error("Request failed")
+
+
+ if __name__ == "__main__":
+     main()
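The UI above assumes a particular response contract from the scraper API: a JSON object with a `scraped_content` key whose value holds the article fields. The endpoint itself is private (read from the `scraper-api-endpoint` environment variable), so the sketch below is a hypothetical example of the shape `fetch` is expected to return, inferred from the keys the rendering code reads; every field value is a placeholder, not real API output.

    # Hypothetical response shape, inferred from the keys read in app.py;
    # the actual API may return more or different fields.
    example_response = {
        "scraped_content": {
            "article_title": "Example headline",
            "author": "Jane Doe",
            "publish_date": "2022-01-01",
            "description": "One-line summary of the article",
            "article_content": "Full cleaned article text ...",
            "article_url": "https://example.com/article",
            "canonical_url": "https://example.com/article",
            "publisher_name": "Example News",
            "image": "https://example.com/cover.jpg",
            "keywords": ["example", "scraping"],
            "video_url": None,
            "audio_url": None,
        }
    }

    # The rendering branch only needs dict access, so missing fields
    # degrade to None rather than raising:
    content = example_response.get("scraped_content") or {}
    assert content.get("article_title") == "Example headline"
    assert content.get("nonexistent_field") is None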
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ streamlit-lottie==0.0.3
+ requests
+ streamlit
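With these three dependencies installed (for example via `pip install -r requirements.txt`), the app runs locally. Only `streamlit-lottie` is pinned here; `requests` and `streamlit` resolve to whatever versions the build pulls in, with the Space itself pinning Streamlit through `sdk_version: 1.10.0` in the README front matter.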