tracinginsights committed on
Commit
eb67193
·
1 Parent(s): 5ca1905

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -0
app.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import requests
3
+ import isort
4
+ import black
5
+ import flair
6
+ import time
7
+ from bs4 import BeautifulSoup
8
+
9
+
10
+
11
+ URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"
12
+
13
+ def get_xml(url):
14
+ # xpath is only for formula1
15
+ # use urllib.parse to check for formula1.com website or other news
16
+ xml = pd.read_xml(url,xpath='channel/item')
17
+
18
+ def check_updates(every=60):
19
+ while True:
20
+ time.sleep(every)
21
+ latest_xml = get_xml()
22
+ if ~previous_xml.equals(latest_xml):
23
+ print('New articles found')
24
+ new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
25
+ for article in new_articles_df.iterrows():
26
+ link = row[1]["guid"]
27
+ request = requests.get(link)
28
+ soup = BeautifulSoup(request.content, "html.parser")
29
+ # class_ below will be different for different websites
30
+ s = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")
31
+ lines = s.find_all("p")
32
+ text_content = pd.DataFrame(data={"text": []})
33
+ for i, line in enumerate(lines):
34
+ df = pd.DataFrame(data={"text": [line.text]})
35
+ text_content = pd.concat([text_content, df], ignore_index=True)
36
+
37
+ strongs = s.find_all("strong")
38
+ strong_content = pd.DataFrame(data={"text": []})
39
+ for i, strong in enumerate(strongs):
40
+ if i > 0:
41
+ df = pd.DataFrame(data={"text": [strong.text]})
42
+ strong_content = pd.concat([strong_content, df], ignore_index=True)
43
+ # df has content
44
+ df = text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
45
+ drop=True
46
+ )
47
+
48
+ return df
49
+
50
+
51
+ else:
52
+ print('No New article is found')
53
+
54
+
55
+