Spaces:
Runtime error
Runtime error
Commit
·
eb67193
1
Parent(s):
5ca1905
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import requests
|
3 |
+
import isort
|
4 |
+
import black
|
5 |
+
import flair
|
6 |
+
import time
|
7 |
+
from bs4 import BeautifulSoup
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"
|
12 |
+
|
13 |
+
def get_xml(url):
    """Read an RSS feed into a DataFrame, one row per ``<item>``.

    Parameters
    ----------
    url : str or file-like
        Location of the XML feed (e.g. the Formula 1 news feed URL).

    Returns
    -------
    pandas.DataFrame
        The feed's ``channel/item`` elements as rows.

    Fixed: the original assigned the DataFrame to a local variable and
    never returned it, so every caller received ``None``.
    """
    # xpath is only for formula1
    # use urllib.parse to check for formula1.com website or other news
    return pd.read_xml(url, xpath='channel/item')
|
17 |
+
|
18 |
+
def _scrape_article(link):
    """Download one article page and return its body text as a DataFrame.

    Returns a one-column DataFrame (``"text"``) with one row per ``<p>``
    tag, minus any paragraph whose text also appears in a ``<strong>``
    tag. The first ``<strong>`` is excluded from the filter set
    (preserving the original ``i > 0`` behavior).
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    # class_ below will be different for different websites
    body = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")

    paragraphs = [p.text for p in body.find_all("p")]
    text_content = pd.DataFrame(data={"text": paragraphs})

    # Skip the first <strong>; the rest are treated as non-body text
    # (captions/credits) to be filtered out of the paragraphs.
    strong_texts = [s.text for s in body.find_all("strong")[1:]]
    strong_content = pd.DataFrame(data={"text": strong_texts})

    # df has content: paragraphs that are not strong-only text
    return text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
        drop=True
    )


def check_updates(every=60):
    """Poll the feed every *every* seconds until a new article appears.

    Blocks in an infinite polling loop; when the feed changes, returns
    the scraped text of the first newly-seen article as a DataFrame
    (matching the original early-return behavior). Prints a status line
    on each poll.

    Bug fixes vs. the original:
      * ``previous_xml`` was never initialized (NameError) — seeded with
        an initial fetch before the loop.
      * ``get_xml()`` was called without its required ``url`` argument.
      * ``if ~previous_xml.equals(...)`` applied bitwise NOT to a bool:
        ``~False == -1`` is truthy, so the branch always ran. Use ``not``.
      * The row loop bound ``article`` but the body read an undefined
        name ``row``.
      * ``previous_xml`` was never refreshed between polls.
    """
    previous_xml = get_xml(URL)
    while True:
        time.sleep(every)
        latest_xml = get_xml(URL)
        if not previous_xml.equals(latest_xml):
            print('New articles found')
            new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
            for _, article in new_articles_df.iterrows():
                # The feed's <guid> holds the article URL.
                return _scrape_article(article["guid"])
            previous_xml = latest_xml
        else:
            print('No New article is found')
|
53 |
+
|
54 |
+
|
55 |
+
|