Spaces:
Runtime error
Runtime error
Commit
·
eb67193
1
Parent(s):
5ca1905
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import requests
|
3 |
+
import isort
|
4 |
+
import black
|
5 |
+
import flair
|
6 |
+
import time
|
7 |
+
from bs4 import BeautifulSoup
|
8 |
+
|
9 |
+
|
10 |
+
|
11 |
+
URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"
|
12 |
+
|
13 |
+
def get_xml(url):
    """Read an RSS feed into a DataFrame, one row per ``<item>``.

    Parameters
    ----------
    url : str or file-like
        Location of the XML feed (e.g. the Formula 1 news feed URL).

    Returns
    -------
    pandas.DataFrame
        The feed's ``channel/item`` elements as rows.

    Fixed: the original assigned the DataFrame to a local variable and
    never returned it, so every caller received ``None``.
    """
    # xpath is only for formula1
    # use urllib.parse to check for formula1.com website or other news
    return pd.read_xml(url, xpath='channel/item')
|
17 |
+
|
18 |
+
def _scrape_article(link):
    """Download one article page and return its body text as a DataFrame.

    Returns a one-column DataFrame (``"text"``) with one row per ``<p>``
    tag, minus any paragraph whose text also appears in a ``<strong>``
    tag. The first ``<strong>`` is excluded from the filter set
    (preserving the original ``i > 0`` behavior).
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    # class_ below will be different for different websites
    body = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")

    paragraphs = [p.text for p in body.find_all("p")]
    text_content = pd.DataFrame(data={"text": paragraphs})

    # Skip the first <strong>; the rest are treated as non-body text
    # (captions/credits) to be filtered out of the paragraphs.
    strong_texts = [s.text for s in body.find_all("strong")[1:]]
    strong_content = pd.DataFrame(data={"text": strong_texts})

    # df has content: paragraphs that are not strong-only text
    return text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
        drop=True
    )


def check_updates(every=60):
    """Poll the feed every *every* seconds until a new article appears.

    Blocks in an infinite polling loop; when the feed changes, returns
    the scraped text of the first newly-seen article as a DataFrame
    (matching the original early-return behavior). Prints a status line
    on each poll.

    Bug fixes vs. the original:
      * ``previous_xml`` was never initialized (NameError) — seeded with
        an initial fetch before the loop.
      * ``get_xml()`` was called without its required ``url`` argument.
      * ``if ~previous_xml.equals(...)`` applied bitwise NOT to a bool:
        ``~False == -1`` is truthy, so the branch always ran. Use ``not``.
      * The row loop bound ``article`` but the body read an undefined
        name ``row``.
      * ``previous_xml`` was never refreshed between polls.
    """
    previous_xml = get_xml(URL)
    while True:
        time.sleep(every)
        latest_xml = get_xml(URL)
        if not previous_xml.equals(latest_xml):
            print('New articles found')
            new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
            for _, article in new_articles_df.iterrows():
                # The feed's <guid> holds the article URL.
                return _scrape_article(article["guid"])
            previous_xml = latest_xml
        else:
            print('No New article is found')
|
53 |
+
|
54 |
+
|
55 |
+
|