tracinginsights committed
Commit a76c1ab · 1 Parent(s): eb67193

Update app.py

Files changed (1):
  1. app.py +117 -25
app.py CHANGED
@@ -5,9 +5,16 @@ import black
 import flair
 import time
 from bs4 import BeautifulSoup
+import re
+import numpy as np

+from flair.data import Sentence
+from flair.models import SequenceTagger
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline


+import string
+
 URL = "https://www.formula1.com/content/fom-website/en/latest/all.xml"

 def get_xml(url):
@@ -15,37 +22,122 @@ def get_xml(url):
     # use urllib.parse to check for formula1.com website or other news
     xml = pd.read_xml(url,xpath='channel/item')

-def check_updates(every=60):
+
+
+# care taken to only consider results where there are more words not a single word quotes
+def extract_quote(string):
+    # Use the re.findall function to extract the quoted text
+    results = re.findall(r'[“\"](.*?)[”\"]', string)
+    quotes = []
+    for result in results:
+        split_result = result.split()
+        if len(split_result) >3:
+            quotes.append(result)
+
+    return quotes
+
+
+
+def get_names(text):
+    # # load the NER tagger
+    tagger = SequenceTagger.load('ner')
+
+    sentence = Sentence(text)
+    tagger.predict(sentence)
+
+    names = []
+    for label in sentence.get_labels('ner'):
+        if label.value == "PER":
+            names.append(f"{label.data_point.text}")
+
+    # convert to a set to remove some of the repetitions
+    names = list(set(names))
+
+    return names
+
+def get_text(new_articles_df):
+    """
+    quotes outputs a list of quotes
+    """
+
+    dfs_dict = {}
+
+    for article in tqdm(new_articles_df.iterrows()):
+
+        link = article[1]["guid"]
+        request = requests.get(link)
+        soup = BeautifulSoup(request.content, "html.parser")
+        # class_ below will be different for different websites
+        s = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")
+        lines = s.find_all("p")
+        text_content = pd.DataFrame(data={"text": []})
+        for i, line in enumerate(lines):
+            df = pd.DataFrame(data={"text": [line.text]})
+            text_content = pd.concat([text_content, df], ignore_index=True)
+
+        strongs = s.find_all("strong")
+        strong_content = pd.DataFrame(data={"text": []})
+        for i, strong in enumerate(strongs):
+            if i > 0:
+                df = pd.DataFrame(data={"text": [strong.text]})
+                strong_content = pd.concat([strong_content, df], ignore_index=True)
+        # df has content
+        df = text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
+            drop=True
+        )
+        # df["quote"] = df["text"].apply(lambda row: extract_quote(row))
+        # # combine all rows into context
+
+        context = ""
+
+        for i,row in df.iterrows():
+            context += f" {row['text']}"
+
+
+        quotes = extract_quote(context)
+        # to save some time not computing unnecessary NER
+        if len(quotes) != 0:
+            speakers = get_names(context)
+        else:
+            speakers = ()
+
+        dfs_dict[link] = {'context':context, 'quotes':quotes, 'speakers':speakers}
+
+    return dfs_dict
+
+def load_speaker_model():
+
+    model_name = f"microsoft/deberta-v2-large"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+
+    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
+
+    return question_answerer
+
+
+
+def remove_punctuations(text):
+
+    modified_text = "".join([character for character in text if character not in string.punctuation])
+    modified_text = modified_text.lstrip(" ")
+    modified_text = modified_text.rstrip(" ")
+
+    return modified_text
+
+
+def check_updates(every=300):
     while True:
         time.sleep(every)
         latest_xml = get_xml()
         if ~previous_xml.equals(latest_xml):
             print('New articles found')
             new_articles_df = latest_xml[~latest_xml["guid"].isin(previous_xml["guid"])]
-            for article in new_articles_df.iterrows():
-                link = row[1]["guid"]
-                request = requests.get(link)
-                soup = BeautifulSoup(request.content, "html.parser")
-                # class_ below will be different for different websites
-                s = soup.find("div", class_="col-lg-8 col-xl-7 offset-xl-1 f1-article--content")
-                lines = s.find_all("p")
-                text_content = pd.DataFrame(data={"text": []})
-                for i, line in enumerate(lines):
-                    df = pd.DataFrame(data={"text": [line.text]})
-                    text_content = pd.concat([text_content, df], ignore_index=True)
-
-                strongs = s.find_all("strong")
-                strong_content = pd.DataFrame(data={"text": []})
-                for i, strong in enumerate(strongs):
-                    if i > 0:
-                        df = pd.DataFrame(data={"text": [strong.text]})
-                        strong_content = pd.concat([strong_content, df], ignore_index=True)
-                # df has content
-                df = text_content[~text_content["text"].isin(strong_content["text"])].reset_index(
-                    drop=True
-                )
-
-                return df
+
+            # loops through new articles and gets the necessary text, quotes and speakers
+            dfs_dict = get_text(new_articles_df)


         else:
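
The diff ends before showing what check_updates does with dfs_dict or where load_speaker_model() is called, so the wiring below is only a guess. A minimal sketch, assuming the question-answering pipeline is asked who said each extracted quote, its answer is cleaned with remove_punctuations, and the result is matched against the NER speaker list; attribute_quotes is a hypothetical helper, not part of this commit:

# Hypothetical wiring, not part of the commit: attribute each quote to a speaker.
def attribute_quotes(dfs_dict, question_answerer):
    attributions = {}
    for link, data in dfs_dict.items():
        matched = []
        for quote in data["quotes"]:
            # Ask the QA model who said the quote, using the full article text as context.
            result = question_answerer(
                question=f'Who said "{quote}"?',
                context=data["context"],
            )
            answer = remove_punctuations(result["answer"])
            # Keep the answer only if it overlaps a name found by the flair NER tagger.
            speaker = next(
                (name for name in data["speakers"] if name in answer or answer in name),
                None,
            )
            matched.append({"quote": quote, "speaker": speaker, "score": result["score"]})
        attributions[link] = matched
    return attributions

# Example usage (hypothetical):
# question_answerer = load_speaker_model()
# dfs_dict = get_text(new_articles_df)
# attributions = attribute_quotes(dfs_dict, question_answerer)

Checking the QA answer against the names returned by get_names guards against the model returning a span that is not actually a person.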