## Import of required Modules
Make sure that the imported modules are installed.

In [None]:
import pickle
import re
from tqdm import tqdm
import time
import numpy as np
from IPython.display import clear_output 
import pandas as pd
import time

In [None]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

### Settings
Change the settings to ensure the scraper runs on your system. We recommend increasing wait_time if you run the scraper on a slower system or internet connection.

In [None]:
# Pause in seconds passed to time.sleep() at the start and at the end of each
# article iteration in the scraping loop.
wait_time = 0.25

#### set up Selenium and Chrome Driver 
We use Selenium with Chrome and tested the scraper with chromedriver. You need the latest version of the driver from https://chromedriver.chromium.org/. Alternatively, switch to a driver of your preference.
We set up the scraper to run in the background; if you wish to run it in regular window mode, remove the line "chrome_options.add_argument("--headless")".

In [None]:
# Chrome options: run the browser headless (in the background).
chrome_options = Options()
chrome_options.add_argument("--headless")  # remove this line to run in a visible window

# chromedriver service; the path is the one reported by 'which chromedriver'
serv = Service(r'driver/chromedriver')

### Retrieve Article Links for Crawl

In [None]:
# assemble list of links to all articles
links = []

# XPath of the <tbody> that holds the roundup rows on every listing page
TABLE_BODY_XPATH = '//*[@id="block-views-de37fa32ea86f5545eb9b7722977a70d"]/div/div[2]/table/tbody'


def _collect_links_from_table(driver, timeout=20):
    """Return the article links found in the roundup table of the current page.

    Waits up to ``timeout`` seconds for the table body to be present (raises
    selenium's TimeoutException otherwise), then reads the first <a> inside
    the first <td> of every <tr>.
    """
    table_body = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, TABLE_BODY_XPATH))
    )
    page_links = []
    for row in table_body.find_elements(By.TAG_NAME, "tr"):
        cells = row.find_elements(By.TAG_NAME, "td")
        anchor = cells[0].find_element(By.TAG_NAME, "a")
        page_links.append(anchor.get_attribute("href"))
    return page_links


chrome_path = r'driver/chromedriver' #path from 'which chromedriver'
chrome_options = Options()
chrome_options.add_argument("--headless") # open chrome in background
driver = webdriver.Chrome(service=serv, options=chrome_options)
try:
    driver.get('https://www.allsides.com/headline-roundups')

    # interact with cookie terms: wait until the button is clickable, then click it
    wait = WebDriverWait(driver, 10)
    ele = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'css-47sehv')))
    ele.click()

    # retrieve number of pages from the href of the "last page" pager button
    last_page_button = driver.find_element(By.CLASS_NAME, "pager-last")
    link_last_page = last_page_button.find_elements(By.TAG_NAME, "a")
    last_page_href = link_last_page[0].get_attribute("href") or ""
    # Parse the page query parameter instead of slicing the last three
    # characters, which only works for page counts with exactly three digits.
    page_match = re.search(r"[?&]page=(\d+)", last_page_href)
    if page_match is None:
        raise ValueError("could not read last page index from: " + repr(last_page_href))
    last_page_index = int(page_match.group(1))

    # retrieve links from start page
    links.extend(_collect_links_from_table(driver))

    # retrieve links for other pages
    # NOTE(review): the loop starts at page=2. If the site's pager is
    # zero-based (start page == page=0), page=1 is never fetched - confirm.
    for page in tqdm(range(2, last_page_index + 1)): # set to max number of pages
        driver.get("https://www.allsides.com/headline-roundups?page=" + str(page))
        links.extend(_collect_links_from_table(driver))
finally:
    # quit() ends the whole browser session, also when an error occurred above
    driver.quit()

In [None]:
# Persist the collected article links as a pickle file.
with open("linklist_allsides_news.pickle", mode="wb") as pickle_file:
    pickle.dump(links, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Write the collected article links to a text file, one link per line.
with open("linklist_allsides_news.csv", "w") as csv_file:
    csv_file.writelines(f"{url}\n" for url in links)

In [None]:
# helper functions that check whether an element exists on the current page
def check_exists_by_xpath(xpath):
    """Return True if at least one element matches ``xpath``, else False.

    Uses the module-level ``driver``. Any error raised by the lookup is
    treated as "not found" (best-effort probe), but KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    try:
        # find_elements returns a list, which is empty when nothing matches
        return bool(driver.find_elements(By.XPATH, xpath))
    except Exception:
        return False

def check_exists_by_class(inp):
    """Return True if at least one element has the class name ``inp``, else False.

    Uses the module-level ``driver``. Any error raised by the lookup is
    treated as "not found" (best-effort probe), but KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    try:
        # find_elements returns a list, which is empty when nothing matches
        return bool(driver.find_elements(By.CLASS_NAME, inp))
    except Exception:
        return False

## Retrieve Articles
This section of the crawler retrieves all available news articles from AllSides along with the available information and bias tags.

In [None]:
# Read the list of article links back from the pickle file.
with open("linklist_allsides_news.pickle", mode="rb") as pickle_file:
    links = pickle.load(pickle_file)

In [None]:
# list for results
data = []

# XPATH information for the article divs of a roundup page
divs = ["/html/body/div[4]/div/div/div/div[4]/div/div/div/div[1]", "/html/body/div[4]/div/div/div/div[4]/div/div/div/div[2]", "/html/body/div[4]/div/div/div/div[4]/div/div/div/div[3]"]


def _extract_article(div):
    """Read one article div and return its fields as a dict.

    Returned keys: "url", "heading", "source", "text", "bias_rating".
    Every field that cannot be found becomes an empty string, so a failed
    lookup can never reuse a value left over from a previous article.
    """
    # The <h3> title of the div contains the bias label (left/center/right).
    try:
        cat = div.find_element(By.TAG_NAME, "h3").text
    except Exception:
        cat = ""
        print("no headline found")

    # retrieve link to original article
    try:
        link = div.find_element(By.TAG_NAME, "a").get_attribute("href")
        print(link)
    except Exception:
        link = ""
        print("no link found")

    # left/center/right are shuffled for each article and some roundups have
    # e.g. only left and right articles, so each div is classified separately.
    time.sleep(0.2)
    if "Left" in cat:
        bias = "left"
    elif "Right" in cat:
        bias = "right"
    else:
        # NOTE(review): catch-all; a div without a readable <h3> also ends up
        # here - confirm that "center" is the wanted label in that case.
        bias = "center"
    print(bias)

    try:
        article_heading = div.find_element(By.CLASS_NAME, "news-title").text
    except Exception:
        article_heading = ""
        print("no headline found")
    try:
        source = div.find_element(By.CLASS_NAME, "source-area").find_element(By.TAG_NAME, "span").text
    except Exception:
        source = ""
        print("no source found")
    try:
        text = div.find_element(By.CLASS_NAME, "news-body").find_element(By.CLASS_NAME, "body-contents").text
    except Exception:
        text = ""
        print("no text found")

    return {"url": link, "heading": article_heading, "source": source, "text": text, "bias_rating": bias}


# retrieve information from articles in list of links
for li in tqdm(links):
    time.sleep(wait_time)
    # `driver` stays a module-level name because the check_exists_* helpers read it
    driver = webdriver.Chrome(service=serv, options=chrome_options)
    try:
        print(li)
        # open URL
        driver.get(li)

        # interact with pop-up window
        if check_exists_by_class("css-47sehv"):
            ele = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((driver.find_element(By.CLASS_NAME, "css-47sehv"))))
            ele.click()
        else:
            print("no button")

        # neutral title heading
        try:
            heading = driver.find_element(By.TAG_NAME, "h1").text
        except Exception:
            heading = ""
            print("no heading found")

        print(heading)

        # date
        try:
            date = driver.find_element(By.CLASS_NAME, "date-display-single").text
        except Exception:
            date = ""

        # tags (an empty string, not a list, when the page has no tag block)
        try:
            tags = [a.text for a in driver.find_element(By.CLASS_NAME, "page-tags").find_elements(By.TAG_NAME, "a")]
        except Exception:
            tags = ""

        # access information in article divs
        for d in divs:
            if not check_exists_by_xpath(d):
                print("div not found")
                continue
            div = driver.find_elements(By.XPATH, d)[0]
            article = _extract_article(div)

            # add the article information (key order defines the column order)
            data.append({"url": article["url"], "date": date, "title": heading, "tags": tags, "heading": article["heading"], "source": article["source"], "text": article["text"], "bias_rating": article["bias_rating"]})
    finally:
        # quit() ends the browser session for this article, also after an error
        driver.quit()

    # clear output
    clear_output()
    # added a wait here to ensure the scraper runs well
    time.sleep(wait_time)

In [None]:
# Turn the list of scraped article records into a table and write it to disk.
output_path = "allsides_news_complete.csv"
df = pd.DataFrame(data=data)
df.to_csv(output_path)