{ "cells": [ { "cell_type": "markdown", "id": "7f8595ab-ad00-435e-9d18-f92b0993609e", "metadata": {}, "source": [ "## Import of required Modules\n", "Make sure, that the imported moduls are installed." ] }, { "cell_type": "code", "execution_count": null, "id": "911b1609-a4df-4311-92b5-67b1ac3d3957", "metadata": {}, "outputs": [], "source": [ "import pickle\n", "import re\n", "from tqdm import tqdm\n", "import time\n", "import numpy as np\n", "from IPython.display import clear_output \n", "import pandas as pd\n", "import time" ] }, { "cell_type": "code", "execution_count": null, "id": "8b078c40-20cc-46c4-9a09-2947812fcfe0", "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.chrome.options import Options\n", "from selenium.webdriver.chrome.service import Service" ] }, { "cell_type": "markdown", "id": "38fc4c60-25aa-4d46-9d40-47b32a7e6c4f", "metadata": {}, "source": [ "### Settings\n", "change the settings to assert the scraper runs on your system. We reccomend to change the wait_time to a higher value if you run the scraper on a slower system or internet connection." ] }, { "cell_type": "code", "execution_count": null, "id": "8a51084c-dea2-4290-adb5-0e07dfefda02", "metadata": {}, "outputs": [], "source": [ "wait_time = 0.25" ] }, { "cell_type": "markdown", "id": "788d1a88-6955-4e93-a558-54a8488e7697", "metadata": {}, "source": [ "#### set up Selenium and Chrome Driver \n", "We use selenium with Chrome and tested the scraper with the chromedriver. You need the latest version of the driver from https://chromedriver.chromium.org/. Alternatively, change to the driver to a driver of your preferance.\n", "We set up the scraper to run in the background, if you wish to run it in regular window mode, remove the line \"chrome_options.add_argument(\"--headless\")\"." ] }, { "cell_type": "code", "execution_count": null, "id": "94b6234d-8171-4a69-8547-4247ae6d3453", "metadata": {}, "outputs": [], "source": [ "# chromedriver setup\n", "\n", "serv = Service(r'driver/chromedriver') #path from 'which chromedriver'\n", "\n", "# test driver\n", "# for headless chrome mode\n", "chrome_options = Options()\n", "\n", "# remove this line if you do not wish to run in background \n", "chrome_options.add_argument(\"--headless\") " ] }, { "cell_type": "markdown", "id": "a153cf31-4c05-4b1c-8699-60ab5630dcd5", "metadata": { "tags": [] }, "source": [ "### Retrieve Article Links for Crawl" ] }, { "cell_type": "code", "execution_count": null, "id": "1389d55b-9b73-42a8-83f9-a10b26caa7e5", "metadata": { "tags": [] }, "outputs": [], "source": [ "# assemble list of links to all articles\n", "links = []\n", "\n", "# interact with cookie terms\n", "chrome_path = r'driver/chromedriver' #path from 'which chromedriver'\n", "chrome_options = Options()\n", "chrome_options.add_argument(\"--headless\") # open chrome in background\n", "driver = webdriver.Chrome(service=serv, options=chrome_options)\n", "driver.get('https://www.allsides.com/headline-roundups')\n", "wait = WebDriverWait(driver, 10)\n", "wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'css-47sehv')))\n", "\n", "ele = driver.find_element(By.CLASS_NAME, \"css-47sehv\")\n", "ele.click()\n", "\n", "# retireve number of pages \n", "last_page_button = driver.find_element(By.CLASS_NAME, \"pager-last\")\n", "link_last_page = last_page_button.find_elements(By.TAG_NAME, \"a\")\n", "t = link_last_page[0].get_attribute(\"href\")\n", "last_page_index = int(t[-3:])\n", "\n", "# retrieve links from start page\n", "main_table = driver.find_element(By.XPATH, \"//*[@id=\\\"block-views-de37fa32ea86f5545eb9b7722977a70d\\\"]/div/div[2]/table/tbody\") # table body\n", "\n", "rows = main_table.find_elements(By.TAG_NAME, \"tr\")\n", "for i in rows:\n", " entry = i.find_elements(By.TAG_NAME, \"td\")\n", " link = entry[0].find_element(By.TAG_NAME, \"a\")\n", " links.append(link.get_attribute(\"href\"))\n", "WebDriverWait(driver, 20)\n", "\n", "# retrieve links for other pages\n", "for page in tqdm(range(2,last_page_index+1)): # set to max number of pages\n", " driver.get(\"https://www.allsides.com/headline-roundups?page=\"+str(page))\n", " WebDriverWait(driver, 20)\n", " main_table = driver.find_element(By.XPATH, \"//*[@id=\\\"block-views-de37fa32ea86f5545eb9b7722977a70d\\\"]/div/div[2]/table/tbody\") # table body\n", "\n", " rows = main_table.find_elements(By.TAG_NAME, \"tr\")\n", " for i in rows:\n", " entry = i.find_elements(By.TAG_NAME, \"td\")\n", " link = entry[0].find_element(By.TAG_NAME, \"a\")\n", " links.append(link.get_attribute(\"href\"))\n", "\n", "driver.close()" ] }, { "cell_type": "code", "execution_count": null, "id": "d21eb71b-45c5-40df-b103-7bca92a26096", "metadata": {}, "outputs": [], "source": [ "# exports links as pickle file\n", "with open(\"linklist_allsides_news.pickle\", \"wb\") as f:\n", " pickle.dump(links, f, pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": null, "id": "6e39f557-5a25-4159-b7ef-7d1e5d06faa6", "metadata": {}, "outputs": [], "source": [ "# exports links as csv\n", "with open(\"linklist_allsides_news.csv\", \"w\") as f:\n", " for line in links:\n", " print(line, file=f)" ] }, { "cell_type": "code", "execution_count": null, "id": "fde818ff-8d37-4e5e-bd97-b567414ec9c1", "metadata": {}, "outputs": [], "source": [ "# functions that assert the existence of \n", "def check_exists_by_xpath(xpath):\n", " try:\n", " driver.find_elements(By.XPATH, xpath)[0]\n", " except:\n", " return False\n", " return True\n", "\n", "def check_exists_by_class(inp):\n", " try:\n", " driver.find_elements(By.CLASS_NAME, inp)[0]\n", " except:\n", " return False\n", " return True" ] }, { "cell_type": "markdown", "id": "411f5734-5415-45ac-ad47-edad75ceeb8f", "metadata": {}, "source": [ "## Retrieve Articles\n", "This section of the crawler retrieves all available news articles from AllSides along with the available information and bias tags." ] }, { "cell_type": "code", "execution_count": null, "id": "068d8b13-8ad6-47e9-acba-94020b6900a4", "metadata": {}, "outputs": [], "source": [ "# load link list from pickle file\n", "with open(\"linklist_allsides_news.pickle\", \"rb\") as f:\n", " links = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": null, "id": "c4a05fcc-8ba5-45fa-a79a-3da26b3a12c8", "metadata": { "tags": [] }, "outputs": [], "source": [ "# list for results \n", "data = []\n", "\n", "# retrieve information from articles in list of links\n", "for li in tqdm(links):\n", " time.sleep(wait_time)\n", " driver = webdriver.Chrome(service=serv, options=chrome_options)\n", " print(li)\n", " # open URL\n", " driver.get(li)\n", " \n", " # interact with pop-up window\n", " if check_exists_by_class(\"css-47sehv\"):\n", " ele = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((driver.find_element(By.CLASS_NAME, \"css-47sehv\"))))\n", " ele.click()\n", " else:\n", " print(\"no button\")\n", " \n", " # netral title heading \n", " try:\n", " heading = driver.find_element(By.TAG_NAME, \"h1\").text\n", " except:\n", " print(\"no heading found\")\n", " \n", " print(heading)\n", " \n", " # date\n", " try:\n", " date = driver.find_element(By.CLASS_NAME, \"date-display-single\").text\n", " except:\n", " date = \"\"\n", " \n", " # tags\n", " try:\n", " tags = [a.text for a in driver.find_element(By.CLASS_NAME, \"page-tags\").find_elements(By.TAG_NAME, \"a\")]\n", " except:\n", " tags = \"\"\n", " \n", " \n", " # define XPATH inforamtion for article divs\n", " divs = [\"/html/body/div[4]/div/div/div/div[4]/div/div/div/div[1]\", \"/html/body/div[4]/div/div/div/div[4]/div/div/div/div[2]\", \"/html/body/div[4]/div/div/div/div[4]/div/div/div/div[3]\"]\n", " \n", " # access information in article divs\n", " for d in divs:\n", " if check_exists_by_xpath(d):\n", " div = driver.find_elements(By.XPATH, d)[0]\n", "\n", " # check heading element to find out left/center/right. The title contains the bias label that we can retrieve from the text here\n", " try:\n", " cat = div.find_element(By.TAG_NAME, \"h3\").text\n", " except: \n", " print(\"no headline found\")\n", " \n", " # retrieve link to original article\n", " try:\n", " link = div.find_element(By.TAG_NAME, \"a\").get_attribute(\"href\")\n", " print(link)\n", " except:\n", " print(\"no link found\")\n", " \n", " # left/center/right are shuffled for each article, some roundups have e.g. only left and right articles. \n", " # Thus, we have look to look each article seperately\n", " time.sleep(0.2)\n", " if \"Left\" in cat:\n", " print(\"left\")\n", " try:\n", " left_heading = div.find_element(By.CLASS_NAME, \"news-title\").text # heading \n", " except:\n", " left_heading = \"\"\n", " print(\"no headline found\")\n", " try:\n", " left_source = div.find_element(By.CLASS_NAME, \"source-area\").find_element(By.TAG_NAME, \"span\").text #source\n", " except:\n", " left_source = \"\"\n", " print(\"no source found\")\n", " try:\n", " left_text = div.find_element(By.CLASS_NAME, \"news-body\").find_element(By.CLASS_NAME, \"body-contents\").text # news text body-contents\n", " except:\n", " left_text = \"\"\n", " print(\"no text found\")\n", " \n", " # add the article information\n", " data.append({\"url\":link, \"date\":date, \"title\": heading, \"tags\": tags, \"heading\":left_heading, \"source\": left_source, \"text\": left_text, \"bias_rating\": \"left\"})\n", "\n", " elif \"Right\" in cat:\n", " print(\"right\")\n", " try:\n", " right_heading = div.find_element(By.CLASS_NAME, \"news-title\").text # heading\n", " except:\n", " right_heading = \"\"\n", " print(\"no headline found\")\n", " try:\n", " right_source = div.find_element(By.CLASS_NAME, \"source-area\").find_element(By.TAG_NAME, \"span\").text #source \n", " except:\n", " right_source = \"\"\n", " print(\"no source found\")\n", " try:\n", " right_text = div.find_element(By.CLASS_NAME, \"news-body\").find_element(By.CLASS_NAME, \"body-contents\").text # news text\n", " except:\n", " right_text = \"\"\n", " print(\"no text found\")\n", " \n", " # add the article information\n", " data.append({\"url\":link, \"date\":date, \"title\": heading, \"tags\": tags, \"heading\":right_heading, \"source\": right_source, \"text\": right_text, \"bias_rating\": \"right\"}) \n", "\n", " else:\n", " print(\"center\")\n", " try:\n", " center_heading = div.find_element(By.CLASS_NAME, \"news-title\").text # heading\n", " except:\n", " center_heading = \"\"\n", " print(\"no headline found\")\n", " try:\n", " center_source = div.find_element(By.CLASS_NAME, \"source-area\").find_element(By.TAG_NAME, \"span\").text #source \n", " except:\n", " center_source = \"\"\n", " print(\"no source found\")\n", " try:\n", " center_text = div.find_element(By.CLASS_NAME, \"news-body\").find_element(By.CLASS_NAME, \"body-contents\").text # news text\n", " except:\n", " center_text = \"\"\n", " print(\"no text found\")\n", " \n", " # add the article information\n", " data.append({\"url\":link, \"date\":date, \"title\": heading, \"tags\": tags, \"heading\":center_heading, \"source\": center_source, \"text\": center_text, \"bias_rating\": \"center\"})\n", " else:\n", " print(\"div not found\")\n", " \n", " driver.close()\n", " \n", " # clear output\n", " clear_output()\n", " # added a wait here to assert the scraper runs well\n", " time.sleep(wait_time)" ] }, { "cell_type": "code", "execution_count": null, "id": "d2b06a89-ddec-4572-ba28-0856fbb4d44d", "metadata": {}, "outputs": [], "source": [ "## convert data to dataframe\n", "df = pd.DataFrame(data)\n", "\n", "# export scraped articles\n", "df.to_csv(\"allsides_news_complete.csv\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 5 }