{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7f8595ab-ad00-435e-9d18-f92b0993609e",
   "metadata": {},
   "source": [
    "## Import of required Modules\n",
    "Make sure, that the imported moduls are installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "911b1609-a4df-4311-92b5-67b1ac3d3957",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import re\n",
    "from tqdm import tqdm\n",
    "import time\n",
    "import numpy as np\n",
    "from IPython.display import clear_output \n",
    "import pandas as pd\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b078c40-20cc-46c4-9a09-2947812fcfe0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.chrome.options import Options\n",
    "from selenium.webdriver.chrome.service import Service"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38fc4c60-25aa-4d46-9d40-47b32a7e6c4f",
   "metadata": {},
   "source": [
    "### Settings\n",
    "change the settings to assert the scraper runs on your system. We reccomend to change the wait_time to a higher value if you run the scraper on a slower system or internet connection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a51084c-dea2-4290-adb5-0e07dfefda02",
   "metadata": {},
   "outputs": [],
   "source": [
    "wait_time = 0.25"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "788d1a88-6955-4e93-a558-54a8488e7697",
   "metadata": {},
   "source": [
    "#### set up Selenium and Chrome Driver \n",
    "We use selenium with Chrome and tested the scraper with the chromedriver. You need the latest version of the driver from https://chromedriver.chromium.org/. Alternatively, change to the driver to a driver of your preferance.\n",
    "We set up the scraper to run in the background, if you wish to run it in regular window mode, remove the line  \"chrome_options.add_argument(\"--headless\")\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94b6234d-8171-4a69-8547-4247ae6d3453",
   "metadata": {},
   "outputs": [],
   "source": [
    "# chromedriver setup\n",
    "\n",
    "serv = Service(r'driver/chromedriver') #path from 'which chromedriver'\n",
    "\n",
    "# test driver\n",
    "# for headless chrome mode\n",
    "chrome_options = Options()\n",
    "\n",
    "# remove this line if you do not wish to run in background \n",
    "chrome_options.add_argument(\"--headless\") "
   ]
  },
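  {
   "cell_type": "markdown",
   "id": "0a1b2c3d-4e5f-4a6b-8c7d-9e0f1a2b3c4d",
   "metadata": {},
   "source": [
    "As a quick sanity check of the driver setup, the following cell (a minimal sketch we added, assuming the chromedriver path above is correct) opens the AllSides start page and prints its title. If it fails, check the driver path and version."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b2c3d4e-5f6a-4b7c-8d8e-9f0a1b2c3d4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sanity check: open a page with the configured driver and print its title\n",
    "driver = webdriver.Chrome(service=serv, options=chrome_options)\n",
    "driver.get(\"https://www.allsides.com\")\n",
    "print(driver.title)\n",
    "driver.quit()"
   ]
  },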
  {
   "cell_type": "markdown",
   "id": "a153cf31-4c05-4b1c-8699-60ab5630dcd5",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Retrieve Article Links for Crawl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1389d55b-9b73-42a8-83f9-a10b26caa7e5",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# assemble list of links to all articles\n",
    "links = []\n",
    "\n",
    "# interact with cookie terms\n",
    "chrome_path = r'driver/chromedriver' #path from 'which chromedriver'\n",
    "chrome_options = Options()\n",
    "chrome_options.add_argument(\"--headless\") # open chrome in background\n",
    "driver = webdriver.Chrome(service=serv, options=chrome_options)\n",
    "driver.get('https://www.allsides.com/headline-roundups')\n",
    "wait = WebDriverWait(driver, 10)\n",
    "wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'css-47sehv')))\n",
    "\n",
    "ele = driver.find_element(By.CLASS_NAME, \"css-47sehv\")\n",
    "ele.click()\n",
    "\n",
    "# retireve number of pages \n",
    "last_page_button = driver.find_element(By.CLASS_NAME, \"pager-last\")\n",
    "link_last_page = last_page_button.find_elements(By.TAG_NAME, \"a\")\n",
    "t = link_last_page[0].get_attribute(\"href\")\n",
    "last_page_index = int(t[-3:])\n",
    "\n",
    "# retrieve links from start page\n",
    "main_table = driver.find_element(By.XPATH, \"//*[@id=\\\"block-views-de37fa32ea86f5545eb9b7722977a70d\\\"]/div/div[2]/table/tbody\") # table body\n",
    "\n",
    "rows = main_table.find_elements(By.TAG_NAME, \"tr\")\n",
    "for i in rows:\n",
    "    entry = i.find_elements(By.TAG_NAME, \"td\")\n",
    "    link = entry[0].find_element(By.TAG_NAME, \"a\")\n",
    "    links.append(link.get_attribute(\"href\"))\n",
    "WebDriverWait(driver, 20)\n",
    "\n",
    "# retrieve links for other pages\n",
    "for page in tqdm(range(2,last_page_index+1)):   # set to max number of pages\n",
    "    driver.get(\"https://www.allsides.com/headline-roundups?page=\"+str(page))\n",
    "    WebDriverWait(driver, 20)\n",
    "    main_table = driver.find_element(By.XPATH, \"//*[@id=\\\"block-views-de37fa32ea86f5545eb9b7722977a70d\\\"]/div/div[2]/table/tbody\") # table body\n",
    "\n",
    "    rows = main_table.find_elements(By.TAG_NAME, \"tr\")\n",
    "    for i in rows:\n",
    "        entry = i.find_elements(By.TAG_NAME, \"td\")\n",
    "        link = entry[0].find_element(By.TAG_NAME, \"a\")\n",
    "        links.append(link.get_attribute(\"href\"))\n",
    "\n",
    "driver.close()"
   ]
  },
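  {
   "cell_type": "markdown",
   "id": "2c3d4e5f-6a7b-4c8d-8e9f-0a1b2c3d4e5f",
   "metadata": {},
   "source": [
    "Optionally, deduplicate the collected links while preserving their order; overlapping pager pages or a re-run of the cell above can introduce duplicates. This is a small sketch we added, not part of the original pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d4e5f6a-7b8c-4d9e-8f0a-1b2c3d4e5f6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# optional: drop duplicate links while keeping the original order\n",
    "print(\"links collected:\", len(links))\n",
    "links = list(dict.fromkeys(links))\n",
    "print(\"unique links:\", len(links))"
   ]
  },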
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d21eb71b-45c5-40df-b103-7bca92a26096",
   "metadata": {},
   "outputs": [],
   "source": [
    "# exports links as pickle file\n",
    "with open(\"linklist_allsides_news.pickle\", \"wb\") as f:\n",
    "    pickle.dump(links, f, pickle.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e39f557-5a25-4159-b7ef-7d1e5d06faa6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# exports links as csv\n",
    "with open(\"linklist_allsides_news.csv\", \"w\") as f:\n",
    "    for line in links:\n",
    "        print(line, file=f)"
   ]
  },
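  {
   "cell_type": "markdown",
   "id": "4e5f6a7b-8c9d-4e0f-8a1b-2c3d4e5f6a7b",
   "metadata": {},
   "source": [
    "To verify the export, the CSV can be read back in; a minimal check we added, assuming the file written by the cell above (it has no header row)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f6a7b8c-9d0e-4f1a-8b2c-3d4e5f6a7b8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# verify the export by reading the link list back in\n",
    "check = pd.read_csv(\"linklist_allsides_news.csv\", header=None, names=[\"url\"])\n",
    "print(len(check), \"links exported\")\n",
    "check.head()"
   ]
  },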
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fde818ff-8d37-4e5e-bd97-b567414ec9c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# functions that assert the existence of \n",
    "def check_exists_by_xpath(xpath):\n",
    "    try:\n",
    "        driver.find_elements(By.XPATH, xpath)[0]\n",
    "    except:\n",
    "        return False\n",
    "    return True\n",
    "\n",
    "def check_exists_by_class(inp):\n",
    "    try:\n",
    "        driver.find_elements(By.CLASS_NAME, inp)[0]\n",
    "    except:\n",
    "        return False\n",
    "    return True"
   ]
  },
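  {
   "cell_type": "markdown",
   "id": "6a7b8c9d-0e1f-4a2b-8c3d-4e5f6a7b8c9d",
   "metadata": {},
   "source": [
    "A short usage example for the helpers (our addition); note that they read the module-level `driver` variable, so a driver must be open."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b8c9d0e-1f2a-4b3c-8d4e-5f6a7b8c9d0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# usage example: the helpers operate on the global driver\n",
    "driver = webdriver.Chrome(service=serv, options=chrome_options)\n",
    "driver.get(\"https://www.allsides.com/headline-roundups\")\n",
    "print(check_exists_by_class(\"css-47sehv\"))  # cookie banner present?\n",
    "print(check_exists_by_xpath(\"//h1\"))  # page heading present?\n",
    "driver.quit()"
   ]
  },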
  {
   "cell_type": "markdown",
   "id": "411f5734-5415-45ac-ad47-edad75ceeb8f",
   "metadata": {},
   "source": [
    "## Retrieve Articles\n",
    "This section of the crawler retrieves all available news articles from AllSides along with the available information and bias tags."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "068d8b13-8ad6-47e9-acba-94020b6900a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load link list from pickle file\n",
    "with open(\"linklist_allsides_news.pickle\", \"rb\") as f:\n",
    "    links = pickle.load(f)"
   ]
  },
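  {
   "cell_type": "markdown",
   "id": "8c9d0e1f-2a3b-4c4d-8e5f-6a7b8c9d0e1f",
   "metadata": {},
   "source": [
    "A quick look at the loaded link list before starting the long crawl (our addition)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d0e1f2a-3b4c-4d5e-8f6a-7b8c9d0e1f2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# inspect the loaded link list\n",
    "print(len(links), \"links loaded\")\n",
    "links[:5]"
   ]
  },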
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4a05fcc-8ba5-45fa-a79a-3da26b3a12c8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# list for results \n",
    "data = []\n",
    "\n",
    "# retrieve information from articles in list of links\n",
    "for li in tqdm(links):\n",
    "    time.sleep(wait_time)\n",
    "    driver = webdriver.Chrome(service=serv, options=chrome_options)\n",
    "    print(li)\n",
    "    # open URL\n",
    "    driver.get(li)\n",
    "    \n",
    "    # interact with pop-up window\n",
    "    if check_exists_by_class(\"css-47sehv\"):\n",
    "        ele = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((driver.find_element(By.CLASS_NAME, \"css-47sehv\"))))\n",
    "        ele.click()\n",
    "    else:\n",
    "        print(\"no button\")\n",
    "    \n",
    "    # netral title heading \n",
    "    try:\n",
    "        heading = driver.find_element(By.TAG_NAME, \"h1\").text\n",
    "    except:\n",
    "        print(\"no heading found\")\n",
    "    \n",
    "    print(heading)\n",
    "    \n",
    "    # date\n",
    "    try:\n",
    "        date = driver.find_element(By.CLASS_NAME, \"date-display-single\").text\n",
    "    except:\n",
    "        date = \"\"\n",
    "    \n",
    "    # tags\n",
    "    try:\n",
    "        tags = [a.text for a in driver.find_element(By.CLASS_NAME, \"page-tags\").find_elements(By.TAG_NAME, \"a\")]\n",
    "    except:\n",
    "        tags = \"\"\n",
    "        \n",
    "    \n",
    "    # define XPATH inforamtion for article divs\n",
    "    divs = [\"/html/body/div[4]/div/div/div/div[4]/div/div/div/div[1]\", \"/html/body/div[4]/div/div/div/div[4]/div/div/div/div[2]\", \"/html/body/div[4]/div/div/div/div[4]/div/div/div/div[3]\"]\n",
    "    \n",
    "    # access information in article divs\n",
    "    for d in divs:\n",
    "        if check_exists_by_xpath(d):\n",
    "            div = driver.find_elements(By.XPATH, d)[0]\n",
    "\n",
    "            # check heading element to find out left/center/right. The title contains the bias label that we can retrieve from the text here\n",
    "            try:\n",
    "                cat = div.find_element(By.TAG_NAME, \"h3\").text\n",
    "            except: \n",
    "                print(\"no headline found\")\n",
    "                \n",
    "            # retrieve link to original article\n",
    "            try:\n",
    "                link = div.find_element(By.TAG_NAME, \"a\").get_attribute(\"href\")\n",
    "                print(link)\n",
    "            except:\n",
    "                print(\"no link found\")\n",
    "                \n",
    "            # left/center/right are shuffled for each article, some roundups have e.g. only left and right articles. \n",
    "            # Thus, we have look to look each article seperately\n",
    "            time.sleep(0.2)\n",
    "            if \"Left\" in cat:\n",
    "                print(\"left\")\n",
    "                try:\n",
    "                    left_heading = div.find_element(By.CLASS_NAME, \"news-title\").text # heading       \n",
    "                except:\n",
    "                    left_heading = \"\"\n",
    "                    print(\"no headline found\")\n",
    "                try:\n",
    "                    left_source = div.find_element(By.CLASS_NAME, \"source-area\").find_element(By.TAG_NAME, \"span\").text #source\n",
    "                except:\n",
    "                    left_source = \"\"\n",
    "                    print(\"no source found\")\n",
    "                try:\n",
    "                    left_text = div.find_element(By.CLASS_NAME, \"news-body\").find_element(By.CLASS_NAME, \"body-contents\").text # news text body-contents\n",
    "                except:\n",
    "                    left_text = \"\"\n",
    "                    print(\"no text found\")\n",
    "                    \n",
    "                # add the article information\n",
    "                data.append({\"url\":link, \"date\":date, \"title\": heading, \"tags\": tags, \"heading\":left_heading, \"source\": left_source, \"text\": left_text, \"bias_rating\": \"left\"})\n",
    "\n",
    "            elif \"Right\" in cat:\n",
    "                print(\"right\")\n",
    "                try:\n",
    "                    right_heading = div.find_element(By.CLASS_NAME, \"news-title\").text # heading\n",
    "                except:\n",
    "                    right_heading = \"\"\n",
    "                    print(\"no headline found\")\n",
    "                try:\n",
    "                    right_source = div.find_element(By.CLASS_NAME, \"source-area\").find_element(By.TAG_NAME, \"span\").text #source \n",
    "                except:\n",
    "                    right_source = \"\"\n",
    "                    print(\"no source found\")\n",
    "                try:\n",
    "                    right_text = div.find_element(By.CLASS_NAME, \"news-body\").find_element(By.CLASS_NAME, \"body-contents\").text # news text\n",
    "                except:\n",
    "                    right_text = \"\"\n",
    "                    print(\"no text found\")\n",
    "                \n",
    "                # add the article information\n",
    "                data.append({\"url\":link, \"date\":date, \"title\": heading, \"tags\": tags, \"heading\":right_heading, \"source\": right_source, \"text\": right_text, \"bias_rating\": \"right\"}) \n",
    "\n",
    "            else:\n",
    "                print(\"center\")\n",
    "                try:\n",
    "                    center_heading = div.find_element(By.CLASS_NAME, \"news-title\").text # heading\n",
    "                except:\n",
    "                    center_heading = \"\"\n",
    "                    print(\"no headline found\")\n",
    "                try:\n",
    "                    center_source = div.find_element(By.CLASS_NAME, \"source-area\").find_element(By.TAG_NAME, \"span\").text #source \n",
    "                except:\n",
    "                    center_source = \"\"\n",
    "                    print(\"no source found\")\n",
    "                try:\n",
    "                    center_text = div.find_element(By.CLASS_NAME, \"news-body\").find_element(By.CLASS_NAME, \"body-contents\").text # news text\n",
    "                except:\n",
    "                    center_text = \"\"\n",
    "                    print(\"no text found\")\n",
    "                    \n",
    "                # add the article information\n",
    "                data.append({\"url\":link, \"date\":date, \"title\": heading, \"tags\": tags, \"heading\":center_heading, \"source\": center_source, \"text\": center_text, \"bias_rating\": \"center\"})\n",
    "        else:\n",
    "            print(\"div not found\")\n",
    "    \n",
    "    driver.close()\n",
    "    \n",
    "    # clear output\n",
    "    clear_output()\n",
    "    # added a wait here to assert the scraper runs well\n",
    "    time.sleep(wait_time)"
   ]
  },
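  {
   "cell_type": "markdown",
   "id": "0e1f2a3b-4c5d-4e6f-8a7b-8c9d0e1f2a3b",
   "metadata": {},
   "source": [
    "Before exporting, it can help to check how the scraped articles are distributed over the bias ratings; a minimal sketch we added over the `data` list built above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f2a3b4c-5d6e-4f7a-8b8c-9d0e1f2a3b4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sanity check: distribution of the scraped articles over the bias ratings\n",
    "from collections import Counter\n",
    "print(len(data), \"articles scraped\")\n",
    "print(Counter(entry[\"bias_rating\"] for entry in data))"
   ]
  },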
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2b06a89-ddec-4572-ba28-0856fbb4d44d",
   "metadata": {},
   "outputs": [],
   "source": [
    "## convert data to dataframe\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# export scraped articles\n",
    "df.to_csv(\"allsides_news_complete.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}