diff --git "a/src/04_use_case/forum/buergergeld_forum.ipynb" "b/src/04_use_case/forum/buergergeld_forum.ipynb" --- "a/src/04_use_case/forum/buergergeld_forum.ipynb" +++ "b/src/04_use_case/forum/buergergeld_forum.ipynb" @@ -1,19 +1,18 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: \"Web Scraping von Foren: Bürgergeld Forum\"\n", + "description: \"Ein Tool zur Extraktion und Analyse von Forenbeiträgen aus dem Bürgergeld Forum, einschließlich der Verarbeitung und Visualisierung der Daten.\"\n", + "author: \"Benjamin\"\n", + "date: \"2024-12-16\"\n", + "date-modified: \"2024-12-16\"\n", + "---" + ] + }, { "cell_type": "code", "execution_count": 186, @@ -65,41 +64,7 @@ }, { "cell_type": "code", - "source": [ - "# prompt: bitte loope über alle html dateien im ordner buergergeld_forum und extrahiere die texte der elemente
  • und
  • und speichere sie in einem pandas dataframe\n", - "\n", - "import pandas as pd\n", - "import os\n", - "from bs4 import BeautifulSoup\n", - "\n", - "def extract_data(directory):\n", - " data = []\n", - " for filename in os.listdir(directory):\n", - " if filename.endswith(\".html\"):\n", - " filepath = os.path.join(directory, filename)\n", - " with open(filepath, \"r\", encoding=\"utf-8\") as file:\n", - " html_content = file.read()\n", - " soup = BeautifulSoup(html_content, \"html.parser\")\n", - "\n", - " # Find all relevant elements on the page\n", - " for subject, stats, last_post in zip(soup.find_all(\"li\", class_=\"columnSubject\"),\n", - " soup.find_all(\"li\", class_=\"columnStats\"),\n", - " soup.find_all(\"li\", class_=\"columnLastPost\")):\n", - " #Extract text from each element, handle potential missing elements gracefully\n", - " subject_text = subject.text.strip() if subject else \"\"\n", - " stats_text = stats.text.strip() if stats else \"\"\n", - " last_post_text = last_post.text.strip() if last_post else \"\"\n", - "\n", - " data.append([subject_text, stats_text, last_post_text])\n", - "\n", - " return pd.DataFrame(data, columns=[\"Subject\", \"Stats\", \"LastPost\"])\n", - "\n", - "\n", - "# Example usage\n", - "directory = \"buergergeld_forum\"\n", - "df = extract_data(directory)\n", - "df" - ], + "execution_count": 187, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -108,53 +73,14 @@ "id": "g66NfdfP_J0m", "outputId": "badb7443-8c3b-404f-ca67-fc5b5eb8500c" }, - "execution_count": 187, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " Subject \\\n", - "0 ALG II Weiterbewilligung - wer soll in die Bed... \n", - "1 ALG II Antrag über EService - übermittelt Antr... \n", - "2 ALG II Antrag abgelehnt ohne Begründung - Anre... \n", - "3 Umzugskosten als Pauschale und Ersteinrichtung... \n", - "4 Falscher Bescheid ALG II\\n4\\n\\n\\n\\n\\n\\n\\n\\n\\n\\... \n", - "... ... \n", - "2825 Aufhebungsvertrag in der Ausbildung.\\n4\\n\\n\\n\\... \n", - "2826 Voraussetzungen für Erstausstattung\\n9\\n\\n\\n\\n... \n", - "2827 Bedarfsgemeinschaft wegen Zuwachs mit Partner ... \n", - "2828 Ein Monat kein Kita Beitrag trotz ALG II\\n3\\n\\... \n", - "2829 Überleitung vom Jugendamt zum Jobcenter?\\n16\\n... \n", - "\n", - " Stats \\\n", - "0 Antworten\\n3\\n\\n\\nZugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t... \n", - "1 Antworten\\n2\\n\\n\\nZugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t... \n", - "2 Antworten\\n30\\n\\n\\nZugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t... \n", - "3 Antworten\\n7\\n\\n\\nZugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t... \n", - "4 Antworten\\n4\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n", - "... ... \n", - "2825 Antworten\\n4\\n\\n\\nZugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t... \n", - "2826 Antworten\\n9\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n", - "2827 Antworten\\n4\\n\\n\\nZugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t... \n", - "2828 Antworten\\n3\\n\\n\\nZugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t... \n", - "2829 Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t... \n", - "\n", - " LastPost \n", - "0 jsn73 \\n3. Oktober 2017 um 19:18 \n", - "1 Corinna \\n2. Oktober 2017 um 17:46 \n", - "2 Celen \\n1. Oktober 2017 um 23:22 \n", - "3 Casa \\n1. Oktober 2017 um 11:54 \n", - "4 Casa \\n30. September 2017 um 18:46 \n", - "... ... \n", - "2825 Bass386 \\n21. Februar 2017 um 13:28 \n", - "2826 Hoppel \\n21. Februar 2017 um 04:27 \n", - "2827 Hoppel \\n20. Februar 2017 um 04:30 \n", - "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 \n", - "2829 Nza Kpl \\n19. Februar 2017 um 05:20 \n", - "\n", - "[2830 rows x 3 columns]" - ], + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df" + }, "text/html": [ "\n", "
    \n", @@ -517,44 +443,6 @@ "
    \n", " \n" ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "df", - "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 187 - } - ] - }, - { - "cell_type": "code", - "source": [ - "# prompt: bitte separariere die spalte df.lastpost in nutzername last post und datum last post indem die spalte an der stelle \" \\n\" getrennt wird\n", - "\n", - "import pandas as pd\n", - "\n", - "# Assuming df is your DataFrame and it has a column named 'LastPost'\n", - "# Split the 'LastPost' column at the newline character '\\n'\n", - "df[['User_LastPost', 'Date_LastPost']] = df['LastPost'].str.split('\\n', n=1, expand=True)\n", - "\n", - "# Display the updated DataFrame\n", - "df" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 545 - }, - "id": "Kw53dxdTCjcL", - "outputId": "445b1058-8af5-4d65-d510-32a506125453" - }, - "execution_count": 188, - "outputs": [ - { - "output_type": "execute_result", - "data": { "text/plain": [ " Subject \\\n", "0 ALG II Weiterbewilligung - wer soll in die Bed... \n", @@ -582,34 +470,82 @@ "2828 Antworten\\n3\\n\\n\\nZugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t... \n", "2829 Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t... \n", "\n", - " LastPost User_LastPost \\\n", - "0 jsn73 \\n3. Oktober 2017 um 19:18 jsn73 \n", - "1 Corinna \\n2. Oktober 2017 um 17:46 Corinna \n", - "2 Celen \\n1. Oktober 2017 um 23:22 Celen \n", - "3 Casa \\n1. Oktober 2017 um 11:54 Casa \n", - "4 Casa \\n30. September 2017 um 18:46 Casa \n", - "... ... ... \n", - "2825 Bass386 \\n21. Februar 2017 um 13:28 Bass386 \n", - "2826 Hoppel \\n21. Februar 2017 um 04:27 Hoppel \n", - "2827 Hoppel \\n20. Februar 2017 um 04:30 Hoppel \n", - "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 Phillip.1977 \n", - "2829 Nza Kpl \\n19. Februar 2017 um 05:20 Nza Kpl \n", - "\n", - " Date_LastPost \n", - "0 3. Oktober 2017 um 19:18 \n", - "1 2. Oktober 2017 um 17:46 \n", - "2 1. Oktober 2017 um 23:22 \n", - "3 1. Oktober 2017 um 11:54 \n", - "4 30. September 2017 um 18:46 \n", - "... ... \n", - "2825 21. Februar 2017 um 13:28 \n", - "2826 21. Februar 2017 um 04:27 \n", - "2827 20. Februar 2017 um 04:30 \n", - "2828 19. Februar 2017 um 13:07 \n", - "2829 19. Februar 2017 um 05:20 \n", + " LastPost \n", + "0 jsn73 \\n3. Oktober 2017 um 19:18 \n", + "1 Corinna \\n2. Oktober 2017 um 17:46 \n", + "2 Celen \\n1. Oktober 2017 um 23:22 \n", + "3 Casa \\n1. Oktober 2017 um 11:54 \n", + "4 Casa \\n30. September 2017 um 18:46 \n", + "... ... \n", + "2825 Bass386 \\n21. Februar 2017 um 13:28 \n", + "2826 Hoppel \\n21. Februar 2017 um 04:27 \n", + "2827 Hoppel \\n20. Februar 2017 um 04:30 \n", + "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 \n", + "2829 Nza Kpl \\n19. Februar 2017 um 05:20 \n", "\n", - "[2830 rows x 5 columns]" - ], + "[2830 rows x 3 columns]" + ] + }, + "execution_count": 187, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# prompt: bitte loope über alle html dateien im ordner buergergeld_forum und extrahiere die texte der elemente
  • und
  • und speichere sie in einem pandas dataframe\n", + "\n", + "import pandas as pd\n", + "import os\n", + "from bs4 import BeautifulSoup\n", + "\n", + "def extract_data(directory):\n", + " data = []\n", + " for filename in os.listdir(directory):\n", + " if filename.endswith(\".html\"):\n", + " filepath = os.path.join(directory, filename)\n", + " with open(filepath, \"r\", encoding=\"utf-8\") as file:\n", + " html_content = file.read()\n", + " soup = BeautifulSoup(html_content, \"html.parser\")\n", + "\n", + " # Find all relevant elements on the page\n", + " for subject, stats, last_post in zip(soup.find_all(\"li\", class_=\"columnSubject\"),\n", + " soup.find_all(\"li\", class_=\"columnStats\"),\n", + " soup.find_all(\"li\", class_=\"columnLastPost\")):\n", + " #Extract text from each element, handle potential missing elements gracefully\n", + " subject_text = subject.text.strip() if subject else \"\"\n", + " stats_text = stats.text.strip() if stats else \"\"\n", + " last_post_text = last_post.text.strip() if last_post else \"\"\n", + "\n", + " data.append([subject_text, stats_text, last_post_text])\n", + "\n", + " return pd.DataFrame(data, columns=[\"Subject\", \"Stats\", \"LastPost\"])\n", + "\n", + "\n", + "# Example usage\n", + "directory = \"buergergeld_forum\"\n", + "df = extract_data(directory)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 545 + }, + "id": "Kw53dxdTCjcL", + "outputId": "445b1058-8af5-4d65-d510-32a506125453" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df" + }, "text/html": [ "\n", "
    \n", @@ -996,41 +932,6 @@ "
    \n", " \n" ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "df", - "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 188 - } - ] - }, - { - "cell_type": "code", - "source": [ - "# prompt: bitte teile die spalte df.Stats in zwei spalten an der stelle des ersten vorkommens von \"\\n\\n\\n\". nenne die erste neue spalte Antworten und die zweite neue Spalte Zugriffe\n", - "\n", - "# Split the 'Stats' column at the first occurrence of \"\\n\\n\\n\"\n", - "df[['Antworten', 'Zugriffe']] = df['Stats'].str.split('\\n\\n\\n', n=1, expand=True)\n", - "\n", - "# Display the updated DataFrame\n", - "df" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "OmnvftgTCiHS", - "outputId": "28abbd8c-75ed-4f1a-94e1-9434f589cd2c" - }, - "execution_count": 189, - "outputs": [ - { - "output_type": "execute_result", - "data": { "text/plain": [ " Subject \\\n", "0 ALG II Weiterbewilligung - wer soll in die Bed... \n", @@ -1071,34 +972,59 @@ "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 Phillip.1977 \n", "2829 Nza Kpl \\n19. Februar 2017 um 05:20 Nza Kpl \n", "\n", - " Date_LastPost Antworten \\\n", - "0 3. Oktober 2017 um 19:18 Antworten\\n3 \n", - "1 2. Oktober 2017 um 17:46 Antworten\\n2 \n", - "2 1. Oktober 2017 um 23:22 Antworten\\n30 \n", - "3 1. Oktober 2017 um 11:54 Antworten\\n7 \n", - "4 30. September 2017 um 18:46 Antworten\\n4 \n", - "... ... ... \n", - "2825 21. Februar 2017 um 13:28 Antworten\\n4 \n", - "2826 21. Februar 2017 um 04:27 Antworten\\n9 \n", - "2827 20. Februar 2017 um 04:30 Antworten\\n4 \n", - "2828 19. Februar 2017 um 13:07 Antworten\\n3 \n", - "2829 19. Februar 2017 um 05:20 Antworten\\n16 \n", - "\n", - " Zugriffe \n", - "0 Zugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n", - "1 Zugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t2 \n", - "2 Zugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t30 \n", - "3 Zugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t7 \n", - "4 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n", - "... ... \n", - "2825 Zugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n", - "2826 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t9 \n", - "2827 Zugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n", - "2828 Zugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n", - "2829 Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16 \n", + " Date_LastPost \n", + "0 3. Oktober 2017 um 19:18 \n", + "1 2. Oktober 2017 um 17:46 \n", + "2 1. Oktober 2017 um 23:22 \n", + "3 1. Oktober 2017 um 11:54 \n", + "4 30. September 2017 um 18:46 \n", + "... ... \n", + "2825 21. Februar 2017 um 13:28 \n", + "2826 21. Februar 2017 um 04:27 \n", + "2827 20. Februar 2017 um 04:30 \n", + "2828 19. Februar 2017 um 13:07 \n", + "2829 19. Februar 2017 um 05:20 \n", "\n", - "[2830 rows x 7 columns]" - ], + "[2830 rows x 5 columns]" + ] + }, + "execution_count": 188, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# prompt: bitte separariere die spalte df.lastpost in nutzername last post und datum last post indem die spalte an der stelle \" \\n\" getrennt wird\n", + "\n", + "import pandas as pd\n", + "\n", + "# Assuming df is your DataFrame and it has a column named 'LastPost'\n", + "# Split the 'LastPost' column at the newline character '\\n'\n", + "df[['User_LastPost', 'Date_LastPost']] = df['LastPost'].str.split('\\n', n=1, expand=True)\n", + "\n", + "# Display the updated DataFrame\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "OmnvftgTCiHS", + "outputId": "28abbd8c-75ed-4f1a-94e1-9434f589cd2c" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Antworten\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 37,\n \"samples\": [\n \"Antworten\\n12\",\n \"Antworten\\n18\",\n \"Antworten\\n4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Zugriffe\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Zugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Zugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe", + "variable_name": "df" + }, "text/html": [ "\n", "
    \n", @@ -1509,33 +1435,111 @@ "
    \n", " \n" ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "df", - "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Antworten\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 37,\n \"samples\": [\n \"Antworten\\n12\",\n \"Antworten\\n18\",\n \"Antworten\\n4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Zugriffe\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Zugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Zugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } + "text/plain": [ + " Subject \\\n", + "0 ALG II Weiterbewilligung - wer soll in die Bed... \n", + "1 ALG II Antrag über EService - übermittelt Antr... \n", + "2 ALG II Antrag abgelehnt ohne Begründung - Anre... \n", + "3 Umzugskosten als Pauschale und Ersteinrichtung... \n", + "4 Falscher Bescheid ALG II\\n4\\n\\n\\n\\n\\n\\n\\n\\n\\n\\... \n", + "... ... \n", + "2825 Aufhebungsvertrag in der Ausbildung.\\n4\\n\\n\\n\\... \n", + "2826 Voraussetzungen für Erstausstattung\\n9\\n\\n\\n\\n... \n", + "2827 Bedarfsgemeinschaft wegen Zuwachs mit Partner ... \n", + "2828 Ein Monat kein Kita Beitrag trotz ALG II\\n3\\n\\... \n", + "2829 Überleitung vom Jugendamt zum Jobcenter?\\n16\\n... \n", + "\n", + " Stats \\\n", + "0 Antworten\\n3\\n\\n\\nZugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t... \n", + "1 Antworten\\n2\\n\\n\\nZugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t... \n", + "2 Antworten\\n30\\n\\n\\nZugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t... \n", + "3 Antworten\\n7\\n\\n\\nZugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t... \n", + "4 Antworten\\n4\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n", + "... ... \n", + "2825 Antworten\\n4\\n\\n\\nZugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t... \n", + "2826 Antworten\\n9\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n", + "2827 Antworten\\n4\\n\\n\\nZugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t... \n", + "2828 Antworten\\n3\\n\\n\\nZugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t... \n", + "2829 Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t... \n", + "\n", + " LastPost User_LastPost \\\n", + "0 jsn73 \\n3. Oktober 2017 um 19:18 jsn73 \n", + "1 Corinna \\n2. Oktober 2017 um 17:46 Corinna \n", + "2 Celen \\n1. Oktober 2017 um 23:22 Celen \n", + "3 Casa \\n1. Oktober 2017 um 11:54 Casa \n", + "4 Casa \\n30. September 2017 um 18:46 Casa \n", + "... ... ... \n", + "2825 Bass386 \\n21. Februar 2017 um 13:28 Bass386 \n", + "2826 Hoppel \\n21. Februar 2017 um 04:27 Hoppel \n", + "2827 Hoppel \\n20. Februar 2017 um 04:30 Hoppel \n", + "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 Phillip.1977 \n", + "2829 Nza Kpl \\n19. Februar 2017 um 05:20 Nza Kpl \n", + "\n", + " Date_LastPost Antworten \\\n", + "0 3. Oktober 2017 um 19:18 Antworten\\n3 \n", + "1 2. Oktober 2017 um 17:46 Antworten\\n2 \n", + "2 1. Oktober 2017 um 23:22 Antworten\\n30 \n", + "3 1. Oktober 2017 um 11:54 Antworten\\n7 \n", + "4 30. September 2017 um 18:46 Antworten\\n4 \n", + "... ... ... \n", + "2825 21. Februar 2017 um 13:28 Antworten\\n4 \n", + "2826 21. Februar 2017 um 04:27 Antworten\\n9 \n", + "2827 20. Februar 2017 um 04:30 Antworten\\n4 \n", + "2828 19. Februar 2017 um 13:07 Antworten\\n3 \n", + "2829 19. Februar 2017 um 05:20 Antworten\\n16 \n", + "\n", + " Zugriffe \n", + "0 Zugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n", + "1 Zugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t2 \n", + "2 Zugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t30 \n", + "3 Zugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t7 \n", + "4 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n", + "... ... \n", + "2825 Zugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n", + "2826 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t9 \n", + "2827 Zugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n", + "2828 Zugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n", + "2829 Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16 \n", + "\n", + "[2830 rows x 7 columns]" + ] }, + "execution_count": 189, "metadata": {}, - "execution_count": 189 + "output_type": "execute_result" } + ], + "source": [ + "# prompt: bitte teile die spalte df.Stats in zwei spalten an der stelle des ersten vorkommens von \"\\n\\n\\n\". nenne die erste neue spalte Antworten und die zweite neue Spalte Zugriffe\n", + "\n", + "# Split the 'Stats' column at the first occurrence of \"\\n\\n\\n\"\n", + "df[['Antworten', 'Zugriffe']] = df['Stats'].str.split('\\n\\n\\n', n=1, expand=True)\n", + "\n", + "# Display the updated DataFrame\n", + "df" ] }, { "cell_type": "code", + "execution_count": 190, + "metadata": { + "id": "-oyn7b69F07l" + }, + "outputs": [], "source": [ "# prompt: reinige die spalte df.Zugriffe indem nur der inhalt bis zum ersten vorkommen von \"\\n\\n\\n\" behalten wird\n", "\n", "# Clean the 'Zugriffe' column by keeping only the content up to the first occurrence of \"\\n\\n\\n\"\n", "df['Zugriffe'] = df['Zugriffe'].str.split('\\n\\n\\n', n=1).str[0]" - ], - "metadata": { - "id": "-oyn7b69F07l" - }, - "execution_count": 190, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 191, + "metadata": { + "id": "lFQ6HAHWNeGo" + }, + "outputs": [], "source": [ "# prompt: für die spalten df.Antworten und df.Zugriffe splitte jeweils an der Stelle \"\\n\" und behalte jeweils die zweiten part\n", "\n", @@ -1544,93 +1548,81 @@ "\n", "# Split the 'Zugriffe' column at the newline character '\\n' and keep the second part\n", "df['Zugriffe'] = df['Zugriffe'].str.split('\\n').str[1]" - ], - "metadata": { - "id": "lFQ6HAHWNeGo" - }, - "execution_count": 191, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 192, + "metadata": { + "id": "gjCKICJdN6W_" + }, + "outputs": [], "source": [ "# prompt: in der spalte df.Subject trenne am ersten vorkommen von \"\\n\" in zwei separate spalten\n", "\n", "# Split the 'Subject' column at the first occurrence of \"\\n\"\n", "df[['Subject_Part1', 'Subject_Part2']] = df['Subject'].str.split('\\n', n=1, expand=True)" - ], - "metadata": { - "id": "gjCKICJdN6W_" - }, - "execution_count": 192, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 193, + "metadata": { + "id": "GsAN9xwQOO4_" + }, + "outputs": [], "source": [ "# prompt: trenne die spalte df.Subject_Part2 an der stelle \"\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\" in zwei separate spalten\n", "\n", "# Split the 'Subject_Part2' column at the specified string\n", "df[['Subject_Part2_1', 'Subject_Part2_2']] = df['Subject_Part2'].str.split('\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n', n=1, expand=True)" - ], - "metadata": { - "id": "GsAN9xwQOO4_" - }, - "execution_count": 193, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 194, + "metadata": { + "id": "DM7YEzcQOO2o" + }, + "outputs": [], "source": [ "# prompt: trenne die spalte df.Subject_Part2_2 in zwei separate spalten an der stelle \"\\n\\n\\n\"\n", "\n", "# Split the 'Subject_Part2_2' column at the specified string\n", "df[['Subject_Part2_2_1', 'Subject_Part2_2_2']] = df['Subject_Part2_2'].str.split('\\n\\n\\n', n=1, expand=True)" - ], - "metadata": { - "id": "DM7YEzcQOO2o" - }, - "execution_count": 194, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 195, + "metadata": { + "id": "qcszoCCxOO0a" + }, + "outputs": [], "source": [ "# prompt: trenne die beiden spalten df.Subject_Part2_2_1\tund df.Subject_Part2_2_2 jeweils an der stelle \"\\n\" in separate spalten\n", "\n", "# Split the 'Subject_Part2_2_1' and 'Subject_Part2_2_2' columns at the newline character '\\n'\n", "df[['Subject_Part2_2_1_split', 'Subject_Part2_2_1_rest']] = df['Subject_Part2_2_1'].str.split('\\n', n=1, expand=True)\n", "df[['Subject_Part2_2_2_split', 'Subject_Part2_2_2_rest']] = df['Subject_Part2_2_2'].str.split('\\n', n=1, expand=True)" - ], - "metadata": { - "id": "qcszoCCxOO0a" - }, - "execution_count": 195, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 196, + "metadata": { + "id": "HKISyRY8OOyH" + }, + "outputs": [], "source": [ "# prompt: entferne die spalten Stats, LastPost, Subject_Part2, Subject_Part2_2, Subject_Part2_2_1, Subject und Subject_Part2_2_2 aus dataframe df\n", "\n", "df = df.drop(columns=['Stats', 'LastPost', 'Subject_Part2', 'Subject_Part2_2', 'Subject_Part2_2_1', 'Subject', 'Subject_Part2_2_2'])" - ], - "metadata": { - "id": "HKISyRY8OOyH" - }, - "execution_count": 196, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "# prompt: hat die spalte Antworten den gleichen inhalt wie die spalte Subject_Part2_1 ?\n", - "\n", - "# Check if the 'Antworten' column has the same content as the 'Subject_Part2_1' column\n", - "comparison_result = df['Antworten'].equals(df['Subject_Part2_1'])\n", - "\n", - "print(f\"Do the 'Antworten' and 'Subject_Part2_1' columns have the same content?: {comparison_result}\")" - ], + "execution_count": 197, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1638,29 +1630,27 @@ "id": "f7Lso2RQOOv3", "outputId": "ac037d1e-9590-4660-aa3f-b027bddade71" }, - "execution_count": 197, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Do the 'Antworten' and 'Subject_Part2_1' columns have the same content?: False\n" ] } + ], + "source": [ + "# prompt: hat die spalte Antworten den gleichen inhalt wie die spalte Subject_Part2_1 ?\n", + "\n", + "# Check if the 'Antworten' column has the same content as the 'Subject_Part2_1' column\n", + "comparison_result = df['Antworten'].equals(df['Subject_Part2_1'])\n", + "\n", + "print(f\"Do the 'Antworten' and 'Subject_Part2_1' columns have the same content?: {comparison_result}\")" ] }, { "cell_type": "code", - "source": [ - "# prompt: haben die spalten user_lastpost und date_lastpost die gleichen inhalte wie Subject_Part2_2_2_split und Subject_Part2_2_2_rest\n", - "\n", - "# Check if 'user_lastpost' and 'date_lastpost' have the same content as 'Subject_Part2_2_2_split' and 'Subject_Part2_2_2_rest' respectively\n", - "comparison_result_user = df['User_LastPost'].equals(df['Subject_Part2_2_2_split'])\n", - "comparison_result_date = df['Date_LastPost'].equals(df['Subject_Part2_2_2_rest'])\n", - "\n", - "print(f\"Do the 'User_LastPost' and 'Subject_Part2_2_2_split' columns have the same content?: {comparison_result_user}\")\n", - "print(f\"Do the 'Date_LastPost' and 'Subject_Part2_2_2_rest' columns have the same content?: {comparison_result_date}\")" - ], + "execution_count": 198, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1668,20 +1658,34 @@ "id": "NzEuLDBLRQFP", "outputId": "014e8ed7-9ff1-43f1-bb1d-5212710e0af4" }, - "execution_count": 198, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Do the 'User_LastPost' and 'Subject_Part2_2_2_split' columns have the same content?: False\n", "Do the 'Date_LastPost' and 'Subject_Part2_2_2_rest' columns have the same content?: False\n" ] } + ], + "source": [ + "# prompt: haben die spalten user_lastpost und date_lastpost die gleichen inhalte wie Subject_Part2_2_2_split und Subject_Part2_2_2_rest\n", + "\n", + "# Check if 'user_lastpost' and 'date_lastpost' have the same content as 'Subject_Part2_2_2_split' and 'Subject_Part2_2_2_rest' respectively\n", + "comparison_result_user = df['User_LastPost'].equals(df['Subject_Part2_2_2_split'])\n", + "comparison_result_date = df['Date_LastPost'].equals(df['Subject_Part2_2_2_rest'])\n", + "\n", + "print(f\"Do the 'User_LastPost' and 'Subject_Part2_2_2_split' columns have the same content?: {comparison_result_user}\")\n", + "print(f\"Do the 'Date_LastPost' and 'Subject_Part2_2_2_rest' columns have the same content?: {comparison_result_date}\")" ] }, { "cell_type": "code", + "execution_count": 199, + "metadata": { + "id": "uwPeVwC0RcF0" + }, + "outputs": [], "source": [ "# prompt: bitte nenne folgende spalten um im dataframe df : Subject_Part1 in \"Thema\", User_LastPost in \"Nutzer_Letzter_Post\", Date_LastPost in \"Datum_Letzter_Post\", Subject_Part2_1 in \"Antworten2\", Subject_Part2_2_1_split in \"Nutzer\", Subject_Part2_2_1_rest in \"Datum2\", Subject_Part2_2_2_split in \"Nutzer3\", Subject_Part2_2_2_rest in \"Datum3\"\n", "\n", @@ -1695,15 +1699,15 @@ " 'Subject_Part2_2_2_split': 'Nutzer3',\n", " 'Subject_Part2_2_2_rest': 'Datum3'\n", "})" - ], - "metadata": { - "id": "uwPeVwC0RcF0" - }, - "execution_count": 199, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 200, + "metadata": { + "id": "JWyNAmx5SHIG" + }, + "outputs": [], "source": [ "# prompt: bitte ändere die reihenfolge der spalten folgendermaßen: Thema, Nutzer_Letzter_Post, Datum_Letzter_Post, Antworten, Zugriffe, und dann die restlichen spalten\n", "\n", @@ -1718,15 +1722,15 @@ "\n", "# Reorder the DataFrame columns\n", "df = df[final_column_order]" - ], - "metadata": { - "id": "JWyNAmx5SHIG" - }, - "execution_count": 200, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 201, + "metadata": { + "id": "pJMHp6otU0k1" + }, + "outputs": [], "source": [ "# prompt: bitte formatiere die spalte df.Zugriffe von als ganze zahl, beachte die schreibweise mit , als tausendertrenner und k als abkürzung für tausend\n", "\n", @@ -1739,29 +1743,29 @@ "\n", "# Convert the column to integers, handling potential errors\n", "df['Zugriffe'] = pd.to_numeric(df['Zugriffe'], errors='coerce').fillna(0).astype(int)" - ], - "metadata": { - "id": "pJMHp6otU0k1" - }, - "execution_count": 201, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 202, + "metadata": { + "id": "_DRO85t8VOtJ" + }, + "outputs": [], "source": [ "# prompt: bitte ändere die spalte df.Antworten df.Antworten2 in ganze zahl\n", "\n", "# Convert 'Antworten' column to integers, handling errors\n", "df['Antworten2'] = pd.to_numeric(df['Antworten2'], errors='coerce').fillna(0).astype(int)" - ], - "metadata": { - "id": "_DRO85t8VOtJ" - }, - "execution_count": 202, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": 216, + "metadata": { + "id": "bw45sFeDY1l5" + }, + "outputs": [], "source": [ "# prompt: extrahiere zahlen mit jahresformat in der spalte df.Datum_Letzter_Post und speichere sie in der Spalte \"Jahr_Letzter_Post\" und setzt die neue spalte als datentyp ganze zahl\n", "\n", @@ -1773,18 +1777,11 @@ "\n", "# Convert the 'Jahr_Letzter_Post' column to integers\n", "df['Jahr_Letzter_Post'] = pd.to_numeric(df['Jahr_Letzter_Post'], errors='coerce').astype('Int64')" - ], - "metadata": { - "id": "bw45sFeDY1l5" - }, - "execution_count": 216, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "df.info()" - ], + "execution_count": 217, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1792,11 +1789,10 @@ "id": "mz2ENdOQVKAh", "outputId": "6d8b10df-7de8-41b0-f03d-316c95550013" }, - "execution_count": 217, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n", "RangeIndex: 2830 entries, 0 to 2829\n", @@ -1818,13 +1814,14 @@ "memory usage: 246.1+ KB\n" ] } + ], + "source": [ + "df.info()" ] }, { "cell_type": "code", - "source": [ - "df.T[0]" - ], + "execution_count": 211, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1833,24 +1830,9 @@ "id": "ymlZ9_beVtCh", "outputId": "25791ecd-c223-4cf0-fb69-eba0620fe046" }, - "execution_count": 211, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "Thema ALG II Weiterbewilligung - wer soll in die Bed...\n", - "Nutzer_Letzter_Post jsn73 \n", - "Datum_Letzter_Post 3. Oktober 2017 um 19:18\n", - "Antworten 3\n", - "Zugriffe 87000\n", - "Antworten2 3\n", - "Nutzer2 jsn73\n", - "Datum2 3. Oktober 2017 um 18:08\n", - "Nutzer3 jsn73\n", - "Datum3 3. Oktober 2017 um 19:18\n", - "Name: 0, dtype: object" - ], "text/html": [ "
    \n", "