diff --git "a/src/04_use_case/forum/buergergeld_forum.ipynb" "b/src/04_use_case/forum/buergergeld_forum.ipynb"
--- "a/src/04_use_case/forum/buergergeld_forum.ipynb"
+++ "b/src/04_use_case/forum/buergergeld_forum.ipynb"
@@ -1,19 +1,18 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": []
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- }
- },
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "title: \"Web Scraping von Foren: Bürgergeld Forum\"\n",
+ "description: \"Ein Tool zur Extraktion und Analyse von Forenbeiträgen aus dem Bürgergeld Forum, einschließlich der Verarbeitung und Visualisierung der Daten.\"\n",
+ "author: \"Benjamin\"\n",
+ "date: \"2024-12-16\"\n",
+ "date-modified: \"2024-12-16\"\n",
+ "---"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 186,
@@ -65,41 +64,7 @@
},
{
"cell_type": "code",
- "source": [
- "# prompt: bitte loope über alle html dateien im ordner buergergeld_forum und extrahiere die texte der elemente
und und speichere sie in einem pandas dataframe\n",
- "\n",
- "import pandas as pd\n",
- "import os\n",
- "from bs4 import BeautifulSoup\n",
- "\n",
- "def extract_data(directory):\n",
- " data = []\n",
- " for filename in os.listdir(directory):\n",
- " if filename.endswith(\".html\"):\n",
- " filepath = os.path.join(directory, filename)\n",
- " with open(filepath, \"r\", encoding=\"utf-8\") as file:\n",
- " html_content = file.read()\n",
- " soup = BeautifulSoup(html_content, \"html.parser\")\n",
- "\n",
- " # Find all relevant elements on the page\n",
- " for subject, stats, last_post in zip(soup.find_all(\"li\", class_=\"columnSubject\"),\n",
- " soup.find_all(\"li\", class_=\"columnStats\"),\n",
- " soup.find_all(\"li\", class_=\"columnLastPost\")):\n",
- " #Extract text from each element, handle potential missing elements gracefully\n",
- " subject_text = subject.text.strip() if subject else \"\"\n",
- " stats_text = stats.text.strip() if stats else \"\"\n",
- " last_post_text = last_post.text.strip() if last_post else \"\"\n",
- "\n",
- " data.append([subject_text, stats_text, last_post_text])\n",
- "\n",
- " return pd.DataFrame(data, columns=[\"Subject\", \"Stats\", \"LastPost\"])\n",
- "\n",
- "\n",
- "# Example usage\n",
- "directory = \"buergergeld_forum\"\n",
- "df = extract_data(directory)\n",
- "df"
- ],
+ "execution_count": 187,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -108,53 +73,14 @@
"id": "g66NfdfP_J0m",
"outputId": "badb7443-8c3b-404f-ca67-fc5b5eb8500c"
},
- "execution_count": 187,
"outputs": [
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " Subject \\\n",
- "0 ALG II Weiterbewilligung - wer soll in die Bed... \n",
- "1 ALG II Antrag über EService - übermittelt Antr... \n",
- "2 ALG II Antrag abgelehnt ohne Begründung - Anre... \n",
- "3 Umzugskosten als Pauschale und Ersteinrichtung... \n",
- "4 Falscher Bescheid ALG II\\n4\\n\\n\\n\\n\\n\\n\\n\\n\\n\\... \n",
- "... ... \n",
- "2825 Aufhebungsvertrag in der Ausbildung.\\n4\\n\\n\\n\\... \n",
- "2826 Voraussetzungen für Erstausstattung\\n9\\n\\n\\n\\n... \n",
- "2827 Bedarfsgemeinschaft wegen Zuwachs mit Partner ... \n",
- "2828 Ein Monat kein Kita Beitrag trotz ALG II\\n3\\n\\... \n",
- "2829 Überleitung vom Jugendamt zum Jobcenter?\\n16\\n... \n",
- "\n",
- " Stats \\\n",
- "0 Antworten\\n3\\n\\n\\nZugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t... \n",
- "1 Antworten\\n2\\n\\n\\nZugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t... \n",
- "2 Antworten\\n30\\n\\n\\nZugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t... \n",
- "3 Antworten\\n7\\n\\n\\nZugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t... \n",
- "4 Antworten\\n4\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n",
- "... ... \n",
- "2825 Antworten\\n4\\n\\n\\nZugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t... \n",
- "2826 Antworten\\n9\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n",
- "2827 Antworten\\n4\\n\\n\\nZugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t... \n",
- "2828 Antworten\\n3\\n\\n\\nZugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t... \n",
- "2829 Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t... \n",
- "\n",
- " LastPost \n",
- "0 jsn73 \\n3. Oktober 2017 um 19:18 \n",
- "1 Corinna \\n2. Oktober 2017 um 17:46 \n",
- "2 Celen \\n1. Oktober 2017 um 23:22 \n",
- "3 Casa \\n1. Oktober 2017 um 11:54 \n",
- "4 Casa \\n30. September 2017 um 18:46 \n",
- "... ... \n",
- "2825 Bass386 \\n21. Februar 2017 um 13:28 \n",
- "2826 Hoppel \\n21. Februar 2017 um 04:27 \n",
- "2827 Hoppel \\n20. Februar 2017 um 04:30 \n",
- "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 \n",
- "2829 Nza Kpl \\n19. Februar 2017 um 05:20 \n",
- "\n",
- "[2830 rows x 3 columns]"
- ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe",
+ "variable_name": "df"
+ },
"text/html": [
"\n",
" \n",
@@ -517,44 +443,6 @@
"
\n",
" \n"
],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "df",
- "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
- }
- },
- "metadata": {},
- "execution_count": 187
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# prompt: bitte separariere die spalte df.lastpost in nutzername last post und datum last post indem die spalte an der stelle \" \\n\" getrennt wird\n",
- "\n",
- "import pandas as pd\n",
- "\n",
- "# Assuming df is your DataFrame and it has a column named 'LastPost'\n",
- "# Split the 'LastPost' column at the newline character '\\n'\n",
- "df[['User_LastPost', 'Date_LastPost']] = df['LastPost'].str.split('\\n', n=1, expand=True)\n",
- "\n",
- "# Display the updated DataFrame\n",
- "df"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 545
- },
- "id": "Kw53dxdTCjcL",
- "outputId": "445b1058-8af5-4d65-d510-32a506125453"
- },
- "execution_count": 188,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
"text/plain": [
" Subject \\\n",
"0 ALG II Weiterbewilligung - wer soll in die Bed... \n",
@@ -582,34 +470,82 @@
"2828 Antworten\\n3\\n\\n\\nZugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t... \n",
"2829 Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t... \n",
"\n",
- " LastPost User_LastPost \\\n",
- "0 jsn73 \\n3. Oktober 2017 um 19:18 jsn73 \n",
- "1 Corinna \\n2. Oktober 2017 um 17:46 Corinna \n",
- "2 Celen \\n1. Oktober 2017 um 23:22 Celen \n",
- "3 Casa \\n1. Oktober 2017 um 11:54 Casa \n",
- "4 Casa \\n30. September 2017 um 18:46 Casa \n",
- "... ... ... \n",
- "2825 Bass386 \\n21. Februar 2017 um 13:28 Bass386 \n",
- "2826 Hoppel \\n21. Februar 2017 um 04:27 Hoppel \n",
- "2827 Hoppel \\n20. Februar 2017 um 04:30 Hoppel \n",
- "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 Phillip.1977 \n",
- "2829 Nza Kpl \\n19. Februar 2017 um 05:20 Nza Kpl \n",
- "\n",
- " Date_LastPost \n",
- "0 3. Oktober 2017 um 19:18 \n",
- "1 2. Oktober 2017 um 17:46 \n",
- "2 1. Oktober 2017 um 23:22 \n",
- "3 1. Oktober 2017 um 11:54 \n",
- "4 30. September 2017 um 18:46 \n",
- "... ... \n",
- "2825 21. Februar 2017 um 13:28 \n",
- "2826 21. Februar 2017 um 04:27 \n",
- "2827 20. Februar 2017 um 04:30 \n",
- "2828 19. Februar 2017 um 13:07 \n",
- "2829 19. Februar 2017 um 05:20 \n",
+ " LastPost \n",
+ "0 jsn73 \\n3. Oktober 2017 um 19:18 \n",
+ "1 Corinna \\n2. Oktober 2017 um 17:46 \n",
+ "2 Celen \\n1. Oktober 2017 um 23:22 \n",
+ "3 Casa \\n1. Oktober 2017 um 11:54 \n",
+ "4 Casa \\n30. September 2017 um 18:46 \n",
+ "... ... \n",
+ "2825 Bass386 \\n21. Februar 2017 um 13:28 \n",
+ "2826 Hoppel \\n21. Februar 2017 um 04:27 \n",
+ "2827 Hoppel \\n20. Februar 2017 um 04:30 \n",
+ "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 \n",
+ "2829 Nza Kpl \\n19. Februar 2017 um 05:20 \n",
"\n",
- "[2830 rows x 5 columns]"
- ],
+ "[2830 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 187,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# prompt: bitte loope über alle html dateien im ordner buergergeld_forum und extrahiere die texte der elemente li.columnSubject, li.columnStats und li.columnLastPost und speichere sie in einem pandas dataframe\n",
+ "\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "from bs4 import BeautifulSoup\n",
+ "\n",
+ "def extract_data(directory):\n",
+ " data = []\n",
+ " for filename in os.listdir(directory):\n",
+ " if filename.endswith(\".html\"):\n",
+ " filepath = os.path.join(directory, filename)\n",
+ " with open(filepath, \"r\", encoding=\"utf-8\") as file:\n",
+ " html_content = file.read()\n",
+ " soup = BeautifulSoup(html_content, \"html.parser\")\n",
+ "\n",
+ " # Find all relevant elements on the page\n",
+ " for subject, stats, last_post in zip(soup.find_all(\"li\", class_=\"columnSubject\"),\n",
+ " soup.find_all(\"li\", class_=\"columnStats\"),\n",
+ " soup.find_all(\"li\", class_=\"columnLastPost\")):\n",
+ " #Extract text from each element, handle potential missing elements gracefully\n",
+ " subject_text = subject.text.strip() if subject else \"\"\n",
+ " stats_text = stats.text.strip() if stats else \"\"\n",
+ " last_post_text = last_post.text.strip() if last_post else \"\"\n",
+ "\n",
+ " data.append([subject_text, stats_text, last_post_text])\n",
+ "\n",
+ " return pd.DataFrame(data, columns=[\"Subject\", \"Stats\", \"LastPost\"])\n",
+ "\n",
+ "\n",
+ "# Example usage\n",
+ "directory = \"buergergeld_forum\"\n",
+ "df = extract_data(directory)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 188,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 545
+ },
+ "id": "Kw53dxdTCjcL",
+ "outputId": "445b1058-8af5-4d65-d510-32a506125453"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. 
Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe",
+ "variable_name": "df"
+ },
"text/html": [
"\n",
" \n",
@@ -996,41 +932,6 @@
"
\n",
" \n"
],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "df",
- "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. 
Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
- }
- },
- "metadata": {},
- "execution_count": 188
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# prompt: bitte teile die spalte df.Stats in zwei spalten an der stelle des ersten vorkommens von \"\\n\\n\\n\". nenne die erste neue spalte Antworten und die zweite neue Spalte Zugriffe\n",
- "\n",
- "# Split the 'Stats' column at the first occurrence of \"\\n\\n\\n\"\n",
- "df[['Antworten', 'Zugriffe']] = df['Stats'].str.split('\\n\\n\\n', n=1, expand=True)\n",
- "\n",
- "# Display the updated DataFrame\n",
- "df"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "id": "OmnvftgTCiHS",
- "outputId": "28abbd8c-75ed-4f1a-94e1-9434f589cd2c"
- },
- "execution_count": 189,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
"text/plain": [
" Subject \\\n",
"0 ALG II Weiterbewilligung - wer soll in die Bed... \n",
@@ -1071,34 +972,59 @@
"2828 Phillip.1977 \\n19. Februar 2017 um 13:07 Phillip.1977 \n",
"2829 Nza Kpl \\n19. Februar 2017 um 05:20 Nza Kpl \n",
"\n",
- " Date_LastPost Antworten \\\n",
- "0 3. Oktober 2017 um 19:18 Antworten\\n3 \n",
- "1 2. Oktober 2017 um 17:46 Antworten\\n2 \n",
- "2 1. Oktober 2017 um 23:22 Antworten\\n30 \n",
- "3 1. Oktober 2017 um 11:54 Antworten\\n7 \n",
- "4 30. September 2017 um 18:46 Antworten\\n4 \n",
- "... ... ... \n",
- "2825 21. Februar 2017 um 13:28 Antworten\\n4 \n",
- "2826 21. Februar 2017 um 04:27 Antworten\\n9 \n",
- "2827 20. Februar 2017 um 04:30 Antworten\\n4 \n",
- "2828 19. Februar 2017 um 13:07 Antworten\\n3 \n",
- "2829 19. Februar 2017 um 05:20 Antworten\\n16 \n",
- "\n",
- " Zugriffe \n",
- "0 Zugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n",
- "1 Zugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t2 \n",
- "2 Zugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t30 \n",
- "3 Zugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t7 \n",
- "4 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n",
- "... ... \n",
- "2825 Zugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n",
- "2826 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t9 \n",
- "2827 Zugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n",
- "2828 Zugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n",
- "2829 Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16 \n",
+ " Date_LastPost \n",
+ "0 3. Oktober 2017 um 19:18 \n",
+ "1 2. Oktober 2017 um 17:46 \n",
+ "2 1. Oktober 2017 um 23:22 \n",
+ "3 1. Oktober 2017 um 11:54 \n",
+ "4 30. September 2017 um 18:46 \n",
+ "... ... \n",
+ "2825 21. Februar 2017 um 13:28 \n",
+ "2826 21. Februar 2017 um 04:27 \n",
+ "2827 20. Februar 2017 um 04:30 \n",
+ "2828 19. Februar 2017 um 13:07 \n",
+ "2829 19. Februar 2017 um 05:20 \n",
"\n",
- "[2830 rows x 7 columns]"
- ],
+ "[2830 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 188,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# prompt: bitte separiere die spalte df.lastpost in nutzername last post und datum last post indem die spalte an der stelle \" \\n\" getrennt wird\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Assuming df is your DataFrame and it has a column named 'LastPost'\n",
+ "# Split the 'LastPost' column at the newline character '\\n'\n",
+ "df[['User_LastPost', 'Date_LastPost']] = df['LastPost'].str.split('\\n', n=1, expand=True)\n",
+ "\n",
+ "# Display the updated DataFrame\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 189,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "OmnvftgTCiHS",
+ "outputId": "28abbd8c-75ed-4f1a-94e1-9434f589cd2c"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. 
Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Antworten\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 37,\n \"samples\": [\n \"Antworten\\n12\",\n \"Antworten\\n18\",\n \"Antworten\\n4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Zugriffe\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Zugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Zugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+ "type": "dataframe",
+ "variable_name": "df"
+ },
"text/html": [
"\n",
" \n",
@@ -1509,33 +1435,111 @@
"
\n",
" \n"
],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "df",
- "summary": "{\n \"name\": \"df\",\n \"rows\": 2830,\n \"fields\": [\n {\n \"column\": \"Subject\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2830,\n \"samples\": [\n \"Erh\\u00e4lt EU-Ausl\\u00e4nderin Unterst\\u00fctzung\\n7\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nlumo\\n4. Januar 2011 um 22:03\\n\\n\\nlumo\\n28. M\\u00e4rz 2011 um 21:43\",\n \"Zurueck nach Deutschland\\n1\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSonja28\\n6. September 2015 um 12:32\\n\\n\\nSonja28\\n6. September 2015 um 15:11\",\n \"Hilfe Mittellosigkeit\\n10\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nchorymajster\\n30. Januar 2018 um 17:40\\n\\n\\nchorymajster\\n30. Januar 2018 um 19:51\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Stats\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Antworten\\n8\\n\\n\\nZugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Antworten\\n8\\n\\n\\nZugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2690,\n \"samples\": [\n \"Occa666 \\n31. August 2015 um 12:57\",\n \"Casa \\n20. Juni 2018 um 20:32\",\n \"Hoppel \\n1. Juli 2013 um 05:18\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"User_LastPost\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 631,\n \"samples\": [\n \"chorymajster \",\n \"sophie145 \",\n \"anitram \"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Date_LastPost\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2687,\n \"samples\": [\n \"5. M\\u00e4rz 2021 um 17:22\",\n \"6. Dezember 2019 um 16:21\",\n \"6. 
Juli 2013 um 11:26\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Antworten\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 37,\n \"samples\": [\n \"Antworten\\n12\",\n \"Antworten\\n18\",\n \"Antworten\\n4\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Zugriffe\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1016,\n \"samples\": [\n \"Zugriffe\\n4,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\",\n \"Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16\",\n \"Zugriffe\\n3,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t8\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
- }
+ "text/plain": [
+ " Subject \\\n",
+ "0 ALG II Weiterbewilligung - wer soll in die Bed... \n",
+ "1 ALG II Antrag über EService - übermittelt Antr... \n",
+ "2 ALG II Antrag abgelehnt ohne Begründung - Anre... \n",
+ "3 Umzugskosten als Pauschale und Ersteinrichtung... \n",
+ "4 Falscher Bescheid ALG II\\n4\\n\\n\\n\\n\\n\\n\\n\\n\\n\\... \n",
+ "... ... \n",
+ "2825 Aufhebungsvertrag in der Ausbildung.\\n4\\n\\n\\n\\... \n",
+ "2826 Voraussetzungen für Erstausstattung\\n9\\n\\n\\n\\n... \n",
+ "2827 Bedarfsgemeinschaft wegen Zuwachs mit Partner ... \n",
+ "2828 Ein Monat kein Kita Beitrag trotz ALG II\\n3\\n\\... \n",
+ "2829 Überleitung vom Jugendamt zum Jobcenter?\\n16\\n... \n",
+ "\n",
+ " Stats \\\n",
+ "0 Antworten\\n3\\n\\n\\nZugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t... \n",
+ "1 Antworten\\n2\\n\\n\\nZugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t... \n",
+ "2 Antworten\\n30\\n\\n\\nZugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t... \n",
+ "3 Antworten\\n7\\n\\n\\nZugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t... \n",
+ "4 Antworten\\n4\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n",
+ "... ... \n",
+ "2825 Antworten\\n4\\n\\n\\nZugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t... \n",
+ "2826 Antworten\\n9\\n\\n\\nZugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\... \n",
+ "2827 Antworten\\n4\\n\\n\\nZugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t... \n",
+ "2828 Antworten\\n3\\n\\n\\nZugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t... \n",
+ "2829 Antworten\\n16\\n\\n\\nZugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t... \n",
+ "\n",
+ " LastPost User_LastPost \\\n",
+ "0 jsn73 \\n3. Oktober 2017 um 19:18 jsn73 \n",
+ "1 Corinna \\n2. Oktober 2017 um 17:46 Corinna \n",
+ "2 Celen \\n1. Oktober 2017 um 23:22 Celen \n",
+ "3 Casa \\n1. Oktober 2017 um 11:54 Casa \n",
+ "4 Casa \\n30. September 2017 um 18:46 Casa \n",
+ "... ... ... \n",
+ "2825 Bass386 \\n21. Februar 2017 um 13:28 Bass386 \n",
+ "2826 Hoppel \\n21. Februar 2017 um 04:27 Hoppel \n",
+ "2827 Hoppel \\n20. Februar 2017 um 04:30 Hoppel \n",
+ "2828 Phillip.1977 \\n19. Februar 2017 um 13:07 Phillip.1977 \n",
+ "2829 Nza Kpl \\n19. Februar 2017 um 05:20 Nza Kpl \n",
+ "\n",
+ " Date_LastPost Antworten \\\n",
+ "0 3. Oktober 2017 um 19:18 Antworten\\n3 \n",
+ "1 2. Oktober 2017 um 17:46 Antworten\\n2 \n",
+ "2 1. Oktober 2017 um 23:22 Antworten\\n30 \n",
+ "3 1. Oktober 2017 um 11:54 Antworten\\n7 \n",
+ "4 30. September 2017 um 18:46 Antworten\\n4 \n",
+ "... ... ... \n",
+ "2825 21. Februar 2017 um 13:28 Antworten\\n4 \n",
+ "2826 21. Februar 2017 um 04:27 Antworten\\n9 \n",
+ "2827 20. Februar 2017 um 04:30 Antworten\\n4 \n",
+ "2828 19. Februar 2017 um 13:07 Antworten\\n3 \n",
+ "2829 19. Februar 2017 um 05:20 Antworten\\n16 \n",
+ "\n",
+ " Zugriffe \n",
+ "0 Zugriffe\\n8,7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n",
+ "1 Zugriffe\\n6,4k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t2 \n",
+ "2 Zugriffe\\n33k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t30 \n",
+ "3 Zugriffe\\n7,6k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t7 \n",
+ "4 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n",
+ "... ... \n",
+ "2825 Zugriffe\\n7k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n",
+ "2826 Zugriffe\\n12k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t9 \n",
+ "2827 Zugriffe\\n5,8k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t4 \n",
+ "2828 Zugriffe\\n5,3k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t3 \n",
+ "2829 Zugriffe\\n18k\\n\\n\\n\\n\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t16 \n",
+ "\n",
+ "[2830 rows x 7 columns]"
+ ]
},
+ "execution_count": 189,
"metadata": {},
- "execution_count": 189
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "# prompt: bitte teile die spalte df.Stats in zwei spalten an der stelle des ersten vorkommens von \"\\n\\n\\n\". nenne die erste neue spalte Antworten und die zweite neue Spalte Zugriffe\n",
+ "\n",
+ "# Split the 'Stats' column at the first occurrence of \"\\n\\n\\n\"\n",
+ "df[['Antworten', 'Zugriffe']] = df['Stats'].str.split('\\n\\n\\n', n=1, expand=True)\n",
+ "\n",
+ "# Display the updated DataFrame\n",
+ "df"
]
},
{
"cell_type": "code",
+ "execution_count": 190,
+ "metadata": {
+ "id": "-oyn7b69F07l"
+ },
+ "outputs": [],
"source": [
"# prompt: reinige die spalte df.Zugriffe indem nur der inhalt bis zum ersten vorkommen von \"\\n\\n\\n\" behalten wird\n",
"\n",
"# Clean the 'Zugriffe' column by keeping only the content up to the first occurrence of \"\\n\\n\\n\"\n",
"df['Zugriffe'] = df['Zugriffe'].str.split('\\n\\n\\n', n=1).str[0]"
- ],
- "metadata": {
- "id": "-oyn7b69F07l"
- },
- "execution_count": 190,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 191,
+ "metadata": {
+ "id": "lFQ6HAHWNeGo"
+ },
+ "outputs": [],
"source": [
"# prompt: für die spalten df.Antworten und df.Zugriffe splitte jeweils an der Stelle \"\\n\" und behalte jeweils die zweiten part\n",
"\n",
@@ -1544,93 +1548,81 @@
"\n",
"# Split the 'Zugriffe' column at the newline character '\\n' and keep the second part\n",
"df['Zugriffe'] = df['Zugriffe'].str.split('\\n').str[1]"
- ],
- "metadata": {
- "id": "lFQ6HAHWNeGo"
- },
- "execution_count": 191,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 192,
+ "metadata": {
+ "id": "gjCKICJdN6W_"
+ },
+ "outputs": [],
"source": [
"# prompt: in der spalte df.Subject trenne am ersten vorkommen von \"\\n\" in zwei separate spalten\n",
"\n",
"# Split the 'Subject' column at the first occurrence of \"\\n\"\n",
"df[['Subject_Part1', 'Subject_Part2']] = df['Subject'].str.split('\\n', n=1, expand=True)"
- ],
- "metadata": {
- "id": "gjCKICJdN6W_"
- },
- "execution_count": 192,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 193,
+ "metadata": {
+ "id": "GsAN9xwQOO4_"
+ },
+ "outputs": [],
"source": [
"# prompt: trenne die spalte df.Subject_Part2 an der stelle \"\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\" in zwei separate spalten\n",
"\n",
"# Split the 'Subject_Part2' column at the specified string\n",
"df[['Subject_Part2_1', 'Subject_Part2_2']] = df['Subject_Part2'].str.split('\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n', n=1, expand=True)"
- ],
- "metadata": {
- "id": "GsAN9xwQOO4_"
- },
- "execution_count": 193,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 194,
+ "metadata": {
+ "id": "DM7YEzcQOO2o"
+ },
+ "outputs": [],
"source": [
"# prompt: trenne die spalte df.Subject_Part2_2 in zwei separate spalten an der stelle \"\\n\\n\\n\"\n",
"\n",
"# Split the 'Subject_Part2_2' column at the specified string\n",
"df[['Subject_Part2_2_1', 'Subject_Part2_2_2']] = df['Subject_Part2_2'].str.split('\\n\\n\\n', n=1, expand=True)"
- ],
- "metadata": {
- "id": "DM7YEzcQOO2o"
- },
- "execution_count": 194,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 195,
+ "metadata": {
+ "id": "qcszoCCxOO0a"
+ },
+ "outputs": [],
"source": [
"# prompt: trenne die beiden spalten df.Subject_Part2_2_1\tund df.Subject_Part2_2_2 jeweils an der stelle \"\\n\" in separate spalten\n",
"\n",
"# Split the 'Subject_Part2_2_1' and 'Subject_Part2_2_2' columns at the newline character '\\n'\n",
"df[['Subject_Part2_2_1_split', 'Subject_Part2_2_1_rest']] = df['Subject_Part2_2_1'].str.split('\\n', n=1, expand=True)\n",
"df[['Subject_Part2_2_2_split', 'Subject_Part2_2_2_rest']] = df['Subject_Part2_2_2'].str.split('\\n', n=1, expand=True)"
- ],
- "metadata": {
- "id": "qcszoCCxOO0a"
- },
- "execution_count": 195,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 196,
+ "metadata": {
+ "id": "HKISyRY8OOyH"
+ },
+ "outputs": [],
"source": [
"# prompt: entferne die spalten Stats, LastPost, Subject_Part2, Subject_Part2_2, Subject_Part2_2_1, Subject und Subject_Part2_2_2 aus dataframe df\n",
"\n",
"df = df.drop(columns=['Stats', 'LastPost', 'Subject_Part2', 'Subject_Part2_2', 'Subject_Part2_2_1', 'Subject', 'Subject_Part2_2_2'])"
- ],
- "metadata": {
- "id": "HKISyRY8OOyH"
- },
- "execution_count": 196,
- "outputs": []
+ ]
},
{
"cell_type": "code",
- "source": [
- "# prompt: hat die spalte Antworten den gleichen inhalt wie die spalte Subject_Part2_1 ?\n",
- "\n",
- "# Check if the 'Antworten' column has the same content as the 'Subject_Part2_1' column\n",
- "comparison_result = df['Antworten'].equals(df['Subject_Part2_1'])\n",
- "\n",
- "print(f\"Do the 'Antworten' and 'Subject_Part2_1' columns have the same content?: {comparison_result}\")"
- ],
+ "execution_count": 197,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1638,29 +1630,27 @@
"id": "f7Lso2RQOOv3",
"outputId": "ac037d1e-9590-4660-aa3f-b027bddade71"
},
- "execution_count": 197,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Do the 'Antworten' and 'Subject_Part2_1' columns have the same content?: False\n"
]
}
+ ],
+ "source": [
+ "# prompt: hat die spalte Antworten den gleichen inhalt wie die spalte Subject_Part2_1 ?\n",
+ "\n",
+ "# Check if the 'Antworten' column has the same content as the 'Subject_Part2_1' column\n",
+ "comparison_result = df['Antworten'].equals(df['Subject_Part2_1'])\n",
+ "\n",
+ "print(f\"Do the 'Antworten' and 'Subject_Part2_1' columns have the same content?: {comparison_result}\")"
]
},
{
"cell_type": "code",
- "source": [
- "# prompt: haben die spalten user_lastpost und date_lastpost die gleichen inhalte wie Subject_Part2_2_2_split und Subject_Part2_2_2_rest\n",
- "\n",
- "# Check if 'user_lastpost' and 'date_lastpost' have the same content as 'Subject_Part2_2_2_split' and 'Subject_Part2_2_2_rest' respectively\n",
- "comparison_result_user = df['User_LastPost'].equals(df['Subject_Part2_2_2_split'])\n",
- "comparison_result_date = df['Date_LastPost'].equals(df['Subject_Part2_2_2_rest'])\n",
- "\n",
- "print(f\"Do the 'User_LastPost' and 'Subject_Part2_2_2_split' columns have the same content?: {comparison_result_user}\")\n",
- "print(f\"Do the 'Date_LastPost' and 'Subject_Part2_2_2_rest' columns have the same content?: {comparison_result_date}\")"
- ],
+ "execution_count": 198,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1668,20 +1658,34 @@
"id": "NzEuLDBLRQFP",
"outputId": "014e8ed7-9ff1-43f1-bb1d-5212710e0af4"
},
- "execution_count": 198,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Do the 'User_LastPost' and 'Subject_Part2_2_2_split' columns have the same content?: False\n",
"Do the 'Date_LastPost' and 'Subject_Part2_2_2_rest' columns have the same content?: False\n"
]
}
+ ],
+ "source": [
+ "# prompt: haben die spalten user_lastpost und date_lastpost die gleichen inhalte wie Subject_Part2_2_2_split und Subject_Part2_2_2_rest\n",
+ "\n",
+ "# Check whether 'User_LastPost' and 'Date_LastPost' have the same content as 'Subject_Part2_2_2_split' and 'Subject_Part2_2_2_rest', respectively\n",
+ "comparison_result_user = df['User_LastPost'].equals(df['Subject_Part2_2_2_split'])\n",
+ "comparison_result_date = df['Date_LastPost'].equals(df['Subject_Part2_2_2_rest'])\n",
+ "\n",
+ "print(f\"Do the 'User_LastPost' and 'Subject_Part2_2_2_split' columns have the same content?: {comparison_result_user}\")\n",
+ "print(f\"Do the 'Date_LastPost' and 'Subject_Part2_2_2_rest' columns have the same content?: {comparison_result_date}\")"
]
},
{
"cell_type": "code",
+ "execution_count": 199,
+ "metadata": {
+ "id": "uwPeVwC0RcF0"
+ },
+ "outputs": [],
"source": [
"# prompt: bitte nenne folgende spalten um im dataframe df : Subject_Part1 in \"Thema\", User_LastPost in \"Nutzer_Letzter_Post\", Date_LastPost in \"Datum_Letzter_Post\", Subject_Part2_1 in \"Antworten2\", Subject_Part2_2_1_split in \"Nutzer\", Subject_Part2_2_1_rest in \"Datum2\", Subject_Part2_2_2_split in \"Nutzer3\", Subject_Part2_2_2_rest in \"Datum3\"\n",
"\n",
@@ -1695,15 +1699,15 @@
" 'Subject_Part2_2_2_split': 'Nutzer3',\n",
" 'Subject_Part2_2_2_rest': 'Datum3'\n",
"})"
- ],
- "metadata": {
- "id": "uwPeVwC0RcF0"
- },
- "execution_count": 199,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 200,
+ "metadata": {
+ "id": "JWyNAmx5SHIG"
+ },
+ "outputs": [],
"source": [
"# prompt: bitte ändere die reihenfolge der spalten folgendermaßen: Thema, Nutzer_Letzter_Post, Datum_Letzter_Post, Antworten, Zugriffe, und dann die restlichen spalten\n",
"\n",
@@ -1718,15 +1722,15 @@
"\n",
"# Reorder the DataFrame columns\n",
"df = df[final_column_order]"
- ],
- "metadata": {
- "id": "JWyNAmx5SHIG"
- },
- "execution_count": 200,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 201,
+ "metadata": {
+ "id": "pJMHp6otU0k1"
+ },
+ "outputs": [],
"source": [
"# prompt: bitte formatiere die spalte df.Zugriffe von als ganze zahl, beachte die schreibweise mit , als tausendertrenner und k als abkürzung für tausend\n",
"\n",
@@ -1739,29 +1743,29 @@
"\n",
"# Convert the column to integers, handling potential errors\n",
"df['Zugriffe'] = pd.to_numeric(df['Zugriffe'], errors='coerce').fillna(0).astype(int)"
- ],
- "metadata": {
- "id": "pJMHp6otU0k1"
- },
- "execution_count": 201,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 202,
+ "metadata": {
+ "id": "_DRO85t8VOtJ"
+ },
+ "outputs": [],
"source": [
"# prompt: bitte ändere die spalte df.Antworten df.Antworten2 in ganze zahl\n",
"\n",
"# Convert only the 'Antworten2' column to integers (unparseable values become 0); 'Antworten' is left unchanged here\n",
"df['Antworten2'] = pd.to_numeric(df['Antworten2'], errors='coerce').fillna(0).astype(int)"
- ],
- "metadata": {
- "id": "_DRO85t8VOtJ"
- },
- "execution_count": 202,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": 216,
+ "metadata": {
+ "id": "bw45sFeDY1l5"
+ },
+ "outputs": [],
"source": [
"# prompt: extrahiere zahlen mit jahresformat in der spalte df.Datum_Letzter_Post und speichere sie in der Spalte \"Jahr_Letzter_Post\" und setzt die neue spalte als datentyp ganze zahl\n",
"\n",
@@ -1773,18 +1777,11 @@
"\n",
"# Convert the 'Jahr_Letzter_Post' column to integers\n",
"df['Jahr_Letzter_Post'] = pd.to_numeric(df['Jahr_Letzter_Post'], errors='coerce').astype('Int64')"
- ],
- "metadata": {
- "id": "bw45sFeDY1l5"
- },
- "execution_count": 216,
- "outputs": []
+ ]
},
{
"cell_type": "code",
- "source": [
- "df.info()"
- ],
+ "execution_count": 217,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1792,11 +1789,10 @@
"id": "mz2ENdOQVKAh",
"outputId": "6d8b10df-7de8-41b0-f03d-316c95550013"
},
- "execution_count": 217,
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"\n",
"RangeIndex: 2830 entries, 0 to 2829\n",
@@ -1818,13 +1814,14 @@
"memory usage: 246.1+ KB\n"
]
}
+ ],
+ "source": [
+ "df.info()"
]
},
{
"cell_type": "code",
- "source": [
- "df.T[0]"
- ],
+ "execution_count": 211,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -1833,24 +1830,9 @@
"id": "ymlZ9_beVtCh",
"outputId": "25791ecd-c223-4cf0-fb69-eba0620fe046"
},
- "execution_count": 211,
"outputs": [
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- "Thema ALG II Weiterbewilligung - wer soll in die Bed...\n",
- "Nutzer_Letzter_Post jsn73 \n",
- "Datum_Letzter_Post 3. Oktober 2017 um 19:18\n",
- "Antworten 3\n",
- "Zugriffe 87000\n",
- "Antworten2 3\n",
- "Nutzer2 jsn73\n",
- "Datum2 3. Oktober 2017 um 18:08\n",
- "Nutzer3 jsn73\n",
- "Datum3 3. Oktober 2017 um 19:18\n",
- "Name: 0, dtype: object"
- ],
"text/html": [
"\n",
"