{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30673,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\n\n# Seed a dedicated generator so Restart-and-Run-All reproduces the same rows\nSEED = 42\nN_UNIFORM_ROWS = 200_000\nrng = np.random.default_rng(SEED)\n\n# Define the specifications for each column.\n# Like np.random.randint, Generator.integers excludes the upper bound.\nlevel = rng.choice([200, 300, 400], N_UNIFORM_ROWS)\ncourse_units = rng.integers(1, 4, N_UNIFORM_ROWS)  # 1..3\nattendance = rng.integers(1, 11, N_UNIFORM_ROWS)  # 1..10\nmid_semester = rng.integers(1, 21, N_UNIFORM_ROWS)  # 1..20\nassignments = rng.integers(1, 11, N_UNIFORM_ROWS)  # 1..10\nexam = rng.integers(1, 61, N_UNIFORM_ROWS)  # 1..60\n\n# Create a DataFrame with the generated data\ndata = {\n    'Level': level,\n    'Course Units': course_units,\n    'Attendance': attendance,\n    'Mid Semester': mid_semester,\n    'Assignments': assignments,\n    'Exam': exam\n}\n\ndf = pd.DataFrame(data)\n\n# Save the generated dataset to a CSV file.\n# NOTE: the bell-curve cell further down writes to this same path and replaces this file.\ndf.to_csv('generated_dataset.csv', index=False)","metadata":{"_uuid":"f1a5d48a-a556-437e-af80-a48817349c1e","_cell_guid":"1dcbfd44-e3b1-4850-bc84-2939cb202ddb","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2024-04-05T19:37:58.125621Z","iopub.execute_input":"2024-04-05T19:37:58.126183Z","iopub.status.idle":"2024-04-05T19:37:58.490623Z","shell.execute_reply.started":"2024-04-05T19:37:58.126132Z","shell.execute_reply":"2024-04-05T19:37:58.489436Z"},"trusted":true},"execution_count":32,"outputs":[]},
{"cell_type":"code","source":"%pip install -U scikit-learn==1.4.1.post1\n","metadata":{"execution":{"iopub.status.busy":"2024-04-05T19:37:58.492553Z","iopub.execute_input":"2024-04-05T19:37:58.492905Z","iopub.status.idle":"2024-04-05T19:38:17.175537Z","shell.execute_reply.started":"2024-04-05T19:37:58.492875Z","shell.execute_reply":"2024-04-05T19:38:17.173926Z"},"trusted":true},"execution_count":33,"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/pty.py:89: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n pid, fd = os.forkpty()\n","output_type":"stream"},{"name":"stdout","text":"Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (1.2.2)\nCollecting scikit-learn\n Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\nRequirement already satisfied: numpy<2.0,>=1.19.5 in /opt/conda/lib/python3.10/site-packages (from scikit-learn) (1.26.4)\nRequirement already satisfied: scipy>=1.6.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn) (1.11.4)\nRequirement already satisfied: joblib>=1.2.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn) (1.3.2)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn) (3.2.0)\nDownloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m72.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n\u001b[?25hInstalling collected packages: scikit-learn\n Attempting uninstall: scikit-learn\n Found existing installation: scikit-learn 1.2.2\n Uninstalling scikit-learn-1.2.2:\n Successfully uninstalled scikit-learn-1.2.2\n\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are 
installed. This behaviour is the source of the following dependency conflicts.\nspopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.\u001b[0m\u001b[31m\n\u001b[0mSuccessfully installed scikit-learn-1.4.1.post1\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# BELL CURVE DATA","metadata":{"_uuid":"78b449ba-833f-4748-9c86-c2a5ce5870a2","_cell_guid":"28346c8d-e121-4019-ba61-c9394c903f24","trusted":true}},
{"cell_type":"code","source":"import pandas as pd\nimport numpy as np\n\nSEED = 42  # fixed seed so the generated dataset is reproducible\nN_SAMPLES = 500_000\nrng = np.random.default_rng(SEED)\n\n\ndef _nearest_psd_covariance(cov):\n    \"\"\"Return a symmetric positive semi-definite version of cov.\n\n    A matrix that is already positive semi-definite is returned as-is (symmetrized).\n    Otherwise negative eigenvalues are clipped to zero and the result is rescaled\n    so that its diagonal (the requested variances) is preserved.\n    \"\"\"\n    cov = np.asarray(cov, dtype=float)\n    cov = (cov + cov.T) / 2\n    eigvals, eigvecs = np.linalg.eigh(cov)\n    if eigvals.min() >= 0:\n        return cov\n    repaired = (eigvecs * np.clip(eigvals, 0, None)) @ eigvecs.T\n    # Clipping inflates the variances, so scale rows and columns back to the requested diagonal\n    scale = np.sqrt(np.diag(cov) / np.diag(repaired))\n    repaired = repaired * np.outer(scale, scale)\n    return (repaired + repaired.T) / 2\n\n\n# Define the mean and standard deviation for each column\nlevel_mean = 300\nlevel_std = 50\ncourse_units_mean = 2\ncourse_units_std = 0.5\nattendance_mean = 5\nattendance_std = 2\nmid_semester_mean = 10\nmid_semester_std = 3\nassignments_mean = 5\nassignments_std = 2\nexam_mean = 30\nexam_std = 10\n\n# Requested covariance: Exam is correlated 0.9 / 0.7 / 0.5 with\n# Attendance / Mid Semester / Assignments, which are mutually uncorrelated.\n# That target is not achievable (0.9**2 + 0.7**2 + 0.5**2 = 1.55 > 1), so this\n# matrix is not positive semi-definite and must be repaired before sampling.\nrequested_cov = np.array([\n    [level_std**2, 0, 0, 0, 0, 0],\n    [0, course_units_std**2, 0, 0, 0, 0],\n    [0, 0, attendance_std**2, 0, 0, 0.9 * attendance_std * exam_std],\n    [0, 0, 0, mid_semester_std**2, 0, 0.7 * mid_semester_std * exam_std],\n    [0, 0, 0, 0, assignments_std**2, 0.5 * assignments_std * exam_std],\n    [0, 0, 0.9 * attendance_std * exam_std, 0.7 * mid_semester_std * exam_std, 0.5 * assignments_std * exam_std, exam_std**2]\n])\n\n# multivariate_normal only warns when given an invalid covariance; pass it a valid one.\n# The repair keeps every variance and lowers the Exam correlations to feasible values.\ncov_matrix = _nearest_psd_covariance(requested_cov)\n\n# Generate correlated random variables\nmeans = [level_mean, course_units_mean, attendance_mean, mid_semester_mean, assignments_mean, exam_mean]\ncorrelated_vars = rng.multivariate_normal(means, cov_matrix, N_SAMPLES)\n\n# Extract the individual variables\nlevel = np.round(correlated_vars[:, 0] / 100) * 100  # Round to the nearest 100\n# Round to the nearest integer first: a bare astype(int) truncates toward zero,\n# which pulls every column below its configured mean.\ncourse_units = np.rint(correlated_vars[:, 1]).astype(int)\nattendance = np.rint(correlated_vars[:, 2]).astype(int)\nmid_semester = np.rint(correlated_vars[:, 3]).astype(int)\nassignments = np.rint(correlated_vars[:, 4]).astype(int)\nexam = np.rint(correlated_vars[:, 5]).astype(int)\n\n# Clip the values to the desired range\nlevel = np.clip(level, 200, 400)\ncourse_units = np.clip(course_units, 1, 3)\nattendance = np.clip(attendance, 1, 10)\nmid_semester = np.clip(mid_semester, 3, 20)\nassignments = np.clip(assignments, 2, 10)\nexam = np.clip(exam, 15, 60)\n\n# Create a DataFrame with the generated data\ndata = {\n    'Level': level,\n    'Course Units': course_units,\n    'Attendance': attendance,\n    'Mid Semester': mid_semester,\n    'Assignments': assignments,\n    'Exam': exam\n}\n\ndf = pd.DataFrame(data)\n\n# Save the generated dataset to a CSV file\ndf.to_csv('generated_dataset.csv', index=False)","metadata":{"_uuid":"229b50eb-5d41-411a-9d6d-c83f24015790","_cell_guid":"ee2d6ae8-a49e-415a-817d-044d97438b25","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2024-04-05T19:38:17.181102Z","iopub.execute_input":"2024-04-05T19:38:17.181543Z","iopub.status.idle":"2024-04-05T19:38:18.456526Z","shell.execute_reply.started":"2024-04-05T19:38:17.181508Z","shell.execute_reply":"2024-04-05T19:38:18.455090Z"},"trusted":true},"execution_count":34,"outputs":[{"name":"stderr","text":"/tmp/ipykernel_33/3474468994.py:29: RuntimeWarning: covariance is not symmetric positive-semidefinite.\n correlated_vars = np.random.multivariate_normal(\n","output_type":"stream"}]},
{"cell_type":"markdown","source":"# CREATING CORRELATION","metadata":{"_uuid":"bb22047a-f98b-48af-9e08-7ad05f5a8e75","_cell_guid":"6e35a962-3c25-4989-aaff-f90f1361067c","trusted":true}},{"cell_type":"code","source":"# import pandas as pd\n# import numpy as np\n\n# # Define the mean and standard deviation for each column\n# level_mean = 300\n# level_std = 50\n# course_units_mean = 2\n# course_units_std = 0.5\n# attendance_mean = 5\n# attendance_std = 2\n# mid_semester_mean = 10\n# mid_semester_std = 3\n# assignments_mean = 5\n# assignments_std = 2\n# exam_mean = 30\n# exam_std = 10\n\n# # Define the covariance matrix\n# cov_matrix = np.array([\n# [level_std**2, 0, 0, 0, 0, 0],\n# [0, course_units_std**2, 0, 0, 0, -0.3 * course_units_std * exam_std],\n# [0, 0, attendance_std**2, 0, 0.5 * 
attendance_std * exam_std, 0.5 * attendance_std * exam_std],\n# [0, 0, 0, mid_semester_std**2, 0.4 * mid_semester_std * exam_std, 0.4 * mid_semester_std * exam_std],\n# [0, 0, 0.5 * attendance_std * exam_std, 0.4 * mid_semester_std * exam_std, assignments_std**2, 0.7 * assignments_std * exam_std],\n# [0, -0.3 * course_units_std * exam_std, 0.5 * attendance_std * exam_std, 0.4 * mid_semester_std * exam_std, 0.7 * assignments_std * exam_std, exam_std**2]\n# ])\n\n# # Generate correlated random variables\n# correlated_vars = np.random.multivariate_normal(\n# [level_mean, course_units_mean, attendance_mean, mid_semester_mean, assignments_mean, exam_mean],\n# cov_matrix,\n# 500000\n# )\n\n# # Extract the individual variables\n# level = np.round(correlated_vars[:, 0] / 100) * 100 # Round to the nearest 100\n# course_units = correlated_vars[:, 1].astype(int)\n# attendance = correlated_vars[:, 2].astype(int)\n# mid_semester = correlated_vars[:, 3].astype(int)\n# assignments = correlated_vars[:, 4].astype(int)\n# exam = correlated_vars[:, 5].astype(int)\n\n# # Generate random gender values\n# gender = np.random.choice([0, 1], size=500000)\n\n# # Clip the values to the desired range\n# level = np.clip(level, 200, 400)\n# course_units = np.clip(course_units, 1, 3)\n# attendance = np.clip(attendance, 1, 10)\n# mid_semester = np.clip(mid_semester, 1, 20)\n# assignments = np.clip(assignments, 1, 10)\n# exam = np.clip(exam, 1, 60)\n\n# # Create a DataFrame with the generated data\n# data = {\n# 'Level': level,\n# 'Course Units': course_units,\n# 'Attendance': attendance,\n# 'Mid Semester': mid_semester,\n# 'Assignments': assignments,\n# 'Exam': exam,\n# 'Gender': gender\n# }\n\n# df = pd.DataFrame(data)\n\n# # Save the generated dataset to a CSV file\n# df.to_csv('generated_dataset.csv', 
index=False)","metadata":{"_uuid":"e712bed9-3e6d-4fb5-b64e-e320764bec7c","_cell_guid":"6aa0b6e5-3083-404e-899e-edff60a825a8","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2024-04-05T19:38:18.460527Z","iopub.execute_input":"2024-04-05T19:38:18.461088Z","iopub.status.idle":"2024-04-05T19:38:18.468983Z","shell.execute_reply.started":"2024-04-05T19:38:18.461037Z","shell.execute_reply":"2024-04-05T19:38:18.467583Z"},"trusted":true},"execution_count":35,"outputs":[]},
{"cell_type":"code","source":"# Recompute Level from the raw samples: round to the nearest 100, then apply the same\n# 200-400 clip the bell-curve cell uses, so this array stays consistent with df['Level'].\nlevel = np.clip(np.round(correlated_vars[:, 0] / 100) * 100, 200, 400)","metadata":{"_uuid":"5229e9e5-4be2-4237-a6d2-15b72667cd51","_cell_guid":"40206c97-66aa-4de6-8ea0-ab195b86ded9","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2024-04-05T19:38:18.470698Z","iopub.execute_input":"2024-04-05T19:38:18.471450Z","iopub.status.idle":"2024-04-05T19:38:18.488706Z","shell.execute_reply.started":"2024-04-05T19:38:18.471413Z","shell.execute_reply":"2024-04-05T19:38:18.487331Z"},"trusted":true},"execution_count":36,"outputs":[]},
{"cell_type":"code","source":"df","metadata":{"_uuid":"3ce1f5b5-56f8-4293-87c2-b68ef6ec43d8","_cell_guid":"50c4328d-af03-4fa8-9e9b-19564ce1a7c7","collapsed":false,"jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2024-04-05T19:38:18.490206Z","iopub.execute_input":"2024-04-05T19:38:18.490582Z","iopub.status.idle":"2024-04-05T19:38:18.507944Z","shell.execute_reply.started":"2024-04-05T19:38:18.490550Z","shell.execute_reply":"2024-04-05T19:38:18.506562Z"},"trusted":true},"execution_count":37,"outputs":[{"execution_count":37,"output_type":"execute_result","data":{"text/plain":" Level Course Units Attendance Mid Semester Assignments Exam\n0 400.0 1 8 7 9 35\n1 300.0 1 1 3 2 15\n2 300.0 1 5 9 3 21\n3 300.0 1 6 7 5 25\n4 400.0 2 1 8 4 15\n... ... ... ... ... ... 
...\n499995 200.0 2 4 10 5 28\n499996 300.0 2 7 16 8 40\n499997 300.0 2 3 11 6 34\n499998 400.0 1 4 5 3 35\n499999 300.0 2 6 14 9 42\n\n[500000 rows x 6 columns]","text/html":"
\n | Level | \nCourse Units | \nAttendance | \nMid Semester | \nAssignments | \nExam | \n
---|---|---|---|---|---|---|
0 | \n400.0 | \n1 | \n8 | \n7 | \n9 | \n35 | \n
1 | \n300.0 | \n1 | \n1 | \n3 | \n2 | \n15 | \n
2 | \n300.0 | \n1 | \n5 | \n9 | \n3 | \n21 | \n
3 | \n300.0 | \n1 | \n6 | \n7 | \n5 | \n25 | \n
4 | \n400.0 | \n2 | \n1 | \n8 | \n4 | \n15 | \n
... | \n... | \n... | \n... | \n... | \n... | \n... | \n
499995 | \n200.0 | \n2 | \n4 | \n10 | \n5 | \n28 | \n
499996 | \n300.0 | \n2 | \n7 | \n16 | \n8 | \n40 | \n
499997 | \n300.0 | \n2 | \n3 | \n11 | \n6 | \n34 | \n
499998 | \n400.0 | \n1 | \n4 | \n5 | \n3 | \n35 | \n
499999 | \n300.0 | \n2 | \n6 | \n14 | \n9 | \n42 | \n
500000 rows × 6 columns
\n\n | Level | \nCourse Units | \nAttendance | \nMid Semester | \nAssignments | \nExam | \nTotal | \n
---|---|---|---|---|---|---|---|
0 | \n400.0 | \n1 | \n8 | \n7 | \n9 | \n35 | \n59 | \n
1 | \n300.0 | \n1 | \n1 | \n3 | \n2 | \n15 | \n21 | \n
2 | \n300.0 | \n1 | \n5 | \n9 | \n3 | \n21 | \n38 | \n
3 | \n300.0 | \n1 | \n6 | \n7 | \n5 | \n25 | \n43 | \n
4 | \n400.0 | \n2 | \n1 | \n8 | \n4 | \n15 | \n28 | \n