diff --git a/fake_data/fake_med_records.ipynb b/fake_data/fake_med_records.ipynb index 0fd7c40..bb28700 100644 --- a/fake_data/fake_med_records.ipynb +++ b/fake_data/fake_med_records.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "58867898", "metadata": {}, "outputs": [ @@ -11,39 +11,33 @@ "output_type": "stream", "text": [ "Collecting pandas\n", - " Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)\n", + " Downloading pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)\n", "Collecting numpy>=1.22.4 (from pandas)\n", - " Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)\n", + " Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)\n", "Collecting python-dateutil>=2.8.2 (from pandas)\n", " Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)\n", "Collecting pytz>=2020.1 (from pandas)\n", " Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n", "Collecting tzdata>=2022.7 (from pandas)\n", - " Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)\n", + " Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)\n", "Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)\n", " Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)\n", - "Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)\n", - "Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)\n", + "Downloading pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)\n", + " ---------------------------------------- 0.0/11.3 MB ? eta -:--:--\n", + " ------------------------------ --------- 8.7/11.3 MB 48.8 MB/s eta 0:00:01\n", + " ---------------------------------------- 11.3/11.3 MB 47.4 MB/s 0:00:00\n", + "Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)\n", + " ---------------------------------------- 0.0/12.9 MB ? eta -:--:--\n", + " ---------------------------------------- 12.9/12.9 MB 73.5 MB/s 0:00:00\n", "Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)\n", "Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)\n", "Using cached six-1.17.0-py2.py3-none-any.whl (11 kB)\n", - "Using cached tzdata-2025.3-py2.py3-none-any.whl (348 kB)\n", + "Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)\n", "Installing collected packages: pytz, tzdata, six, numpy, python-dateutil, pandas\n", "\n", " ---------------------------------------- 0/6 [pytz]\n", " ---------------------------------------- 0/6 [pytz]\n", " ---------------------------------------- 0/6 [pytz]\n", - " ---------------------------------------- 0/6 [pytz]\n", - " Attempting uninstall: tzdata\n", - " ---------------------------------------- 0/6 [pytz]\n", - " Found existing installation: tzdata 2025.3\n", - " ---------------------------------------- 0/6 [pytz]\n", - " ------ --------------------------------- 1/6 [tzdata]\n", - " Uninstalling tzdata-2025.3:\n", - " ------ --------------------------------- 1/6 [tzdata]\n", - " Successfully uninstalled tzdata-2025.3\n", - " ------ --------------------------------- 1/6 [tzdata]\n", - " ------ --------------------------------- 1/6 [tzdata]\n", " ------ --------------------------------- 1/6 [tzdata]\n", " ------ --------------------------------- 1/6 [tzdata]\n", " ------ --------------------------------- 1/6 [tzdata]\n", @@ -56,6 +50,18 @@ " ------------- -------------------------- 2/6 [six]\n", " Successfully uninstalled six-1.17.0\n", " ------------- -------------------------- 2/6 [six]\n", + " ------------- -------------------------- 2/6 [six]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", " -------------------- ------------------- 3/6 [numpy]\n", " -------------------- ------------------- 3/6 [numpy]\n", " -------------------- ------------------- 3/6 [numpy]\n", @@ -91,12 +97,52 @@ " -------------------- ------------------- 3/6 [numpy]\n", " Found existing installation: python-dateutil 2.9.0.post0\n", " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------------- ------------- 4/6 [python-dateutil]\n", " Uninstalling python-dateutil-2.9.0.post0:\n", - " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------------- ------------- 4/6 [python-dateutil]\n", " Successfully uninstalled python-dateutil-2.9.0.post0\n", - " -------------------- ------------------- 3/6 [numpy]\n", " -------------------------- ------------- 4/6 [python-dateutil]\n", " -------------------------- ------------- 4/6 [python-dateutil]\n", + " -------------------------- ------------- 4/6 [python-dateutil]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", " --------------------------------- ------ 5/6 [pandas]\n", " --------------------------------- ------ 5/6 [pandas]\n", " --------------------------------- ------ 5/6 [pandas]\n", @@ -169,10 +215,12 @@ "Successfully installed numpy-2.2.6 pandas-2.3.3 python-dateutil-2.9.0.post0 pytz-2025.2 six-1.17.0 tzdata-2025.3\n", "Note: you may need to restart the kernel to use updated packages.\n", "Collecting faker\n", - " Using cached faker-39.0.0-py3-none-any.whl.metadata (16 kB)\n", + " Downloading faker-39.0.0-py3-none-any.whl.metadata (16 kB)\n", "Collecting tzdata (from faker)\n", " Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)\n", - "Using cached faker-39.0.0-py3-none-any.whl (2.0 MB)\n", + "Downloading faker-39.0.0-py3-none-any.whl (2.0 MB)\n", + " ---------------------------------------- 0.0/2.0 MB ? eta -:--:--\n", + " ---------------------------------------- 2.0/2.0 MB 15.8 MB/s 0:00:00\n", "Using cached tzdata-2025.3-py2.py3-none-any.whl (348 kB)\n", "Installing collected packages: tzdata, faker\n", "\n", @@ -186,15 +234,26 @@ "\n", " ---------------------------------------- 0/2 [tzdata]\n", " ---------------------------------------- 0/2 [tzdata]\n", - " Attempting uninstall: faker\n", " ---------------------------------------- 0/2 [tzdata]\n", - " Found existing installation: Faker 39.0.0\n", " ---------------------------------------- 0/2 [tzdata]\n", " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", - " Uninstalling Faker-39.0.0:\n", " -------------------- ------------------- 1/2 [faker]\n", - " Successfully uninstalled Faker-39.0.0\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", @@ -241,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "d786ab3d", "metadata": {}, "outputs": [ @@ -249,7 +308,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "c:\\Users\\leaf3\\OneDrive\\Desktop\\DataLab\\Data_lab\\fake_data\n" + "c:\\Users\\leaf3\\Desktop\\Data Engineer\\datalab\\Data_Lab\\fake_data\n" ] } ], @@ -269,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "46f10552", "metadata": {}, "outputs": [ @@ -277,7 +336,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data Exported\n" + "Saved files to:\n", + "c:\\Users\\leaf3\\Desktop\\Data Engineer\\datalab\\Data_Lab\\fake_data\\healthy_medical_records.csv\n", + "c:\\Users\\leaf3\\Desktop\\Data Engineer\\datalab\\Data_Lab\\fake_data\\mixed_medical_records.csv\n" ] } ], @@ -475,12 +536,292 @@ "for column in mixed_df.columns:\n", " mixed_df.loc[healthy_mask, column] = healthy_df.loc[healthy_mask, column]\n", "\n", - "mixed_df = generate_dataset(is_healthy=False)\n", + "mixed_df.to_csv(mixed_path, index=False)\n", "\n", "print(\"Saved files to:\")\n", "print(healthy_path)\n", "print(mixed_path)" ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0b9c67b5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Medical datasets combined and exported successfully.\n" + ] + } + ], + "source": [ + "# Load the datasets\n", + "healthy_df = pd.read_csv('healthy_medical_records.csv')\n", + "mixed_df = pd.read_csv('mixed_medical_records.csv')\n", + "\n", + "# Concatenate the two DataFrames\n", + "output_df = pd.concat([healthy_df, mixed_df], ignore_index=True)\n", + "\n", + "output_df.fillna('None', inplace=True)\n", + "\n", + "# Export the final dataset to a CSV file\n", + "output_df.to_csv('sample_medical_records.csv', index=False)\n", + "\n", + "print(\"Medical datasets combined and exported successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "928bf039", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Patient ID | \n", + "Last Checkup | \n", + "Date of Birth | \n", + "Gender | \n", + "Ethnicity | \n", + "Blood Type | \n", + "Occupation | \n", + "Insurance Provider | \n", + "Insurance Plan | \n", + "Monthly Premium | \n", + "... | \n", + "Stroke | \n", + "Coronary Heart Disease (CHD) | \n", + "Blood Glucose Level (mmol/L) | \n", + "HDL Cholesterol (mmol/L) | \n", + "LDL Cholesterol (mmol/L) | \n", + "Triglycerides (mmol/L) | \n", + "Hemoglobin A1C (%) | \n", + "White Blood Cell Count (10^9/L) | \n", + "Red Blood Cell Count (10^12/L) | \n", + "Platelet Count (10^9/L) | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "KP-6511-3073-2278 | \n", + "2024-05-28 | \n", + "1955-08-02 | \n", + "Female | \n", + "Pacific Islander | \n", + "B+ | \n", + "Electrician | \n", + "Pugh, Tate and Green | \n", + "Comprehensive Care Plan | \n", + "722.18 | \n", + "... | \n", + "No | \n", + "No | \n", + "5.0 | \n", + "1.4 | \n", + "2.7 | \n", + "1.4 | \n", + "5.4 | \n", + "9.1 | \n", + "4.8 | \n", + "310.5 | \n", + "
| 1 | \n", + "RK-1845-1516-4239 | \n", + "2023-05-29 | \n", + "1950-09-27 | \n", + "Male | \n", + "Asian | \n", + "O+ | \n", + "Nurse | \n", + "Gonzalez-Martinez | \n", + "Family Health Coverage | \n", + "610.27 | \n", + "... | \n", + "No | \n", + "No | \n", + "5.0 | \n", + "1.3 | \n", + "2.2 | \n", + "0.7 | \n", + "5.0 | \n", + "5.1 | \n", + "4.6 | \n", + "226.1 | \n", + "
| 2 | \n", + "XA-8453-5992-5165 | \n", + "2025-09-28 | \n", + "1958-06-22 | \n", + "Male | \n", + "Other | \n", + "O- | \n", + "Engineer | \n", + "Kim, Medina and Hawkins | \n", + "High Deductible Health Plan | \n", + "652.52 | \n", + "... | \n", + "No | \n", + "No | \n", + "4.2 | \n", + "1.3 | \n", + "2.9 | \n", + "1.3 | \n", + "4.2 | \n", + "5.8 | \n", + "4.7 | \n", + "274.4 | \n", + "
| 3 | \n", + "LX-9887-6375-7280 | \n", + "2025-05-24 | \n", + "2005-02-21 | \n", + "Male | \n", + "Other | \n", + "B+ | \n", + "Customer Service Representative | \n", + "Rogers-Baker | \n", + "Premium Health Insurance | \n", + "290.39 | \n", + "... | \n", + "No | \n", + "No | \n", + "5.1 | \n", + "1.6 | \n", + "2.1 | \n", + "0.7 | \n", + "4.4 | \n", + "10.7 | \n", + "4.6 | \n", + "154.0 | \n", + "
| 4 | \n", + "WM-9463-4744-1527 | \n", + "2023-06-19 | \n", + "1994-02-23 | \n", + "Male | \n", + "Native American | \n", + "O+ | \n", + "Student | \n", + "NaN | \n", + "Individual Health Insurance | \n", + "257.43 | \n", + "... | \n", + "No | \n", + "No | \n", + "4.7 | \n", + "1.4 | \n", + "2.7 | \n", + "1.2 | \n", + "4.3 | \n", + "5.3 | \n", + "4.9 | \n", + "273.1 | \n", + "
5 rows × 29 columns
\n", + "