diff --git a/fake_data/fake_med_records.ipynb b/fake_data/fake_med_records.ipynb index 0fd7c40..bb28700 100644 --- a/fake_data/fake_med_records.ipynb +++ b/fake_data/fake_med_records.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "58867898", "metadata": {}, "outputs": [ @@ -11,39 +11,33 @@ "output_type": "stream", "text": [ "Collecting pandas\n", - " Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)\n", + " Downloading pandas-2.3.3-cp310-cp310-win_amd64.whl.metadata (19 kB)\n", "Collecting numpy>=1.22.4 (from pandas)\n", - " Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)\n", + " Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)\n", "Collecting python-dateutil>=2.8.2 (from pandas)\n", " Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)\n", "Collecting pytz>=2020.1 (from pandas)\n", " Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n", "Collecting tzdata>=2022.7 (from pandas)\n", - " Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)\n", + " Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)\n", "Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)\n", " Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)\n", - "Using cached pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)\n", - "Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)\n", + "Downloading pandas-2.3.3-cp310-cp310-win_amd64.whl (11.3 MB)\n", + " ---------------------------------------- 0.0/11.3 MB ? eta -:--:--\n", + " ------------------------------ --------- 8.7/11.3 MB 48.8 MB/s eta 0:00:01\n", + " ---------------------------------------- 11.3/11.3 MB 47.4 MB/s 0:00:00\n", + "Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB)\n", + " ---------------------------------------- 0.0/12.9 MB ? eta -:--:--\n", + " ---------------------------------------- 12.9/12.9 MB 73.5 MB/s 0:00:00\n", "Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)\n", "Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)\n", "Using cached six-1.17.0-py2.py3-none-any.whl (11 kB)\n", - "Using cached tzdata-2025.3-py2.py3-none-any.whl (348 kB)\n", + "Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)\n", "Installing collected packages: pytz, tzdata, six, numpy, python-dateutil, pandas\n", "\n", " ---------------------------------------- 0/6 [pytz]\n", " ---------------------------------------- 0/6 [pytz]\n", " ---------------------------------------- 0/6 [pytz]\n", - " ---------------------------------------- 0/6 [pytz]\n", - " Attempting uninstall: tzdata\n", - " ---------------------------------------- 0/6 [pytz]\n", - " Found existing installation: tzdata 2025.3\n", - " ---------------------------------------- 0/6 [pytz]\n", - " ------ --------------------------------- 1/6 [tzdata]\n", - " Uninstalling tzdata-2025.3:\n", - " ------ --------------------------------- 1/6 [tzdata]\n", - " Successfully uninstalled tzdata-2025.3\n", - " ------ --------------------------------- 1/6 [tzdata]\n", - " ------ --------------------------------- 1/6 [tzdata]\n", " ------ --------------------------------- 1/6 [tzdata]\n", " ------ --------------------------------- 1/6 [tzdata]\n", " ------ --------------------------------- 1/6 [tzdata]\n", @@ -56,6 +50,18 @@ " ------------- -------------------------- 2/6 [six]\n", " Successfully uninstalled six-1.17.0\n", " ------------- -------------------------- 2/6 [six]\n", + " ------------- -------------------------- 2/6 [six]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------- ------------------- 3/6 [numpy]\n", " -------------------- ------------------- 3/6 [numpy]\n", " -------------------- ------------------- 3/6 [numpy]\n", " -------------------- ------------------- 3/6 [numpy]\n", @@ -91,12 +97,52 @@ " -------------------- ------------------- 3/6 [numpy]\n", " Found existing installation: python-dateutil 2.9.0.post0\n", " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------------- ------------- 4/6 [python-dateutil]\n", " Uninstalling python-dateutil-2.9.0.post0:\n", - " -------------------- ------------------- 3/6 [numpy]\n", + " -------------------------- ------------- 4/6 [python-dateutil]\n", " Successfully uninstalled python-dateutil-2.9.0.post0\n", - " -------------------- ------------------- 3/6 [numpy]\n", " -------------------------- ------------- 4/6 [python-dateutil]\n", " -------------------------- ------------- 4/6 [python-dateutil]\n", + " -------------------------- ------------- 4/6 [python-dateutil]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", + " --------------------------------- ------ 5/6 [pandas]\n", " --------------------------------- ------ 5/6 [pandas]\n", " --------------------------------- ------ 5/6 [pandas]\n", " --------------------------------- ------ 5/6 [pandas]\n", @@ -169,10 +215,12 @@ "Successfully installed numpy-2.2.6 pandas-2.3.3 python-dateutil-2.9.0.post0 pytz-2025.2 six-1.17.0 tzdata-2025.3\n", "Note: you may need to restart the kernel to use updated packages.\n", "Collecting faker\n", - " Using cached faker-39.0.0-py3-none-any.whl.metadata (16 kB)\n", + " Downloading faker-39.0.0-py3-none-any.whl.metadata (16 kB)\n", "Collecting tzdata (from faker)\n", " Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)\n", - "Using cached faker-39.0.0-py3-none-any.whl (2.0 MB)\n", + "Downloading faker-39.0.0-py3-none-any.whl (2.0 MB)\n", + " ---------------------------------------- 0.0/2.0 MB ? eta -:--:--\n", + " ---------------------------------------- 2.0/2.0 MB 15.8 MB/s 0:00:00\n", "Using cached tzdata-2025.3-py2.py3-none-any.whl (348 kB)\n", "Installing collected packages: tzdata, faker\n", "\n", @@ -186,15 +234,26 @@ "\n", " ---------------------------------------- 0/2 [tzdata]\n", " ---------------------------------------- 0/2 [tzdata]\n", - " Attempting uninstall: faker\n", " ---------------------------------------- 0/2 [tzdata]\n", - " Found existing installation: Faker 39.0.0\n", " ---------------------------------------- 0/2 [tzdata]\n", " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", - " Uninstalling Faker-39.0.0:\n", " -------------------- ------------------- 1/2 [faker]\n", - " Successfully uninstalled Faker-39.0.0\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", + " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", " -------------------- ------------------- 1/2 [faker]\n", @@ -241,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 3, "id": "d786ab3d", "metadata": {}, "outputs": [ @@ -249,7 +308,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "c:\\Users\\leaf3\\OneDrive\\Desktop\\DataLab\\Data_lab\\fake_data\n" + "c:\\Users\\leaf3\\Desktop\\Data Engineer\\datalab\\Data_Lab\\fake_data\n" ] } ], @@ -269,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "46f10552", "metadata": {}, "outputs": [ @@ -277,7 +336,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Data Exported\n" + "Saved files to:\n", + "c:\\Users\\leaf3\\Desktop\\Data Engineer\\datalab\\Data_Lab\\fake_data\\healthy_medical_records.csv\n", + "c:\\Users\\leaf3\\Desktop\\Data Engineer\\datalab\\Data_Lab\\fake_data\\mixed_medical_records.csv\n" ] } ], @@ -475,12 +536,292 @@ "for column in mixed_df.columns:\n", " mixed_df.loc[healthy_mask, column] = healthy_df.loc[healthy_mask, column]\n", "\n", - "mixed_df = generate_dataset(is_healthy=False)\n", + "mixed_df.to_csv(mixed_path, index=False)\n", "\n", "print(\"Saved files to:\")\n", "print(healthy_path)\n", "print(mixed_path)" ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0b9c67b5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Medical datasets combined and exported successfully.\n" + ] + } + ], + "source": [ + "# Load the datasets\n", + "healthy_df = pd.read_csv('healthy_medical_records.csv')\n", + "mixed_df = pd.read_csv('mixed_medical_records.csv')\n", + "\n", + "# Concatenate the two DataFrames\n", + "output_df = pd.concat([healthy_df, mixed_df], ignore_index=True)\n", + "\n", + "output_df.fillna('None', inplace=True)\n", + "\n", + "# Export the final dataset to a CSV file\n", + "output_df.to_csv('sample_medical_records.csv', index=False)\n", + "\n", + "print(\"Medical datasets combined and exported successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "928bf039", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Patient IDLast CheckupDate of BirthGenderEthnicityBlood TypeOccupationInsurance ProviderInsurance PlanMonthly Premium...StrokeCoronary Heart Disease (CHD)Blood Glucose Level (mmol/L)HDL Cholesterol (mmol/L)LDL Cholesterol (mmol/L)Triglycerides (mmol/L)Hemoglobin A1C (%)White Blood Cell Count (10^9/L)Red Blood Cell Count (10^12/L)Platelet Count (10^9/L)
0KP-6511-3073-22782024-05-281955-08-02FemalePacific IslanderB+ElectricianPugh, Tate and GreenComprehensive Care Plan722.18...NoNo5.01.42.71.45.49.14.8310.5
1RK-1845-1516-42392023-05-291950-09-27MaleAsianO+NurseGonzalez-MartinezFamily Health Coverage610.27...NoNo5.01.32.20.75.05.14.6226.1
2XA-8453-5992-51652025-09-281958-06-22MaleOtherO-EngineerKim, Medina and HawkinsHigh Deductible Health Plan652.52...NoNo4.21.32.91.34.25.84.7274.4
3LX-9887-6375-72802025-05-242005-02-21MaleOtherB+Customer Service RepresentativeRogers-BakerPremium Health Insurance290.39...NoNo5.11.62.10.74.410.74.6154.0
4WM-9463-4744-15272023-06-191994-02-23MaleNative AmericanO+StudentNaNIndividual Health Insurance257.43...NoNo4.71.42.71.24.35.34.9273.1
\n", + "

5 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " Patient ID Last Checkup Date of Birth Gender Ethnicity \\\n", + "0 KP-6511-3073-2278 2024-05-28 1955-08-02 Female Pacific Islander \n", + "1 RK-1845-1516-4239 2023-05-29 1950-09-27 Male Asian \n", + "2 XA-8453-5992-5165 2025-09-28 1958-06-22 Male Other \n", + "3 LX-9887-6375-7280 2025-05-24 2005-02-21 Male Other \n", + "4 WM-9463-4744-1527 2023-06-19 1994-02-23 Male Native American \n", + "\n", + " Blood Type Occupation Insurance Provider \\\n", + "0 B+ Electrician Pugh, Tate and Green \n", + "1 O+ Nurse Gonzalez-Martinez \n", + "2 O- Engineer Kim, Medina and Hawkins \n", + "3 B+ Customer Service Representative Rogers-Baker \n", + "4 O+ Student NaN \n", + "\n", + " Insurance Plan Monthly Premium ... Stroke \\\n", + "0 Comprehensive Care Plan 722.18 ... No \n", + "1 Family Health Coverage 610.27 ... No \n", + "2 High Deductible Health Plan 652.52 ... No \n", + "3 Premium Health Insurance 290.39 ... No \n", + "4 Individual Health Insurance 257.43 ... No \n", + "\n", + " Coronary Heart Disease (CHD) Blood Glucose Level (mmol/L) \\\n", + "0 No 5.0 \n", + "1 No 5.0 \n", + "2 No 4.2 \n", + "3 No 5.1 \n", + "4 No 4.7 \n", + "\n", + " HDL Cholesterol (mmol/L) LDL Cholesterol (mmol/L) Triglycerides (mmol/L) \\\n", + "0 1.4 2.7 1.4 \n", + "1 1.3 2.2 0.7 \n", + "2 1.3 2.9 1.3 \n", + "3 1.6 2.1 0.7 \n", + "4 1.4 2.7 1.2 \n", + "\n", + " Hemoglobin A1C (%) White Blood Cell Count (10^9/L) \\\n", + "0 5.4 9.1 \n", + "1 5.0 5.1 \n", + "2 4.2 5.8 \n", + "3 4.4 10.7 \n", + "4 4.3 5.3 \n", + "\n", + " Red Blood Cell Count (10^12/L) Platelet Count (10^9/L) \n", + "0 4.8 310.5 \n", + "1 4.6 226.1 \n", + "2 4.7 274.4 \n", + "3 4.6 154.0 \n", + "4 4.9 273.1 \n", + "\n", + "[5 rows x 29 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('sample_medical_records.csv')\n", + "\n", + "\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "899a271c", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {