Python: general-purpose programming language $\rightarrow$ large user base from diverse disciplines
cf. R, which specializes in statistical computing and data analytics (more domain-specific)
Rich pool of scientific libraries with strong community support
Libraries are an important characteristic of how Python works: each application pulls in the libraries it needs. For scientific computing, one can use a combination of the following packages, depending on the need.
Package | Description
---|---
**Numpy** | Numerical arrays
Scipy | User-friendly and efficient numerical routines: numerical integration, interpolation, optimization, linear algebra, and statistics
**Jupyter Notebook** | Interactive programming environment
**Matplotlib** | Plotting
**Pandas** | Data analytics; R-like data frames
Scikit-learn | Machine learning
All the packages above are included in the Anaconda distribution of Python. If you download Anaconda, it comes with all the useful scientific programming packages. Packages used in this workshop are in bold.
import IPython
print(IPython.sys_info())
{'commit_hash': '3813660de', 'commit_source': 'installation', 'default_encoding': 'utf-8', 'ipython_path': '/Users/kose/opt/anaconda3/lib/python3.9/site-packages/IPython', 'ipython_version': '7.29.0', 'os_name': 'posix', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'sys_executable': '/Users/kose/opt/anaconda3/bin/python', 'sys_platform': 'darwin', 'sys_version': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]'}
We have limited resources for each user on the cloud. Don't forget to shut down the kernels you are not using.
Pandas is a Python library for working with datasets. It supports a data frame structure like the one in R.
# load the pandas library
import pandas as pd
# we also load numpy for array computation for convenience
import numpy as np
import platform
mimic_path = "/Users/huazhou/Desktop/mimic-iv-1.0" if platform.uname().node == "RELMDOMFAC30610" else "/home/shared/1.0"
Let's load the icustays.csv.gz file as a pandas data frame. We need to specify in advance which columns hold date-time values.
icustays_df = pd.read_csv(mimic_path + "/icu/icustays.csv.gz", parse_dates=["intime", "outtime"])
print(icustays_df)
       subject_id   hadm_id   stay_id                                    first_careunit                                     last_careunit              intime             outtime       los
0        17867402  24528534  31793211                               Trauma SICU (TSICU)                               Trauma SICU (TSICU) 2154-03-03 04:11:00 2154-03-04 18:16:56  1.587454
1        14435996  28960964  31983544                               Trauma SICU (TSICU)                               Trauma SICU (TSICU) 2150-06-19 17:57:00 2150-06-22 18:33:54  3.025625
2        17609946  27385897  33183475                               Trauma SICU (TSICU)                               Trauma SICU (TSICU) 2138-02-05 18:54:00 2138-02-15 12:42:05  9.741725
3        18966770  23483021  34131444                               Trauma SICU (TSICU)                               Trauma SICU (TSICU) 2123-10-25 10:35:00 2123-10-25 18:59:47  0.350544
4        12776735  20817525  34547665                                    Neuro Stepdown                                    Neuro Stepdown 2200-07-12 00:33:00 2200-07-13 16:44:40  1.674769
...           ...       ...       ...                                               ...                                               ...                 ...                 ...       ...
76535    15368898  27299174  39990887                               Trauma SICU (TSICU)                               Trauma SICU (TSICU) 2126-06-13 01:00:00 2126-06-13 20:28:35  0.811516
76536    15721773  28911582  39991872                Medical Intensive Care Unit (MICU)                Medical Intensive Care Unit (MICU) 2177-11-08 14:09:00 2177-11-10 00:24:54  1.427708
76537    12275003  22562812  39992247      Cardiac Vascular Intensive Care Unit (CVICU)      Cardiac Vascular Intensive Care Unit (CVICU) 2182-08-15 09:37:33 2182-08-16 17:25:44  1.325127
76538    17577670  24221219  39993265  Medical/Surgical Intensive Care Unit (MICU/SICU)  Medical/Surgical Intensive Care Unit (MICU/SICU) 2154-01-03 22:50:26 2154-01-06 12:44:43  2.579363
76539    17840864  22695803  39999810                                Neuro Intermediate                                Neuro Intermediate 2115-12-01 00:37:00 2115-12-05 18:27:57  4.743715

[76540 rows x 8 columns]
You may press Shift-Tab to see the function documentation interactively, or type ?pd.read_csv in a code cell.
icustays_df.__class__
pandas.core.frame.DataFrame
The variable read in is an instance of DataFrame. Let's talk a little bit about what this means.
Python has built-in object-oriented programming (OOP) support. The OOP paradigm is based on "objects", each of which bundles data (attributes, representing the properties of the object) with code (methods).
icustays_df is an object of class DataFrame. We access an object's data with df.attribute and invoke its code with df.method(arg1, arg2, etc.).
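For instance, on a toy data frame (a minimal sketch, since the real icustays_df requires the MIMIC-IV files): attributes are accessed without parentheses, while methods are called with them.

```python
import pandas as pd

# toy stand-in for icustays_df
df = pd.DataFrame({"stay_id": [1, 2, 3], "los": [1.6, 3.0, 9.7]})

print(df.shape)    # attribute access: no parentheses -> (3, 2)
print(df.head(2))  # method call: parentheses, possibly with arguments
```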
Now read admissions.csv.gz:
admissions_df = pd.read_csv(mimic_path + "/core/admissions.csv.gz",
parse_dates = ["admittime", "dischtime", "deathtime", "edregtime", "edouttime"])
print(admissions_df)
        subject_id   hadm_id           admittime           dischtime deathtime               admission_type  ...            edouttime hospital_expire_flag
0         14679932  21038362 2139-09-26 14:16:00 2139-09-28 11:30:00       NaT                     ELECTIVE  ...                  NaT                    0
1         15585972  24941086 2123-10-07 23:56:00 2123-10-12 11:22:00       NaT                     ELECTIVE  ...                  NaT                    0
...            ...       ...                 ...                 ...       ...                          ...  ...                  ...                  ...
523738    12298845  22347500 2138-05-31 00:00:00 2138-06-04 16:50:00       NaT  SURGICAL SAME DAY ADMISSION  ...                  NaT                    0
523739    11211939  24981356 2147-08-02 15:49:00 2147-08-05 16:30:00       NaT                     EW EMER.  ...  2147-08-02 17:38:00                    0

[523740 rows x 15 columns]
And patients.csv.gz:
patients_df = pd.read_csv(mimic_path + "/core/patients.csv.gz")
print(patients_df)
        subject_id gender  anchor_age  anchor_year anchor_year_group  dod
0         10000048      F          23         2126       2008 - 2010  NaN
1         10002723      F           0         2128       2017 - 2019  NaN
2         10003939      M           0         2184       2008 - 2010  NaN
3         10004222      M           0         2161       2014 - 2016  NaN
4         10005325      F           0         2154       2011 - 2013  NaN
...            ...    ...         ...          ...               ...  ...
382273    19998203      M          29         2132       2011 - 2013  NaN
382274    19998350      M          52         2127       2011 - 2013  NaN
382275    19999068      M          63         2161       2011 - 2013  NaN
382276    19999270      M          33         2184       2014 - 2016  NaN
382277    19999298      M          20         2177       2011 - 2013  NaN

[382278 rows x 6 columns]
For chartevents_filtered_itemid.csv.gz, we learn how to read in only selected columns.
from timeit import default_timer as timer
start = timer()
chartevents_df = pd.read_csv(
mimic_path + "/icu/chartevents_filtered_itemid.csv.gz",
usecols = ["stay_id", "itemid", "charttime", "valuenum"],
dtype = {"stay_id" : np.float64, "itemid" : np.float64, "charttime" : "str", "valuenum" : np.float64},
parse_dates = ["charttime"]
)
end = timer()
print("Elapsed time: ", end - start)
Elapsed time: 10.231377217000002
print(chartevents_df)
            stay_id           charttime    itemid  valuenum
0        30600691.0 2165-04-24 05:30:00  220045.0      65.0
1        30600691.0 2165-04-24 05:38:00  223761.0      97.6
2        30600691.0 2165-04-24 06:00:00  220045.0      56.0
3        30600691.0 2165-04-24 06:09:00  220045.0      55.0
4        30600691.0 2165-04-24 07:00:00  220045.0      57.0
...             ...                 ...       ...       ...
8394026  30143796.0 2161-08-30 18:00:00  220045.0      96.0
8394027  30143796.0 2161-08-30 19:00:00  220045.0      80.0
8394028  30143796.0 2161-08-30 20:00:00  220045.0      91.0
8394029  30143796.0 2161-08-30 20:00:00  223761.0      97.5
8394030  30143796.0 2161-08-30 21:00:00  220045.0      88.0

[8394031 rows x 4 columns]
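The mechanics of usecols, dtype, and parse_dates can be checked on a small in-memory CSV (a sketch with made-up values, not the MIMIC file):

```python
import io
import pandas as pd

# toy CSV with an extra column we do not want to read
csv = io.StringIO(
    "stay_id,charttime,itemid,valuenum,extra\n"
    "1,2165-04-24 05:30:00,220045,65.0,x\n"
)

df = pd.read_csv(
    csv,
    usecols=["stay_id", "itemid", "charttime", "valuenum"],
    parse_dates=["charttime"],
)
print(df.columns.tolist())  # "extra" is skipped; column order follows the file
```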
For filtering, we can use the query method.
chartevents_df.query("stay_id == 30600691 and itemid == 220045")
stay_id | charttime | itemid | valuenum | |
---|---|---|---|---|
0 | 30600691.0 | 2165-04-24 05:30:00 | 220045.0 | 65.0 |
2 | 30600691.0 | 2165-04-24 06:00:00 | 220045.0 | 56.0 |
3 | 30600691.0 | 2165-04-24 06:09:00 | 220045.0 | 55.0 |
4 | 30600691.0 | 2165-04-24 07:00:00 | 220045.0 | 57.0 |
6 | 30600691.0 | 2165-04-24 08:00:00 | 220045.0 | 56.0 |
And for plotting, we use the matplotlib package.
import matplotlib.pyplot as plt
%matplotlib inline
(
chartevents_df.query("stay_id == 30600691 and itemid == 220045").
plot.scatter(x="charttime", y="valuenum")
)
<AxesSubplot:xlabel='charttime', ylabel='valuenum'>
One may use method chaining to linearize method calls, as above. Since the dot operator (.) is evaluated from left to right, one may "chain" another method call or attribute access right after the result of the previous one. This is a "pythonic" way of avoiding nested calls.
One limitation is that we can only chain methods or attributes of the class. Since print() is not a method of DataFrame, we cannot chain print as we did in R. One may use the pandas.DataFrame.pipe() method for such operations.
There are packages that implement a pipe by overloading another operator (e.g., the bitwise or (|) operator).
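A minimal sketch of pipe() with a toy data frame: pipe() passes the data frame as the first argument of the supplied function, so a non-method like print can participate in a chain (as the last step, since print returns None).

```python
import pandas as pd

df = pd.DataFrame({"x": [3, 1, 2]})

# print is not a DataFrame method, but pipe() can call it at the end of a chain
df.sort_values("x").pipe(print)
```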
Let's continue with the task we did in R. We aim to develop a predictive model that computes the chance of dying within 30 days of the ICU stay intime, based on the baseline features:

- first_careunit
- age at intime
- gender
- ethnicity

and the first vital measurements (heart rate and body temperature) after intime. We restrict to the first ICU stay of each unique patient.
Our strategy is:
Identify and keep the first ICU stay of each patient.
Identify and keep the first vital measurements during the first ICU stay of each patient.
Join four data frames into a single data frame.
Important data wrangling concepts: group_by, sort, slice, joins, and pivot.
icustays_df has 76,540 rows, which is reduced to 53,150 unique patients' first ICU stays.
icustays_df_1ststay = (icustays_df.sort_values(["subject_id", "intime"]).
groupby("subject_id").
head(1)) # head() is much faster than slice_head(n) in dplyr
print(icustays_df_1ststay)
       subject_id   hadm_id   stay_id                                    first_careunit  ...              intime              outtime       los
66220    10000032  29079034  39553978                Medical Intensive Care Unit (MICU)  ... 2180-07-23 14:00:00  2180-07-23 23:50:47  0.410266
57004    10000980  26913865  39765666                Medical Intensive Care Unit (MICU)  ... 2189-06-27 08:42:00  2189-06-27 20:38:27  0.497535
25744    10001217  24597018  37067082               Surgical Intensive Care Unit (SICU)  ... 2157-11-20 19:18:02  2157-11-21 22:08:00  1.118032
...           ...       ...       ...                                               ...  ...                 ...                  ...       ...
56278    19999840  21033226  38978960                               Trauma SICU (TSICU)  ... 2164-09-12 09:26:28  2164-09-17 16:35:15  5.297766
53712    19999987  23865745  36195440                               Trauma SICU (TSICU)  ... 2145-11-02 22:59:00  2145-11-04 21:29:30  1.937847

[53150 rows x 8 columns]
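As a sanity check on toy data (a sketch, not part of the workshop code): sorting and then taking head(1) per group is equivalent to drop_duplicates on the grouping key, which keeps the first row of each subject after sorting.

```python
import pandas as pd

# toy stand-in: subject 1 has two ICU stays, subject 2 has one
toy = pd.DataFrame({
    "subject_id": [1, 1, 2],
    "intime": pd.to_datetime(["2180-07-24", "2180-07-23", "2189-06-27"]),
})

first = toy.sort_values(["subject_id", "intime"]).groupby("subject_id").head(1)
alt = toy.sort_values(["subject_id", "intime"]).drop_duplicates("subject_id")
print(first.equals(alt))  # True: both keep the earliest stay per subject
```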
Key data wrangling concepts: selecting columns, left_join, right_join, group_by, sort, pivot
chartevents_df_1ststay = (
chartevents_df.
merge(
icustays_df_1ststay[["stay_id", "intime", "outtime"]],
how = "right",
on = "stay_id"). # 15738363 rows
query("charttime >= intime and charttime <= outtime"). # 15700234 rows
sort_values(["stay_id", "itemid", "charttime"]).
groupby(["stay_id", "itemid"]).
head(1). # 263332 rows
drop(["charttime", "intime", "outtime"], axis="columns"). # remove unnecessary columns
astype({"itemid": str}). # change it to string for easier renaming
pivot(index="stay_id", columns="itemid", values="valuenum").
rename(columns={"220045.0": "heart_rate",
"223761.0": "temp_f"})
)
chartevents_df_1ststay
itemid | heart_rate | temp_f |
---|---|---|
stay_id | ||
30000153.0 | 104.0 | 99.1 |
30000646.0 | 100.0 | 98.8 |
30001148.0 | 80.0 | 95.6 |
30001336.0 | 65.0 | 98.5 |
30001396.0 | 86.0 | 98.8 |
... | ... | ... |
39999286.0 | 125.0 | 98.9 |
39999384.0 | 77.0 | 98.2 |
39999552.0 | 66.0 | 98.2 |
39999562.0 | 93.0 | 98.4 |
39999810.0 | 71.0 | 98.1 |
53135 rows × 2 columns
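The pivot step above reshapes the measurements from long to wide format: one row per stay_id, one column per itemid. A minimal sketch with toy values (the labels and numbers are illustrative):

```python
import pandas as pd

long = pd.DataFrame({
    "stay_id":  [1, 1, 2, 2],
    "itemid":   ["heart_rate", "temp_f", "heart_rate", "temp_f"],
    "valuenum": [104.0, 99.1, 100.0, 98.8],
})

# index values become rows, itemid values become columns
wide = long.pivot(index="stay_id", columns="itemid", values="valuenum")
print(wide)
```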
New data wrangling concept: mutate. The pandas equivalent is assign.
mimic_icu_cohort = (
icustays_df_1ststay.
# merge dataframes
merge(admissions_df, on=["subject_id", "hadm_id"], how="left").
merge(patients_df, on=["subject_id"], how="left").
merge(chartevents_df_1ststay, on=["stay_id"], how="left").
# age_intime is the age at the ICU stay intime
assign(age_intime = lambda df:
df["anchor_age"] + df["intime"].map(lambda x : x.year) - df["anchor_year"]).
# whether the patient died within 30 days of ICU stay intime
assign(hadm_to_death = lambda df:
np.where(df["deathtime"].isna(),  # isna() handles NaT in datetime columns
np.inf,
(df["deathtime"] - df["intime"]).dt.total_seconds())).
assign(thirty_day_mort = lambda df: df["hadm_to_death"] <= 2592000)  # 30 days = 30 * 24 * 3600 seconds
)
mimic_icu_cohort
subject_id | hadm_id | stay_id | first_careunit | last_careunit | intime | outtime | los | admittime | dischtime | ... | gender | anchor_age | anchor_year | anchor_year_group | dod | heart_rate | temp_f | age_intime | hadm_to_death | thirty_day_mort | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000032 | 29079034 | 39553978 | Medical Intensive Care Unit (MICU) | Medical Intensive Care Unit (MICU) | 2180-07-23 14:00:00 | 2180-07-23 23:50:47 | 0.410266 | 2180-07-23 12:35:00 | 2180-07-25 17:55:00 | ... | F | 52 | 2180 | 2014 - 2016 | NaN | 91.0 | 98.7 | 52 | inf | False |
1 | 10000980 | 26913865 | 39765666 | Medical Intensive Care Unit (MICU) | Medical Intensive Care Unit (MICU) | 2189-06-27 08:42:00 | 2189-06-27 20:38:27 | 0.497535 | 2189-06-27 07:38:00 | 2189-07-03 03:00:00 | ... | F | 73 | 2186 | 2008 - 2010 | NaN | 77.0 | 98.0 | 76 | inf | False |
2 | 10001217 | 24597018 | 37067082 | Surgical Intensive Care Unit (SICU) | Surgical Intensive Care Unit (SICU) | 2157-11-20 19:18:02 | 2157-11-21 22:08:00 | 1.118032 | 2157-11-18 22:56:00 | 2157-11-25 18:00:00 | ... | F | 55 | 2157 | 2011 - 2013 | NaN | 86.0 | 98.5 | 55 | inf | False |
3 | 10001725 | 25563031 | 31205490 | Medical/Surgical Intensive Care Unit (MICU/SICU) | Medical/Surgical Intensive Care Unit (MICU/SICU) | 2110-04-11 15:52:22 | 2110-04-12 23:59:56 | 1.338588 | 2110-04-11 15:08:00 | 2110-04-14 15:00:00 | ... | F | 46 | 2110 | 2011 - 2013 | NaN | 55.0 | 97.7 | 46 | inf | False |
4 | 10001884 | 26184834 | 37510196 | Medical Intensive Care Unit (MICU) | Medical Intensive Care Unit (MICU) | 2131-01-11 04:20:05 | 2131-01-20 08:27:30 | 9.171817 | 2131-01-07 20:39:00 | 2131-01-20 05:15:00 | ... | F | 68 | 2122 | 2008 - 2010 | 2131-01-20 | 38.0 | 98.1 | 77 | 780895.0 | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
53145 | 19999442 | 26785317 | 32336619 | Surgical Intensive Care Unit (SICU) | Surgical Intensive Care Unit (SICU) | 2148-11-19 14:23:43 | 2148-11-26 13:12:15 | 6.950370 | 2148-11-19 10:00:00 | 2148-12-04 16:25:00 | ... | M | 41 | 2146 | 2008 - 2010 | NaN | 88.0 | 98.3 | 43 | inf | False |
53146 | 19999625 | 25304202 | 31070865 | Medical/Surgical Intensive Care Unit (MICU/SICU) | Medical/Surgical Intensive Care Unit (MICU/SICU) | 2139-10-10 19:18:00 | 2139-10-11 18:21:28 | 0.960741 | 2139-10-10 18:06:00 | 2139-10-16 03:30:00 | ... | M | 81 | 2138 | 2008 - 2010 | NaN | 96.0 | 98.9 | 82 | inf | False |
53147 | 19999828 | 25744818 | 36075953 | Medical Intensive Care Unit (MICU) | Medical Intensive Care Unit (MICU) | 2149-01-08 18:12:00 | 2149-01-10 13:11:02 | 1.790995 | 2149-01-08 16:44:00 | 2149-01-18 17:00:00 | ... | F | 46 | 2147 | 2017 - 2019 | NaN | 104.0 | 98.7 | 48 | inf | False |
53148 | 19999840 | 21033226 | 38978960 | Trauma SICU (TSICU) | Surgical Intensive Care Unit (SICU) | 2164-09-12 09:26:28 | 2164-09-17 16:35:15 | 5.297766 | 2164-09-10 13:47:00 | 2164-09-17 13:42:00 | ... | M | 58 | 2164 | 2008 - 2010 | 2164-09-17 | 100.0 | 99.3 | 58 | 447332.0 | True |
53149 | 19999987 | 23865745 | 36195440 | Trauma SICU (TSICU) | Trauma SICU (TSICU) | 2145-11-02 22:59:00 | 2145-11-04 21:29:30 | 1.937847 | 2145-11-02 21:38:00 | 2145-11-11 12:57:00 | ... | F | 57 | 2145 | 2011 - 2013 | NaN | 94.0 | 99.6 | 57 | inf | False |
53150 rows × 31 columns
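The two assign steps can be checked on toy data (a sketch; the dates are made up, with one patient who died nine days after intime and one who survived):

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "intime":    pd.to_datetime(["2131-01-11 04:20", "2180-07-23 14:00"]),
    "deathtime": pd.to_datetime(["2131-01-20 05:15", None]),  # second is NaT
})

toy = (
    toy.assign(hadm_to_death=lambda df:
               np.where(df["deathtime"].isna(),
                        np.inf,  # survivors get an infinite time to death
                        (df["deathtime"] - df["intime"]).dt.total_seconds()))
       .assign(thirty_day_mort=lambda df: df["hadm_to_death"] <= 30 * 24 * 3600)
)
print(toy["thirty_day_mort"].tolist())  # [True, False]
```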
Remember we want to model:
thirty_day_mort ~ first_careunit + age_intime + gender + ethnicity + heart_rate + temp_f
Let's start with a numerical summary of the variables of interest. For a numerical column, describe() gives the mean, standard deviation, and quartiles. For a categorical column, it gives the number of unique values, the most frequent value, and its frequency.
(
mimic_icu_cohort[["first_careunit",
"gender",
"ethnicity",
"age_intime",
"heart_rate",
"temp_f"]].
describe(include="all")
)
first_careunit | gender | ethnicity | age_intime | heart_rate | temp_f | |
---|---|---|---|---|---|---|
count | 53150 | 53150 | 53150 | 53150.000000 | 53135.000000 | 52196.000000 |
unique | 9 | 2 | 8 | NaN | NaN | NaN |
top | Medical Intensive Care Unit (MICU) | M | WHITE | NaN | NaN | NaN |
freq | 10233 | 29797 | 35668 | NaN | NaN | NaN |
mean | NaN | NaN | NaN | 64.470461 | 87.466660 | 98.034339 |
std | NaN | NaN | NaN | 17.315514 | 20.170132 | 3.345850 |
min | NaN | NaN | NaN | 18.000000 | 0.000000 | 0.000000 |
25% | NaN | NaN | NaN | 54.000000 | 74.000000 | 97.600000 |
50% | NaN | NaN | NaN | 66.000000 | 85.000000 | 98.100000 |
75% | NaN | NaN | NaN | 78.000000 | 99.000000 | 98.700000 |
max | NaN | NaN | NaN | 102.000000 | 941.000000 | 106.000000 |
Do you spot anything unusual?
To obtain counts of each value of a categorical column, we use the value_counts() method.
mimic_icu_cohort["first_careunit"].value_counts()
Medical Intensive Care Unit (MICU)                  10233
Cardiac Vascular Intensive Care Unit (CVICU)         9450
Medical/Surgical Intensive Care Unit (MICU/SICU)     8833
Surgical Intensive Care Unit (SICU)                  8249
Trauma SICU (TSICU)                                  6965
Coronary Care Unit (CCU)                             6100
Neuro Surgical Intensive Care Unit (Neuro SICU)      1381
Neuro Intermediate                                   1312
Neuro Stepdown                                        627
Name: first_careunit, dtype: int64
mimic_icu_cohort["gender"].value_counts()
M    29797
F    23353
Name: gender, dtype: int64
mimic_icu_cohort["ethnicity"].value_counts()
WHITE                            35668
UNKNOWN                           5827
BLACK/AFRICAN AMERICAN            4874
OTHER                             2537
HISPANIC/LATINO                   1827
ASIAN                             1564
UNABLE TO OBTAIN                   759
AMERICAN INDIAN/ALASKA NATIVE       94
Name: ethnicity, dtype: int64
Before we start, let's import the seaborn package, which is designed for statistical data visualization, to style the figures a little bit (they will look more like ggplot2).
import seaborn as sns
sns.set()
Bar plot of first_careunit
mimic_icu_cohort["first_careunit"].value_counts(sort=False).plot.bar()
<AxesSubplot:>
mimic_icu_cohort["age_intime"].plot.hist()
<AxesSubplot:ylabel='Frequency'>
mimic_icu_cohort["age_intime"].plot.box()
<AxesSubplot:>
Histogram and boxplot of age_intime. Exercise: make similar plots for gender and ethnicity (bar plots) and for heart_rate and temp_f (histograms and boxplots). Do you notice anything unusual about temp_f?

Tally of thirty_day_mort vs first_careunit.
We need to be a little more verbose to plot frequencies as a stacked barplot in Python.
(mimic_icu_cohort.
groupby("first_careunit")["thirty_day_mort"].value_counts().
unstack("thirty_day_mort").iloc[:, ::-1]. # reversing column order to make True come first
plot.bar(stacked=True)
)
<AxesSubplot:xlabel='first_careunit'>
(mimic_icu_cohort.
groupby("first_careunit")["thirty_day_mort"].value_counts(normalize=True).
unstack("thirty_day_mort").iloc[:, ::-1].
plot.bar(stacked=True)
)
<AxesSubplot:xlabel='first_careunit'>
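The groupby/value_counts/unstack step (minus the plotting) can be traced on toy data (a sketch with made-up units and outcomes):

```python
import pandas as pd

toy = pd.DataFrame({
    "first_careunit":  ["MICU", "MICU", "SICU", "SICU", "SICU"],
    "thirty_day_mort": [True, False, False, False, True],
})

# value_counts on a grouped column yields a MultiIndex series;
# unstack moves the inner level (thirty_day_mort) into columns
counts = (toy.groupby("first_careunit")["thirty_day_mort"]
             .value_counts()
             .unstack("thirty_day_mort"))
print(counts)
```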
Tally of thirty_day_mort vs gender.
(mimic_icu_cohort.
groupby("gender")["thirty_day_mort"].value_counts(normalize=False).
unstack("thirty_day_mort").iloc[:, ::-1].
plot.bar(stacked=True)
)
<AxesSubplot:xlabel='gender'>
(mimic_icu_cohort.
groupby("gender")["thirty_day_mort"].value_counts(normalize=True).
unstack("thirty_day_mort").iloc[:, ::-1].
plot.bar(stacked=True)
)
<AxesSubplot:xlabel='gender'>
mimic_icu_cohort.groupby("gender")["thirty_day_mort"].value_counts(normalize=True).unstack("thirty_day_mort")
thirty_day_mort | False | True |
---|---|---|
gender | ||
F | 0.895088 | 0.104912 |
M | 0.904420 | 0.095580 |
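The same row-wise proportions can also be obtained with pd.crosstab (a sketch on toy data, not the cohort itself):

```python
import pandas as pd

toy = pd.DataFrame({
    "gender":          ["F", "F", "M", "M", "M"],
    "thirty_day_mort": [True, False, False, False, True],
})

# normalize="index" gives within-gender proportions,
# matching value_counts(normalize=True) after unstack
tab = pd.crosstab(toy["gender"], toy["thirty_day_mort"], normalize="index")
print(tab)
```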
Exercise: tally thirty_day_mort vs the other predictors.

Pros:

Cons: