Measures
anxiety
combine_anxiety(df, drop_duplicates=True, merge=True)
Combines the subscales for Anxiety (0-6) and Anxiety (6-18)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
drop_duplicates |
Optional[bool] |
Option to drop duplicates keeping the newest first. Defaults to True. |
True |
merge |
Optional[bool] |
Option to merge results to the original DataFrame. Defaults to True. |
True |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame |
Source code in pondtools\measures\anxiety.py
def combine_anxiety(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Combines the subscales for Anxiety (0-6) and Anxiety (6-18)

    Args:
        df (pd.DataFrame): DataFrame
        drop_duplicates (Optional[bool], optional): Option to drop duplicates keeping the newest first. Defaults to True.
        merge (Optional[bool], optional): Option to merge results to the original DataFrame. Defaults to True.

    Returns:
        pd.DataFrame: DataFrame
    """
    combined = combine_measures(
        df=df,
        df_map=DF_ANXIETY_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="ANXIETY",
        sort_col="ANXIETY_DATE",
        drop_duplicates=drop_duplicates,
    )
    if not merge:
        return combined
    # SUBJECT is redundant once results are aligned on the original index.
    combined = combined.drop(columns=["SUBJECT"])
    return df.merge(combined, how="left", left_index=True, right_index=True)
cbcl
combine_cbcl(df, drop_duplicates=True, merge=True)
Combines the subscales for CBCL (0-6) and CBCL (6-18)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
drop_duplicates |
Optional[bool] |
Option to drop duplicates keeping the newest first. Defaults to True. |
True |
merge |
Optional[bool] |
Option to merge results to the original DataFrame. Defaults to True. |
True |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame |
Source code in pondtools\measures\cbcl.py
def combine_cbcl(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Combines the subscales for CBCL (0-6) and CBCL (6-18)

    Args:
        df (pd.DataFrame): DataFrame
        drop_duplicates (Optional[bool], optional): Option to drop duplicates keeping the newest first. Defaults to True.
        merge (Optional[bool], optional): Option to merge results to the original DataFrame. Defaults to True.

    Returns:
        pd.DataFrame: DataFrame
    """
    combined = combine_measures(
        df=df,
        df_map=DF_CBCL_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="CBCL",
        sort_col="CBCL_DATE",
        drop_duplicates=drop_duplicates,
    )
    if not merge:
        return combined
    # SUBJECT is redundant once results are aligned on the original index.
    combined = combined.drop(columns=["SUBJECT"])
    return df.merge(combined, how="left", left_index=True, right_index=True)
demographics
add_combine_cargiver_columns(df)
Adds sum, max, min and mean columns to combine CAREGIVER 1 and 2 demographic data.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame with combined caregiver columns |
Source code in pondtools\measures\demographics.py
def add_combine_cargiver_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds sum, max, min and mean columns to combine CAREGIVER 1 and 2 demographic data.

    Args:
        df (pd.DataFrame): DataFrame

    Returns:
        pd.DataFrame: DataFrame with combined caregiver columns. Returned
        unchanged (with a warning printed) when the required caregiver
        columns are missing, instead of implicitly returning None.
    """
    # Guard clause: the original fell through and returned None here,
    # violating the declared return type.
    if not set(CAREGIVER_COLS).issubset(df.columns):
        print("Missing columns required to calculate caregiver columns")
        return df
    df = df.copy()
    for prefix in CAREGIVER_PREFIXES:
        # The column pair is invariant across stats, so build it once.
        cols = [f'{prefix}_1_STD', f'{prefix}_2_STD']
        for stat, fn in COMBINATION_STATS_DICT.items():
            # NOTE(review): values > 9000 look like missing-data sentinel
            # codes — they are masked to NaN before combining (confirm).
            df_filter = df.copy().filter_columns(col=cols, threshold=9000, comparison='>', replacement=np.nan)
            df[f'{prefix}_{stat}'] = df_filter[cols].apply(func=fn, axis=1)
    return df
add_diagnoses_columns(df)
Add columns relating to co-occurring ASD, ADHD, OCD and TD Diagnoses
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame with diagnoses columns added |
Source code in pondtools\measures\demographics.py
def add_diagnoses_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add columns relating to co-occurring ASD, ADHD, OCD and TD Diagnoses

    Args:
        df (pd.DataFrame): DataFrame

    Returns:
        pd.DataFrame: DataFrame with diagnoses columns added. Returned
        unchanged (with a warning printed) when the required diagnosis
        columns are missing, instead of implicitly returning None.
    """
    # Guard clause: the original fell through and returned None here,
    # violating the declared return type.
    if not set(CLINICAL_DX_COLS + ['PRIMARY_DIAGNOSIS']).issubset(df.columns):
        print("Missing columns required to calculate diagnoses columns")
        return df
    df = df.copy()
    # A diagnosis flag is 1 when either the primary diagnosis names the
    # condition or the clinical dx column is set. Vectorized boolean ops
    # replace the original row-wise apply() calls with identical results.
    df["ASD_DIAGNOSIS"] = (
        (df["PRIMARY_DIAGNOSIS"] == "ASD") | (df["CDCASDDX"] == 1)
    ).astype(int)
    df["ADHD_DIAGNOSIS"] = (
        (df["PRIMARY_DIAGNOSIS"] == "ADHD") | (df["CDCADHDX"] == 1)
    ).astype(int)
    df["OCD_DIAGNOSIS"] = (
        (df["PRIMARY_DIAGNOSIS"] == "OCD") | (df["CDCOCDDX"] == 1)
    ).astype(int)
    df["TD_DIAGNOSIS"] = (
        df["PRIMARY_DIAGNOSIS"] == "Typically Developing"
    ).astype(int)
    # Comorbid Diagnoses: map each (ASD, ADHD, OCD) flag combination to a
    # label; (0, 0, 0) is deliberately absent and stays NaN, matching the
    # original eight query blocks.
    df["COMORBID_DIAGNOSIS"] = np.nan
    combo_labels = {
        (1, 1, 1): "ASD_ADHD_OCD",
        (1, 1, 0): "ASD_ADHD",
        (1, 0, 1): "ASD_OCD",
        (0, 1, 1): "ADHD_OCD",
        (1, 0, 0): "ASD",
        (0, 1, 0): "ADHD",
        (0, 0, 1): "OCD",
    }
    for (asd, adhd, ocd), label in combo_labels.items():
        mask = (
            (df["ASD_DIAGNOSIS"] == asd)
            & (df["ADHD_DIAGNOSIS"] == adhd)
            & (df["OCD_DIAGNOSIS"] == ocd)
        )
        df.loc[mask, "COMORBID_DIAGNOSIS"] = label
    # Typically Developing overrides any other combination (applied last,
    # as in the original).
    df.loc[df["TD_DIAGNOSIS"] == 1, "COMORBID_DIAGNOSIS"] = "Typically Developing"
    return df
geocode
add_geocode_columns(df)
Adds geocode data. Determines based on postal code what public health unit participant belongs in. Also merges census data.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame with geocode and census columns added |
Source code in pondtools\measures\geocode.py
def add_geocode_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds geocode data. Determines based on postal code what public health unit participant belongs in. Also merges census data.

    Args:
        df (pd.DataFrame): DataFrame with a POSTAL_CODE column.

    Returns:
        pd.DataFrame: DataFrame with place name, coordinates, public health
        unit and census columns added. Returned unchanged (with a warning
        printed) when POSTAL_CODE is missing, instead of implicitly
        returning None.
    """
    # Guard clause: the original fell through and returned None here,
    # violating the documented return type.
    if 'POSTAL_CODE' not in df.columns:
        print("Missing columns required to calculate geocode subdomains")
        return df
    df = df.copy()
    # Get location based data: resolve each postal code to a place name and
    # latitude/longitude.
    df[["PLACE_NAME", "LATITUDE", "LONGITUDE"]] = df["POSTAL_CODE"].apply(
        lambda x: get_geo_info(x)
    )
    # Spatially match coordinates to an Ontario public health unit.
    df["PHU_NAME_E"] = df.get_geo_match(
        geojson=ONTARIO_GEO,
        match_col="PHU_NAME_E",
        lat_col="LATITUDE",
        lng_col="LONGITUDE",
    )
    df["GEO_CODE"] = df["PHU_NAME_E"].replace(PHU_MAPPING_DICT)
    # Get census data; round-trip through reset_index/set_index preserves
    # the original index across the merge.
    df = (
        df.reset_index()
        .merge(get_census(), how="left", left_on="GEO_CODE", right_on="GEO_CODE")
        .set_index("index")
    )
    return df
get_census(filepath=WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/data/2016-Census-HealthRegion.csv'))
Reads census data and extracts the relevant columns
Parameters:
Name | Type | Description | Default |
---|---|---|---|
filepath |
str |
Filepath of census data. |
WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/data/2016-Census-HealthRegion.csv') |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame with relevant census data |
Source code in pondtools\measures\geocode.py
def get_census(filepath: str = CENSUS) -> pd.DataFrame:
    """
    Reads census data and extracts the relevant columns

    Args:
        filepath (str, optional): Filepath of census data. Defaults to the
            bundled 2016 census health-region CSV.

    Returns:
        pd.DataFrame: DataFrame with relevant census data, indexed by
        GEO_CODE with one float column per measure in CENSUS_COL_DICT.
    """
    census = pd.read_csv(filepath)
    census_col = list(CENSUS_COL_DICT.keys())
    # Keep only the measures of interest at the health-region level
    # (GEO_LEVEL 2); Ontario regions fall in the 3000-3999 GEO_CODE range.
    census_search = census.query(
        "MEASURE.isin(@census_col)& GEO_LEVEL == 2 & GEO_CODE>=3000 & GEO_CODE <4000",
        engine="python",
    )
    # Pivot to one row per region, one column per measure, renamed to the
    # friendly names from CENSUS_COL_DICT.
    census_results = census_search.pivot(
        index="GEO_CODE",
        columns="MEASURE",
        values="TOTAL_VALUE",
    ).rename(columns=CENSUS_COL_DICT)
    census_results = census_results.astype("float64")
    return census_results
iq
combine_iq(df, drop_duplicates=True, merge=True)
Combines the subscales for different IQ Measures
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
drop_duplicates |
Optional[bool] |
Option to drop duplicates keeping the newest first. Defaults to True. |
True |
merge |
Optional[bool] |
Option to merge results to the original DataFrame. Defaults to True. |
True |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame |
Source code in pondtools\measures\iq.py
def combine_iq(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Combines the subscales for different IQ Measures

    Args:
        df (pd.DataFrame): DataFrame
        drop_duplicates (Optional[bool], optional): Option to drop duplicates keeping the newest test with the fewest missing values. Defaults to True.
        merge (Optional[bool], optional): Option to merge results to the original DataFrame. Defaults to True.

    Returns:
        pd.DataFrame: DataFrame
    """
    # Deduplication is deferred (drop_duplicates=False) so we can prefer
    # the most complete record ourselves below.
    df_iq = combine_measures(
        df=df,
        df_map=DF_IQ_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="IQ",
        sort_col="IQ_DATE",
        drop_duplicates=False,
    )
    # Rank records per subject: fewest missing FULL_IQ values first, then
    # IQ_DATE descending so ties break toward the most recent test. (The
    # original sorted IQ_DATE ascending, which kept the OLDEST test despite
    # the stated intent of keeping the latest.)
    df_iq['Null_Count'] = df_iq[['FULL_IQ']].isnull().sum(axis=1)
    df_iq = df_iq.sort_values(
        ['SUBJECT', 'Null_Count', 'IQ_DATE'],
        ascending=[True, True, False],
    )
    if drop_duplicates:
        # Keeps latest instance of IQ test with least non-null values
        df_iq = df_iq.drop_duplicates(subset=['SUBJECT'], keep='first')
    # Null_Count is an internal helper column; drop it unconditionally so it
    # never leaks to callers (the original leaked it when
    # drop_duplicates=False).
    df_iq = df_iq.drop(columns=['Null_Count'])
    if merge:
        df_iq = df_iq.drop(columns=["SUBJECT"])
        return df.merge(df_iq, how="left", left_index=True, right_index=True)
    return df_iq
language
combine_language(df, drop_duplicates=True, merge=True)
Combines the subscales for Language (OWLS, PLS)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
drop_duplicates |
Optional[bool] |
Option to drop duplicates keeping the newest first. Defaults to True. |
True |
merge |
Optional[bool] |
Option to merge results to the original DataFrame. Defaults to True. |
True |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame |
Source code in pondtools\measures\language.py
def combine_language(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Combines the subscales for Language (OWLS, PLS)

    Args:
        df (pd.DataFrame): DataFrame
        drop_duplicates (Optional[bool], optional): Option to drop duplicates keeping the newest first. Defaults to True.
        merge (Optional[bool], optional): Option to merge results to the original DataFrame. Defaults to True.

    Returns:
        pd.DataFrame: DataFrame
    """
    combined = combine_measures(
        df=df,
        df_map=DF_LANGUAGE_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="LANGUAGE",
        sort_col="LANGUAGE_DATE",
        drop_duplicates=drop_duplicates,
    )
    if not merge:
        return combined
    # SUBJECT is redundant once results are aligned on the original index.
    combined = combined.drop(columns=["SUBJECT"])
    return df.merge(combined, how="left", left_index=True, right_index=True)
rbs
add_rbs_subscales(df)
Adds RBS subscales RSM(Repetitive Sensory Motor) and IS(Insistence on Sameness) from McDermott 2020
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame with RBS subscales added |
Source code in pondtools\measures\rbs.py
def add_rbs_subscales(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds RBS subscales RSM(Repetitive Sensory Motor) and IS(Insistence on Sameness) from McDermott 2020

    Args:
        df (pd.DataFrame): DataFrame

    Returns:
        pd.DataFrame: DataFrame with RBS subscales added
    """
    df = df.copy()
    # All 43 standardized RBS items must be present to score the subscales.
    if set(f"RBS{i}_STD" for i in range(1, 44, 1)).issubset(df.columns):
        df["RBS_RSM"] = df[RBS_RSM].sum(axis=1)
        df["RBS_IS"] = df[RBS_IS].sum(axis=1)
    else:
        # Fixed copy-paste bug: the original warning referred to "SCQ
        # subdomains" even though this function scores RBS subscales.
        print("Missing columns required to calculate RBS subscales")
    return df
scq
add_scq_subdomains(df)
Add subdomain columns, social, communication, rrb, and social communication subdomain columns to SCQ. The subdomains are not validated and based on mappings to the ADI-R subdomains.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
DataFrame |
required |
Returns:
Type | Description |
---|---|
DataFrame |
pd.DataFrame: DataFrame with SCQ Subdomain columns added |
Source code in pondtools\measures\scq.py
def add_scq_subdomains(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add subdomain columns, social, communication, rrb, and social communication subdomain columns to SCQ. The subdomains are not validated and based on mappings to the ADI-R subdomains.

    Args:
        df (pd.DataFrame): DataFrame

    Returns:
        pd.DataFrame: DataFrame with SCQ Subdomain columns added
    """
    df = df.copy()
    # All 40 standardized SCQ items must be present before scoring.
    required = {f"SCQ{i}_STD" for i in range(1, 41, 1)}
    if required.issubset(df.columns):
        subdomain_map = (
            ("SCQ_SOCIAL_DOMAIN", SCQ_SOCIAL_DOMAIN),
            ("SCQ_COMMUNICATION_DOMAIN", SCQ_COMMUNICATION_DOMAIN),
            ("SCQ_RRB_DOMAIN", SCQ_RRB_DOMAIN),
            ("SCQ_SC_DOMAIN", SCQ_SC_DOMAIN),
        )
        for out_col, item_cols in subdomain_map:
            df[out_col] = df[item_cols].sum(axis=1)
    else:
        print("Missing columns required to calculate SCQ subdomains")
    return df