Skip to content

Measures

anxiety

combine_anxiety(df, drop_duplicates=True, merge=True)

Combines the subscales for Anxiety (0-6) and Anxiety (6-18)

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required
drop_duplicates Optional[bool]

Option to drop duplicates keeping the newest first. Defaults to True.

True
merge Optional[bool]

Option to merge results to the original DataFrame. Defaults to True.

True

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame

Source code in pondtools\measures\anxiety.py
def combine_anxiety(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Collapse the Anxiety (0-6) and Anxiety (6-18) subscales into one set of ANXIETY columns.

    The heavy lifting is delegated to ``combine_measures`` using the
    ``DF_ANXIETY_SUBSCALES`` mapping, keyed on ``SUBJECT`` and ordered by
    ``ANXIETY_DATE``.

    Args:
        df (pd.DataFrame): DataFrame containing the anxiety subscale columns.
        drop_duplicates (Optional[bool], optional): Forwarded to ``combine_measures``;
            drops duplicates keeping the newest first. Defaults to True.
        merge (Optional[bool], optional): If truthy, left-join the combined columns
            onto ``df`` by index; otherwise return only the combined frame. Defaults to True.

    Returns:
        pd.DataFrame: ``df`` with the combined anxiety columns attached, or the
        combined frame on its own when ``merge`` is falsy.
    """
    combined = combine_measures(
        df=df,
        df_map=DF_ANXIETY_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="ANXIETY",
        sort_col="ANXIETY_DATE",
        drop_duplicates=drop_duplicates,
    )

    # Caller only wants the combined measure: hand it back untouched.
    if not merge:
        return combined

    # SUBJECT is removed before the index-on-index join
    # (presumably because df already carries it -- TODO confirm).
    combined.drop(columns=["SUBJECT"], inplace=True)
    return df.merge(combined, how="left", left_index=True, right_index=True)

cbcl

combine_cbcl(df, drop_duplicates=True, merge=True)

Combines the subscales for CBCL (0-6) and CBCL (6-18)

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required
drop_duplicates Optional[bool]

Option to drop duplicates keeping the newest first. Defaults to True.

True
merge Optional[bool]

Option to merge results to the original DataFrame. Defaults to True.

True

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame

Source code in pondtools\measures\cbcl.py
def combine_cbcl(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Merge the CBCL (0-6) and CBCL (6-18) subscales into a single set of CBCL columns.

    Args:
        df (pd.DataFrame): DataFrame containing the CBCL subscale columns.
        drop_duplicates (Optional[bool], optional): Passed through to ``combine_measures``;
            drops duplicates keeping the newest first. Defaults to True.
        merge (Optional[bool], optional): Whether to left-join the result onto ``df``
            (by index) instead of returning it on its own. Defaults to True.

    Returns:
        pd.DataFrame: The original frame with CBCL columns added when ``merge`` is
        truthy, otherwise just the combined CBCL frame.
    """
    # Settings for the shared combiner, gathered in one place.
    measure_kwargs = dict(
        df=df,
        df_map=DF_CBCL_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="CBCL",
        sort_col="CBCL_DATE",
        drop_duplicates=drop_duplicates,
    )
    cbcl_combined = combine_measures(**measure_kwargs)

    if merge:
        # Drop SUBJECT before joining on the index
        # (presumably to avoid duplicating the column already in df -- TODO confirm).
        cbcl_combined.drop(columns=["SUBJECT"], inplace=True)
        result = df.merge(cbcl_combined, how="left", left_index=True, right_index=True)
    else:
        result = cbcl_combined
    return result

demographics

add_combine_cargiver_columns(df)

Adds sum, max, min and mean columns to combine CAREGIVER 1 and 2 demographic data.

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with combined caregiver columns

Source code in pondtools\measures\demographics.py
def add_combine_cargiver_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds sum, max, min and mean columns to combine CAREGIVER 1 and 2 demographic data.

    For each prefix in ``CAREGIVER_PREFIXES`` the pair ``{prefix}_1_STD`` /
    ``{prefix}_2_STD`` is run through ``filter_columns`` and every function in
    ``COMBINATION_STATS_DICT`` is applied row-wise to that pair; the result is
    stored as ``{prefix}_{stat}``.

    Args:
        df (pd.DataFrame): DataFrame
    Returns:
        pd.DataFrame: Copy of ``df`` with the combined caregiver columns. When the
        columns listed in ``CAREGIVER_COLS`` are not all present a message is
        printed and an unmodified copy is returned (previously ``None``), which
        matches the declared return type and the other ``add_*`` helpers.
    """
    df = df.copy()
    if not set(CAREGIVER_COLS).issubset(df.columns):
        print("Missing columns required to calculate caregiver columns")
        return df

    for prefix in CAREGIVER_PREFIXES:
        cols = [f"{prefix}_1_STD", f"{prefix}_2_STD"]
        # The filtered pair does not depend on the statistic, so build it once per
        # prefix. Presumably values above 9000 are missing-data codes that
        # filter_columns swaps for NaN -- TODO confirm against filter_columns.
        caregiver_pair = df.copy().filter_columns(
            col=cols, threshold=9000, comparison=">", replacement=np.nan
        )[cols]
        for stat, fn in COMBINATION_STATS_DICT.items():
            df[f"{prefix}_{stat}"] = caregiver_pair.apply(func=fn, axis=1)

    return df

add_diagnoses_columns(df)

Add columns relating to co-occurring ASD, ADHD, OCD and TD Diagnoses

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with diagnoses columns added

Source code in pondtools\measures\demographics.py
def add_diagnoses_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add columns relating to co-occurring ASD, ADHD, OCD and TD Diagnoses

    Adds four 0/1 indicator columns (``ASD_DIAGNOSIS``, ``ADHD_DIAGNOSIS``,
    ``OCD_DIAGNOSIS``, ``TD_DIAGNOSIS``) plus ``COMORBID_DIAGNOSIS``, a label for
    the combination of ASD/ADHD/OCD flags that are set. A row counts as ASD, ADHD
    or OCD when ``PRIMARY_DIAGNOSIS`` names it or the matching clinical column
    (``CDCASDDX``, ``CDCADHDX``, ``CDCOCDDX``) equals 1. Rows whose primary
    diagnosis is "Typically Developing" are labelled as such, overriding any
    other label. Rows matching nothing keep NaN.

    Args:
        df (pd.DataFrame):  DataFrame

    Returns:
        pd.DataFrame: Copy of ``df`` with diagnoses columns added. When the
        required columns are missing a message is printed and an unmodified copy
        is returned (previously ``None``), matching the declared return type.
    """
    df = df.copy()
    if not set(CLINICAL_DX_COLS + ["PRIMARY_DIAGNOSIS"]).issubset(df.columns):
        print("Missing columns required to calculate diagnoses columns")
        return df

    def _mask(condition: pd.Series) -> np.ndarray:
        # Missing values never count as a positive diagnosis.
        return condition.fillna(False).to_numpy(dtype=bool)

    primary = df["PRIMARY_DIAGNOSIS"]
    # Vectorised comparisons replace the row-wise apply: same flags, but they
    # also work on an empty frame and avoid a Python-level loop.
    has_asd = _mask((primary == "ASD") | (df["CDCASDDX"] == 1))
    has_adhd = _mask((primary == "ADHD") | (df["CDCADHDX"] == 1))
    has_ocd = _mask((primary == "OCD") | (df["CDCOCDDX"] == 1))
    is_td = _mask(primary == "Typically Developing")

    df["ASD_DIAGNOSIS"] = has_asd.astype("int64")
    df["ADHD_DIAGNOSIS"] = has_adhd.astype("int64")
    df["OCD_DIAGNOSIS"] = has_ocd.astype("int64")
    df["TD_DIAGNOSIS"] = is_td.astype("int64")

    # Comorbid Diagnoses
    # The seven combinations are mutually exclusive, so their order is irrelevant.
    combinations = (
        ("ASD_ADHD_OCD", has_asd & has_adhd & has_ocd),
        ("ASD_ADHD", has_asd & has_adhd & ~has_ocd),
        ("ASD_OCD", has_asd & ~has_adhd & has_ocd),
        ("ADHD_OCD", ~has_asd & has_adhd & has_ocd),
        ("ASD", has_asd & ~has_adhd & ~has_ocd),
        ("ADHD", ~has_asd & has_adhd & ~has_ocd),
        ("OCD", ~has_asd & ~has_adhd & has_ocd),
    )
    # Object dtype from the start: writing strings into a float NaN column is a
    # deprecated incompatible-dtype assignment in recent pandas. Positional
    # masks also stay correct when the index holds duplicate labels.
    comorbid = np.full(len(df), np.nan, dtype=object)
    for label, mask in combinations:
        comorbid[mask] = label
    # Applied last so it wins over any clinical flag, as before.
    comorbid[is_td] = "Typically Developing"
    df["COMORBID_DIAGNOSIS"] = comorbid

    return df

geocode

add_geocode_columns(df)

Adds geocode data. Determines, from the postal code, which public health unit the participant belongs to, and merges in census data.

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with geocode and census columns added

Source code in pondtools\measures\geocode.py
def add_geocode_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds geocode data. Determines, from the postal code, which public health unit the
    participant belongs to, and merges in census data.

    Columns added: ``PLACE_NAME``, ``LATITUDE``, ``LONGITUDE`` (from
    ``get_geo_info``), ``PHU_NAME_E`` (from ``get_geo_match`` against
    ``ONTARIO_GEO``), ``GEO_CODE`` (``PHU_NAME_E`` mapped through
    ``PHU_MAPPING_DICT``) and the columns returned by ``get_census``.

    Args:
        df (pd.DataFrame): DataFrame; must contain ``POSTAL_CODE``.

    Returns:
        pd.DataFrame: Copy of ``df`` with geocode and census columns added. If
        ``POSTAL_CODE`` is missing a message is printed and an unmodified copy is
        returned (previously ``None``).
    """
    df = df.copy()
    if "POSTAL_CODE" not in df.columns:
        print("Missing columns required to calculate geocode subdomains")
        return df

    # Get location based data
    df[["PLACE_NAME", "LATITUDE", "LONGITUDE"]] = df["POSTAL_CODE"].apply(get_geo_info)

    df["PHU_NAME_E"] = df.get_geo_match(
        geojson=ONTARIO_GEO,
        match_col="PHU_NAME_E",
        lat_col="LATITUDE",
        lng_col="LONGITUDE",
    )

    df["GEO_CODE"] = df["PHU_NAME_E"].replace(PHU_MAPPING_DICT)

    # Get census data
    # merge() discards the left index, so it is parked in a column and restored
    # afterwards. The index is renamed to "index" first because reset_index()
    # only produces a column of that name for an unnamed index; a named index
    # used to make set_index("index") fail with a KeyError.
    original_name = df.index.name
    merged = (
        df.rename_axis("index")
        .reset_index()
        .merge(get_census(), how="left", left_on="GEO_CODE", right_on="GEO_CODE")
        .set_index("index")
    )
    if original_name is not None:
        merged = merged.rename_axis(original_name)
    return merged

get_census(filepath=CENSUS)

Reads census data and extracts the relevant columns

Parameters:

Name Type Description Default
filepath str

Filepath of census data.

CENSUS (the bundled file pondtools/resources/data/2016-Census-HealthRegion.csv)

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with relevant census data

Source code in pondtools\measures\geocode.py
def get_census(filepath: str = CENSUS) -> pd.DataFrame:
    """
    Reads census data and extracts the relevant columns

    Keeps the rows whose ``MEASURE`` is a key of ``CENSUS_COL_DICT``, with
    ``GEO_LEVEL == 2`` and ``3000 <= GEO_CODE < 4000``, then reshapes them to one
    row per ``GEO_CODE`` and one column per measure (renamed through
    ``CENSUS_COL_DICT``), cast to ``float64``.

    Args:
        filepath (str, optional): Filepath of census data, passed straight to
            ``pd.read_csv``. Defaults to ``CENSUS``.

    Returns:
        pd.DataFrame: DataFrame with relevant census data, indexed by ``GEO_CODE``
    """
    census = pd.read_csv(filepath)
    # Keep this local's name: the query string refers to it as ``@census_col``.
    census_col = list(CENSUS_COL_DICT.keys())
    census_search = census.query(
        "MEASURE.isin(@census_col)& GEO_LEVEL == 2 & GEO_CODE>=3000 & GEO_CODE <4000",
        engine="python",
    )

    census_results = census_search.pivot(
        index="GEO_CODE",
        columns="MEASURE",
        values="TOTAL_VALUE",
    ).rename(columns=CENSUS_COL_DICT)
    return census_results.astype("float64")

iq

combine_iq(df, drop_duplicates=True, merge=True)

Combines the subscales for different IQ Measures

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required
drop_duplicates Optional[bool]

Option to drop duplicates keeping the newest first. Defaults to True.

True
merge Optional[bool]

Option to merge results to the original DataFrame. Defaults to True.

True

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame

Source code in pondtools\measures\iq.py
def combine_iq(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Combines the subscales for different IQ Measures

    Unlike the other ``combine_*`` helpers, de-duplication is done here rather
    than inside ``combine_measures``: within each ``SUBJECT``, rows that have a
    ``FULL_IQ`` value are ranked ahead of rows that do not, and ties are broken
    by the most recent ``IQ_DATE``.

    Args:
        df (pd.DataFrame): DataFrame
        drop_duplicates (Optional[bool], optional): Option to drop duplicates keeping the newest first. Defaults to True.
        merge (Optional[bool], optional): Option to merge results to the original DataFrame. Defaults to True.

    Returns:
        pd.DataFrame: ``df`` left-joined (by index) with the combined IQ columns
        when ``merge`` is truthy, otherwise the combined IQ frame itself.
    """
    df_iq = combine_measures(
        df=df,
        df_map=DF_IQ_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="IQ",
        sort_col="IQ_DATE",
        drop_duplicates=False,
    )

    # Null_Count is a temporary sort key: 0 when FULL_IQ is present, 1 when missing.
    # IQ_DATE is sorted descending so that "first" really is the newest test;
    # the previous all-ascending sort kept the oldest one, contradicting the
    # documented behaviour. Missing dates still sort last.
    df_iq = df_iq.assign(
        Null_Count=df_iq["FULL_IQ"].isnull().astype(int)
    ).sort_values(
        ["SUBJECT", "Null_Count", "IQ_DATE"],
        ascending=[True, True, False],
    )

    if drop_duplicates:
        # Keeps the latest IQ test per subject, preferring rows with a FULL_IQ value
        df_iq = df_iq.drop_duplicates(subset=["SUBJECT"], keep="first")

    df_iq = df_iq.drop(columns=["Null_Count"])

    if merge:
        return df.merge(
            df_iq.drop(columns=["SUBJECT"]),
            how="left",
            left_index=True,
            right_index=True,
        )
    return df_iq

language

combine_language(df, drop_duplicates=True, merge=True)

Combines the subscales for Language (OWLS, PLS)

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required
drop_duplicates Optional[bool]

Option to drop duplicates keeping the newest first. Defaults to True.

True
merge Optional[bool]

Option to merge results to the original DataFrame. Defaults to True.

True

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame

Source code in pondtools\measures\language.py
def combine_language(
    df: pd.DataFrame,
    drop_duplicates: Optional[bool] = True,
    merge: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Produce one set of LANGUAGE columns from the language subscales (OWLS, PLS).

    Args:
        df (pd.DataFrame): DataFrame containing the language subscale columns.
        drop_duplicates (Optional[bool], optional): Handed to ``combine_measures``;
            drops duplicates keeping the newest first. Defaults to True.
        merge (Optional[bool], optional): Left-join the combined columns onto ``df``
            by index when truthy; return only the combined frame otherwise. Defaults to True.

    Returns:
        pd.DataFrame: Either ``df`` enriched with the language columns or the
        combined language frame alone.
    """
    language_scores = combine_measures(
        df=df,
        df_map=DF_LANGUAGE_SUBSCALES,
        index_col="SUBJECT",
        measure_prefix="LANGUAGE",
        sort_col="LANGUAGE_DATE",
        drop_duplicates=drop_duplicates,
    )

    if merge:
        # SUBJECT is only dropped on the merge path; the standalone result keeps it.
        language_scores.drop(columns=["SUBJECT"], inplace=True)

    return (
        df.merge(language_scores, how="left", left_index=True, right_index=True)
        if merge
        else language_scores
    )

rbs

add_rbs_subscales(df)

Adds RBS subscales RSM (Repetitive Sensory Motor) and IS (Insistence on Sameness) from McDermott 2020

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with RBS subscales added

Source code in pondtools\measures\rbs.py
def add_rbs_subscales(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds RBS subscales RSM (Repetitive Sensory Motor) and IS (Insistence on Sameness) from McDermott 2020

    Each subscale is the row-wise sum of the item columns listed in the
    ``RBS_RSM`` and ``RBS_IS`` constants. All 43 item columns
    ``RBS1_STD`` .. ``RBS43_STD`` must be present.

    Args:
        df (pd.DataFrame): DataFrame

    Returns:
        pd.DataFrame: Copy of ``df`` with RBS subscales added; an unmodified copy
        (after printing a message) when item columns are missing.
    """
    df = df.copy()
    required = {f"RBS{i}_STD" for i in range(1, 44)}
    if not required.issubset(df.columns):
        # The message used to say "SCQ subdomains", a copy-paste slip from the SCQ helper.
        print("Missing columns required to calculate RBS subscales")
        return df

    df["RBS_RSM"] = df[RBS_RSM].sum(axis=1)
    df["RBS_IS"] = df[RBS_IS].sum(axis=1)
    return df

scq

add_scq_subdomains(df)

Adds social, communication, RRB, and social-communication subdomain columns to the SCQ. The subdomains are not validated and are based on mappings to the ADI-R subdomains.

Parameters:

Name Type Description Default
df DataFrame

DataFrame

required

Returns:

Type Description
DataFrame

pd.DataFrame: DataFrame with SCQ Subdomain columns added

Source code in pondtools\measures\scq.py
def add_scq_subdomains(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach the SCQ social, communication, RRB and social-communication subdomain totals.

    Every subdomain is the row-wise sum of the item columns named in the matching
    module constant. The subdomains are not validated and are based on mappings
    to the ADI-R subdomains.

    Args:
        df (pd.DataFrame): DataFrame holding the ``SCQ1_STD`` .. ``SCQ40_STD`` items.

    Returns:
        pd.DataFrame: Copy of ``df`` with the SCQ subdomain columns added, or an
        unmodified copy (after printing a message) when items are missing.
    """
    scored = df.copy()

    # All 40 SCQ items have to be present before any subdomain is computed.
    needed_items = {f"SCQ{item}_STD" for item in range(1, 41)}
    if not needed_items.issubset(scored.columns):
        print("Missing columns required to calculate SCQ subdomains")
        return scored

    # Same order as before: social, communication, RRB, then social-communication.
    scored["SCQ_SOCIAL_DOMAIN"] = scored.loc[:, SCQ_SOCIAL_DOMAIN].sum(axis=1)
    scored["SCQ_COMMUNICATION_DOMAIN"] = scored.loc[:, SCQ_COMMUNICATION_DOMAIN].sum(axis=1)
    scored["SCQ_RRB_DOMAIN"] = scored.loc[:, SCQ_RRB_DOMAIN].sum(axis=1)
    scored["SCQ_SC_DOMAIN"] = scored.loc[:, SCQ_SC_DOMAIN].sum(axis=1)
    return scored