
Load

drop_duplicate_ids(df, id_col='SUBJECT', sort_col=None, ascending=True)

Drops duplicate ids by a set of criteria

Parameters:

df (DataFrame): DataFrame. Required.
id_col (Optional[str]): Name of ID column. Defaults to "SUBJECT".
sort_col (Optional[str]): Name of column used to sort values. If None, it will look at date columns with differences and pick the oldest or newest value. Defaults to None.
ascending (Optional[bool]): Option to sort in ascending order (or descending if False). Defaults to True.

Returns:

pd.DataFrame: DataFrame where duplicated IDs are dropped.

Source code in pondtools\load.py
@pf.register_dataframe_method
def drop_duplicate_ids(
    df: pd.DataFrame,
    id_col: Optional[str] = "SUBJECT",
    sort_col: Optional[str] = None,
    ascending: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Drops duplicate ids by a set of criteria

    Args:
        df (pd.DataFrame): DataFrame
        id_col (Optional[str], optional): Name of ID column. Defaults to "SUBJECT".
        sort_col (Optional[str], optional): Name of column used to sort values. Defaults to None. If None, it will look at date columns with differences and pick the oldest or newest value.
        ascending (Optional[bool], optional): Option to sort in ascending order (or descending if False). Defaults to True.

    Returns:
        pd.DataFrame: DataFrame where duplicated IDs are dropped.
    """
    df = df.copy().drop_duplicates()
    vc = df[id_col].value_counts()
    duplicate_subjects = (vc[vc > 1]).index.to_list()

    if sort_col is not None:
        return df.sort_values(sort_col, ascending=ascending).drop_duplicates(subset=[id_col], keep='first')

    else:
        indexes_to_drop = []
        for subject in duplicate_subjects:
            df_subject = (
                df.query(f"{id_col} == {subject}")
                .get_duplicate_columns(duplicate=False)
            )

            df_subject_dates = df_subject.search_columns(["date", "doc", "instance"])
            indexes_original = set(df_subject.index.to_list())
            # If no different date columns, keep instance with the least nulls
            if df_subject_dates.shape[0] == 0 or df_subject_dates.shape[1] == 0:
                keep = df_subject.notnull().sum(axis=1).sort_values().index[-1]
            # If different date columns do exist, keep the earliest instance
            else:
                keep = df_subject_dates.sort_values(df_subject_dates.columns[0], axis=0, ascending=ascending).index[0]
            indexes_to_drop.extend(list(indexes_original - set([keep])))
        return df.drop(index=indexes_to_drop)
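
Example:

A minimal usage sketch (illustrative only; the column names are assumptions, and the pondtools.load module must be imported so the pandas-flavor accessor is registered):

import pandas as pd
import pondtools.load  # assumed import; the decorators register the DataFrame accessor methods on import

df = pd.DataFrame({
    "SUBJECT": [1, 1, 2],
    "VISIT_DATE": pd.to_datetime(["2020-01-01", "2020-06-01", "2020-01-15"]),
    "SCORE": [10, 12, 9],
})

# With an explicit sort_col, the earliest (ascending=True) record per SUBJECT is kept.
deduped = df.drop_duplicate_ids(id_col="SUBJECT", sort_col="VISIT_DATE", ascending=True)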

get_duplicate_ids(df, id_col='SUBJECT')

Returns a DataFrame of IDs that are duplicated.

Parameters:

df (DataFrame): DataFrame. Required.
id_col (Optional[str]): Name of subject column. Defaults to 'SUBJECT'.

Returns:

pd.DataFrame: DataFrame with duplicated IDs.

Source code in pondtools\load.py
@pf.register_dataframe_method
def get_duplicate_ids(
    df: pd.DataFrame, 
    id_col: Optional[str] = "SUBJECT",
) -> pd.DataFrame:
    """
    Returns a DataFrame of IDs that are duplicated.

    Args:
        df (pd.DataFrame): DataFrame
        id_col (Optional[str], optional): Name of subject column. Defaults to 'SUBJECT'.

    Returns:
        pd.DataFrame: DataFrame with Duplicated IDs
    """

    df = df.copy()
    vc = df[id_col].value_counts()
    duplicate_subjects = (vc[vc > 1]).index.to_list()
    return df.query(f'{id_col}.isin(@duplicate_subjects)', engine='python')
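
Example:

A short illustrative sketch, continuing with the small df from the previous example:

# Rows whose SUBJECT value occurs more than once.
dupes = df.get_duplicate_ids(id_col="SUBJECT")
print(dupes["SUBJECT"].value_counts())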

load_data(filepath=MERGE_CACHE, measure_mappings=MEASURE_MAPPINGS, multicols=True, drop_9000=True, drop_999=True, iq=True, cbcl=True, anxiety=True, language=True, scq=True, rbs=True, demographics=True, geocode=False, cache=True, cache_location=LOAD_CACHE)

Reads a POND export, or a POND export merged with additional files, and returns a processed DataFrame.

Parameters:

filepath (Optional[str]): Filepath of the unprocessed/raw POND export. Defaults to MERGE_CACHE.
measure_mappings (Optional[str]): Measure mappings for multicolumn levels. Defaults to MEASURE_MAPPINGS.
multicols (Optional[bool]): Option to have multilevel columns determined by the measure_mappings. Defaults to True.
drop_9000 (Optional[bool]): Option to replace missing-data codes over 9000 with NaN. Defaults to True.
drop_999 (Optional[bool]): Option to replace missing-data codes of 999 and above with NaN. Defaults to True.
iq (Optional[bool]): Option to process IQ data. Defaults to True.
cbcl (Optional[bool]): Option to process CBCL data. Defaults to True.
anxiety (Optional[bool]): Option to process anxiety data. Defaults to True.
language (Optional[bool]): Option to process language data. Defaults to True.
scq (Optional[bool]): Option to process SCQ data. Defaults to True.
rbs (Optional[bool]): Option to process RBS data. Defaults to True.
demographics (Optional[bool]): Option to process demographic data. Defaults to True.
geocode (Optional[bool]): Option to process geocode data (note that this can take over 5 minutes). Defaults to False.
cache (Optional[bool]): Option to use cached data instead of re-running the processing script. If cache=False, the script runs again and the old cache is overwritten. Defaults to True.
cache_location (Optional[str]): Filepath where the cache is saved and read. Defaults to LOAD_CACHE.

Returns:

pd.DataFrame: Processed DataFrame

Source code in pondtools\load.py
def load_data(
    filepath: Optional[str] = MERGE_CACHE,
    measure_mappings: Optional[str] = MEASURE_MAPPINGS,
    multicols: Optional[bool] = True,
    drop_9000: Optional[bool] = True,
    drop_999: Optional[bool] = True,
    iq: Optional[bool] = True,
    cbcl: Optional[bool] = True,
    anxiety: Optional[bool] = True,
    language: Optional[bool] = True,
    scq: Optional[bool] = True,
    rbs: Optional[bool] = True,
    demographics: Optional[bool] = True,
    geocode: Optional[bool] = False,
    cache:  Optional[bool] = True,
    cache_location: Optional[str] = LOAD_CACHE,
) -> pd.DataFrame:
    """
    Reads a POND export, or a POND export merged with additional files, and returns a processed DataFrame.

    Args:
        filepath (Optional[str], optional): Filepath path of unprocessed/raw POND export. Defaults to MERGE_CACHE.
        measure_mappings (Optional[str], optional): Measure mappings for multicolumn levels. Defaults to MEASURE_MAPPINGS.
        multicols (Optional[bool], optional): Option to have multilevel columns determined by the `measure_mappings`. Defaults to True.
        drop_9000 (Optional[bool], optional): Option to replace missing-data codes over 9000 with NaN. Defaults to True.
        drop_999 (Optional[bool], optional): Option to replace missing-data codes of 999 and above with NaN. Defaults to True.
        iq (Optional[bool], optional): Option to process IQ data. Defaults to True.
        cbcl (Optional[bool], optional): Option to process CBCL data. Defaults to True.
        anxiety (Optional[bool], optional): Option to process Anxiety Data. Defaults to True.
        language (Optional[bool], optional): Option to process Language Data. Defaults to True.
        scq (Optional[bool], optional): Option to process SCQ Data. Defaults to True.
        rbs (Optional[bool], optional): Option to process RBS data. Defaults to True.
        demographics (Optional[bool], optional): Option to process Demographic Data. Defaults to True.
        geocode (Optional[bool], optional): Option to process Geocode Data (note that this can take over 5 minutes). Defaults to False.
        cache (Optional[bool], optional): Option to use cached data instead of re-running the processing script. If `cache=False`, the script runs again and the old cache is overwritten. Defaults to True.
        cache_location (Optional[str], optional): Filepath of where to save and access cache. Defaults to LOAD_CACHE.

    Returns:
        pd.DataFrame: Processed DataFrame
    """

    ##########################################################
    # Load data from cache
    ##########################################################
    try:
        if cache:
            return read_general(cache_location)
    except Exception:
        print('Cache does not exist. Set cache=False to create the cache.')

    ##########################################################
    # Load data from file
    ##########################################################

    # Import pond data

    if isinstance(filepath, pd.DataFrame):
        df = filepath
    else:
        df = read_general(filepath)

    ##########################################################
    # Dropping unnecessary columns and filtering invalid values
    ##########################################################

    # Drop unnecessary columns
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # Remove the non-numeric values

    if 'ANXIETYTOTAL' in df.columns:
        df["ANXIETYTOTAL"] = df["ANXIETYTOTAL"].apply(lambda s: _find_floats(s))
        df["ANXIETYTOTAL"] = np.where(df["ANXIETYTOTAL"] > 900, np.nan, df["ANXIETYTOTAL"])


    # Drop values where the missing data code is 9000+
    if drop_9000:
        df = df.filter_columns(col=DROP_9000, threshold=9000, comparison='>', replacement=np.nan)

    # Change string column to numeric
    if "SA_TSCORE" in df.columns:
        df["SA_TSCORE"] = pd.to_numeric(df["SA_TSCORE"])

    # Drop values where the missing data code is 999+
    if drop_999:
        df = df.filter_columns(col=DROP_999, threshold=999, comparison='>=', replacement=np.nan)
        # for col in DROP_999:
        #     if col in df.columns:
        #         old = df[col]
        #         df[col] = np.where(old >= 999, np.nan, old)

    ##########################################################
    # Add extra measure-specific columns
    ##########################################################

    # IQ
    if iq:
        # Combine different IQ measures
        df = combine_iq(df)

        # Calculate IQ gap
        df["IQ_GAP"] = df["PERF_IQ"] - df["VERB_IQ"]

    # CBCL
    if cbcl:
        # Combine CBCL (0-6 and 6-18)
        df = combine_cbcl(df)

    # Anxiety Measures
    if anxiety:
        # Combine anxiety measures (Spence, RCADS)
        df = combine_anxiety(df)

    # Language
    if language:
        # Combine language measures (OWLS, PLS)
        df = combine_language(df)

    # SCQ
    if scq:       
        # Calculate SCQ subdomains
        df = add_scq_subdomains(df)

    # RBS
    if rbs:
        # Calculate RBS-R Subscales
        df = add_rbs_subscales(df)

    # Demographics
    if demographics:
        # Add columns relating to co-occurring ASD, ADHD, OCD and TD Diagnoses
        df = add_diagnoses_columns(df)

        # Adds sum, max, min and mean columns to combine CAREGIVER 1 and 2 demographic data.
        df = add_combine_cargiver_columns(df)

    # Geocode
    if geocode:
        # Adds geocode data. Determines based on postal code what public health unit participant belongs in. Also merges census data.
        df = add_geocode_columns(df)


    ##########################################################
    # Rename columns
    ##########################################################

    # Rename sex/gender columns
    df.rename(
        columns={
            "SEX": "SEX_DEMO",
            "NSI_SEX": "SEX",
            "SEX_STD": "SEX_DEMO_STD",
            "NSI_SEX_STD": "SEX_STD",
        },
        inplace=True,
    )

    df.rename(columns=ETHNICITY_MAPPING, inplace=True)

    ##########################################################
    # Drop Duplicates
    ##########################################################

    # Drop Exact Duplicates
    df = df.drop_duplicates()

    # Drop duplicates from multiple RCADS test
    df = drop_duplicate_ids(df)

    ##########################################################
    # Create multi-level columns
    ##########################################################

    if multicols:
        # Get dataframe columns
        df_columns = (
            df.columns.to_frame().reset_index(drop=True).rename(columns={0: "Field"})
        )

        # Get dictionary with category mappings
        measure_mapping_dict = (
            read_general(measure_mappings)
            .melt(var_name="Category", value_name="Field")
            .dropna()
            .set_index("Field")
            .to_dict()["Category"]
        )

        # Categorize fields and replace columns with multi-index
        df_columns["Category"] = [
            measure_mapping_dict[i] if i in measure_mapping_dict.keys() else "None" for i in df_columns["Field"]
        ]
        df.columns = pd.MultiIndex.from_frame(df_columns[["Category", "Field"]])

        # Set index to participant number
        df["Index"] = df["Participant_Data"]["SUBJECT"]
        df.set_index("Index", inplace=True)

    else:
        df["Index"] = df["SUBJECT"]
        df.set_index("Index", inplace=True)

    ##########################################################
    # Save to cache if running data
    ##########################################################

    # Save df as cache
    df.to_pickle(cache_location)

    return df
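
Example:

A hypothetical call (the options shown are illustrative; the actual defaults are MERGE_CACHE and LOAD_CACHE as in the source above):

from pondtools.load import load_data  # assumed import path

df = load_data(
    cache=False,    # re-run the processing pipeline and overwrite the cache
    geocode=False,  # skip the slow geocoding step
)

# With multicols=True (the default), the columns form a (Category, Field) MultiIndex,
# so a whole category can be selected at the top level.
subjects = df["Participant_Data"]["SUBJECT"]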

load_subset(subset=['Participant_Data', 'SCQ', 'IQ_Summary'], filepath='DATA_FILEPATH', field_cat='FIELD_CAT', drop_upper_level=True, cache=True)

Reads and loads a DataFrame restricted to the specified column categories.

Parameters:

subset (List[str]): Top-level column categories to keep. Defaults to ["Participant_Data", "SCQ", "IQ_Summary"].
filepath (Optional[str]): Filepath to excel file. Defaults to 'DATA_FILEPATH'.
field_cat (Optional[str]): Excel file with categorization of fields. Defaults to 'FIELD_CAT'.
cache (Optional[bool]): Option passed to load_data to use cached data. Defaults to True.

Returns:

pd.DataFrame: Loaded DataFrame

Source code in pondtools\load.py
def load_subset(
    subset: List[str] = ["Participant_Data", "SCQ", "IQ_Summary"],
    filepath: Optional[str] = "DATA_FILEPATH",
    field_cat: Optional[str] = "FIELD_CAT",
    drop_upper_level: Optional[bool] = True,
    cache: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Reads and loads a DataFrame restricted to the specified column categories.

    Args:
        subset: Top-level column categories to keep. Defaults to ["Participant_Data", "SCQ", "IQ_Summary"].
        filepath: Filepath to excel file. Defaults to 'DATA_FILEPATH'.
        field_cat: Excel file with categorization of fields. Defaults to 'FIELD_CAT'.
        cache: Option passed to load_data to use cached data. Defaults to True.
    Returns:
        pd.DataFrame: Loaded DataFrame
    """

    df = load_data(filepath, field_cat, cache=cache)

    df = df[subset]
    df.columns = df.columns.droplevel("Category")
    return df
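
Example:

A hypothetical call (illustrative only; note that the defaults for filepath and field_cat are the literal strings 'DATA_FILEPATH' and 'FIELD_CAT', so in practice explicit paths are usually passed):

from pondtools.load import load_subset  # assumed import path

df_sub = load_subset(subset=["Participant_Data", "SCQ"], cache=True)

# The "Category" column level is dropped, so fields are addressed directly,
# e.g. df_sub["SUBJECT"].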

merge_data(pond_export, extra_files=[], merge_extras=True, save=False, save_location=MERGE_CACHE, convert_datetime=True, merge_col='Subject')

Merge official pond export with extra files (typically unofficial exports)

Parameters:

pond_export (str): Filepath of the official POND export. Required.
extra_files (List[str]): List of filepath(s) of the extra files (e.g., unofficial POND exports to merge in). Defaults to [].
merge_extras (Optional[bool]): Option to merge extra files. Defaults to True.
save (Optional[bool]): Option to save the merged file to the pondtools cache. Defaults to False.
save_location (Optional[str]): Filepath where the merged file is saved. Defaults to MERGE_CACHE.
convert_datetime (Optional[bool]): Option to convert columns to datetime type. Defaults to True.
merge_col (Optional[str]): Name of the index/subject column that the extra files are merged on. Defaults to 'Subject'.

Returns:

pd.DataFrame: Merged DataFrame

Source code in pondtools\load.py
def merge_data(
    pond_export: str,
    extra_files: List[str] = [],
    merge_extras: Optional[bool] = True,
    save: Optional[bool] = False,
    save_location: Optional[str] = MERGE_CACHE,
    convert_datetime: Optional[bool] = True,
    merge_col: Optional[str] = "Subject",
) -> pd.DataFrame:
    """
    Merge official pond export with extra files (typically unofficial exports)

    Args:
        pond_export (str): Filepath of the official POND export
        extra_files (List[str]): List of filepath(s) of the extra files (e.g., unofficial POND exports you want to merge with)
        merge_extras (Optional[bool], optional): Option to merge extra files. Defaults to True.
        save (Optional[bool], optional): Option to save file with the cache in pond_tool module. Defaults to False.
        save_location (Optional[str], optional): Filepath to where you want to save merged file. Defaults to MERGE_CACHE.
        convert_datetime (Optional[bool], optional): Option to convert columns to datetime type. Defaults to True.
        merge_col (Optional[str], optional): Name of index/subject column that your extra files are merging to. Defaults to 'Subject'.

    Returns:
        pd.DataFrame: Merged DataFrame
    """
    # Load pond export
    df = read_general(pond_export)

    # Merge extra files
    if merge_extras and len(extra_files) != 0:
        if isinstance(extra_files, str):
            extra_files = [extra_files]
        for i, file in enumerate(extra_files):
            try:
                df_file = pd.read_csv(file)
            except Exception:
                df_file = pd.read_excel(file)
            df = df.merge(
                df_file,
                how="left",
                left_on="SUBJECT",
                right_on=merge_col,
                suffixes=["", f"_{i}"],
            )

    # Convert datetime columns
    if convert_datetime:
        df = convert_datetime_cols(df)

    # Save file in cache
    if save:
        # tdate = datetime.today().strftime("%Y-%m-%d")
        # df.to_pickle(get_filepath(fr"cache/{tdate}-POND_Merged_Data.pkl"))
        df.to_pickle(save_location)
    return df
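
Example:

An illustrative call with placeholder file paths (merge_col must match the subject column in the extra files):

from pondtools.load import merge_data  # assumed import path

merged = merge_data(
    pond_export="pond_export.xlsx",        # placeholder path to the official export
    extra_files=["extra_measures.csv"],    # placeholder path(s) to unofficial exports
    merge_col="Subject",
    save=False,
)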

print_duplicate_ids(df, id_col='SUBJECT', return_indexes=True)

Prints the subjects with duplicate IDs along with the date or instance columns that differ.

Parameters:

df (DataFrame): DataFrame. Required.
id_col (Optional[str]): Name of subject column. Defaults to 'SUBJECT'.
return_indexes (Optional[bool]): Option to return the duplicated subject IDs. Defaults to True.
Source code in pondtools\load.py
@pf.register_dataframe_method
def print_duplicate_ids(
    df: pd.DataFrame, 
    id_col: Optional[str] = "SUBJECT",
    return_indexes: Optional[bool] = True
) -> Optional[List]:
    """
    Prints the subjects with duplicate IDs along with the date or instance columns that differ.

    Args:
        df (pd.DataFrame): DataFrame
        id_col (Optional[str], optional): Name of subject column. Defaults to 'SUBJECT'.
        return_indexes (Optional[bool], optional): Option to return the duplicated subject IDs. Defaults to True.

    Returns:
        Optional[List]: List of duplicated subject IDs if return_indexes is True.
    """
    df = df.copy()
    vc = df[id_col].value_counts()
    duplicate_subjects = (vc[vc > 1]).index.to_list()

    for subject in duplicate_subjects:
        df_subject = df.query(f"{id_col} == {subject}").get_duplicate_columns(
            duplicate=False
        )
        print(subject)
        print(df_subject.search_columns(["date", "doc", "instance"]))
        print("\n")

    if return_indexes:
        return duplicate_subjects
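
Example:

An illustrative call (assumes pondtools.load has been imported so the accessor is registered, and df is a raw export with a SUBJECT column):

# Prints the differing date/instance columns for each duplicated subject
# and returns the list of duplicated subject IDs.
dup_ids = df.print_duplicate_ids(id_col="SUBJECT", return_indexes=True)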