
Load

drop_duplicate_ids(df, id_col='SUBJECT', sort_col=None, ascending=True)

Drops duplicate ids by a set of criteria

Parameters:

df (DataFrame): DataFrame. Required.
id_col (Optional[str]): Name of ID column. Defaults to "SUBJECT".
sort_col (Optional[str]): Name of column used to sort values. If None, it will look at date columns with differences and pick the oldest or newest value. Defaults to None.
ascending (Optional[bool]): Option to sort in ascending order (or descending if False). Defaults to True.

Returns:

pd.DataFrame: DataFrame where duplicated IDs are dropped.

Source code in pondtools\load.py
@pf.register_dataframe_method
def drop_duplicate_ids(
    df: pd.DataFrame,
    id_col: Optional[str] = "SUBJECT",
    sort_col: Optional[str] = None,
    ascending: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Drops duplicate ids by a set of criteria

    Args:
        df (pd.DataFrame): DataFrame
        id_col (Optional[str], optional): Name of ID column. Defaults to "SUBJECT".
        sort_col (Optional[str], optional): Name of column used to sort values. Defaults to None. If None, it will look at date columns with differences and pick the oldest or newest value.
        ascending (Optional[bool], optional): Option to sort in ascending order (or descending if False). Defaults to True.

    Returns:
        pd.DataFrame: DataFrame where duplicated IDs are dropped.
    """
    df = df.copy().drop_duplicates()
    vc = df[id_col].value_counts()
    duplicate_subjects = (vc[vc > 1]).index.to_list()

    if sort_col is not None:
        return df.sort_values(sort_col, ascending=ascending).drop_duplicates(subset=[id_col], keep='first')

    else:
        indexes_to_drop = []
        for subject in duplicate_subjects:
            df_subject = (
                df.query(f"{id_col} == {subject}")
                .get_duplicate_columns(duplicate=False)
            )

            df_subject_dates = df_subject.search_columns(["date", "doc", "instance"])
            indexes_original = set(df_subject.index.to_list())
            # If no different date columns, keep instance with the least nulls
            if df_subject_dates.shape[0] == 0 or df_subject_dates.shape[1] == 0:
                keep = df_subject.notnull().sum(axis=1).sort_values().index[-1]
            # If different date columns do exist, keep the earliest instance
            else:
                keep = df_subject_dates.sort_values(df_subject_dates.columns[0], axis=0, ascending=ascending).index[0]
            indexes_to_drop.extend(list(indexes_original - set([keep])))
        return df.drop(index=indexes_to_drop)
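
Example:

A minimal usage sketch (illustrative only; the column names are assumptions, and the pondtools.load module must be imported so the pandas-flavor accessor is registered):

import pandas as pd
import pondtools.load  # assumed import; the decorators register the DataFrame accessor methods on import

df = pd.DataFrame({
    "SUBJECT": [1, 1, 2],
    "VISIT_DATE": pd.to_datetime(["2020-01-01", "2020-06-01", "2020-01-15"]),
    "SCORE": [10, 12, 9],
})

# With an explicit sort_col, the earliest (ascending=True) record per SUBJECT is kept.
deduped = df.drop_duplicate_ids(id_col="SUBJECT", sort_col="VISIT_DATE", ascending=True)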

get_duplicate_ids(df, id_col='SUBJECT')

Returns a DataFrame of IDs that are duplicated.

Parameters:

df (DataFrame): DataFrame. Required.
id_col (Optional[str]): Name of subject column. Defaults to 'SUBJECT'.

Returns:

pd.DataFrame: DataFrame with duplicated IDs.

Source code in pondtools\load.py
@pf.register_dataframe_method
def get_duplicate_ids(
    df: pd.DataFrame, 
    id_col: Optional[str] = "SUBJECT",
) -> pd.DataFrame:
    """
    Returns a DataFrame of IDs that are duplicated.

    Args:
        df (pd.DataFrame): DataFrame
        id_col (Optional[str], optional): Name of subject column. Defaults to 'SUBJECT'.

    Returns:
        pd.DataFrame: DataFrame with Duplicated IDs
    """

    df = df.copy()
    vc = df[id_col].value_counts()
    duplicate_subjects = (vc[vc > 1]).index.to_list()
    return df.query(f'{id_col}.isin(@duplicate_subjects)', engine='python')
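
Example:

A short illustrative sketch, continuing with the small df from the previous example:

# Rows whose SUBJECT value occurs more than once.
dupes = df.get_duplicate_ids(id_col="SUBJECT")
print(dupes["SUBJECT"].value_counts())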

load_data(filepath=MERGE_CACHE, measure_mappings=MEASURE_MAPPINGS, multicols=True, drop_9000=True, drop_999=True, iq=True, cbcl=True, anxiety=True, language=True, scq=True, rbs=True, demographics=True, geocode=False, cache=True, cache_location=LOAD_CACHE)

Reads a POND export, or a POND export merged with additional files, and returns a processed DataFrame.

Parameters:

filepath (Optional[str]): Filepath of the unprocessed/raw POND export. Defaults to MERGE_CACHE.
measure_mappings (Optional[str]): Measure mappings for multicolumn levels. Defaults to MEASURE_MAPPINGS.
multicols (Optional[bool]): Option to have multilevel columns determined by the measure_mappings. Defaults to True.
drop_9000 (Optional[bool]): Option to replace missing-data codes over 9000 with NaN. Defaults to True.
drop_999 (Optional[bool]): Option to replace missing-data codes of 999 and above with NaN. Defaults to True.
iq (Optional[bool]): Option to process IQ data. Defaults to True.
cbcl (Optional[bool]): Option to process CBCL data. Defaults to True.
anxiety (Optional[bool]): Option to process anxiety data. Defaults to True.
language (Optional[bool]): Option to process language data. Defaults to True.
scq (Optional[bool]): Option to process SCQ data. Defaults to True.
rbs (Optional[bool]): Option to process RBS data. Defaults to True.
demographics (Optional[bool]): Option to process demographic data. Defaults to True.
geocode (Optional[bool]): Option to process geocode data (note that this can take over 5 minutes). Defaults to False.
cache (Optional[bool]): Option to use cached data instead of re-running the processing script. If cache=False, the script runs again and the old cache is overwritten. Defaults to True.
cache_location (Optional[str]): Filepath where the cache is saved and read. Defaults to LOAD_CACHE.

Returns:

pd.DataFrame: Processed DataFrame

Source code in pondtools\load.py
def load_data(
    filepath: Optional[str] = MERGE_CACHE,
    measure_mappings: Optional[str] = MEASURE_MAPPINGS,
    multicols: Optional[bool] = True,
    drop_9000: Optional[bool] = True,
    drop_999: Optional[bool] = True,
    iq: Optional[bool] = True,
    cbcl: Optional[bool] = True,
    anxiety: Optional[bool] = True,
    language: Optional[bool] = True,
    scq: Optional[bool] = True,
    rbs: Optional[bool] = True,
    demographics: Optional[bool] = True,
    geocode: Optional[bool] = False,
    cache:  Optional[bool] = True,
    cache_location: Optional[str] = LOAD_CACHE,
) -> pd.DataFrame:
    """
    Reads a POND export, or a POND export merged with additional files, and returns a processed DataFrame.

    Args:
        filepath (Optional[str], optional): Filepath path of unprocessed/raw POND export. Defaults to MERGE_CACHE.
        measure_mappings (Optional[str], optional): Measure mappings for multicolumn levels. Defaults to MEASURE_MAPPINGS.
        multicols (Optional[bool], optional): Option to have multilevel columns determined by the `measure_mappings`. Defaults to True.
        drop_9000 (Optional[bool], optional): Option to replace missing-data codes over 9000 with NaN. Defaults to True.
        drop_999 (Optional[bool], optional): Option to replace missing-data codes of 999 and above with NaN. Defaults to True.
        iq (Optional[bool], optional): Option to process IQ data. Defaults to True.
        cbcl (Optional[bool], optional): Option to process CBCL data. Defaults to True.
        anxiety (Optional[bool], optional): Option to process Anxiety Data. Defaults to True.
        language (Optional[bool], optional): Option to process Language Data. Defaults to True.
        scq (Optional[bool], optional): Option to process SCQ Data. Defaults to True.
        rbs (Optional[bool], optional): Option to process RBS data. Defaults to True.
        demographics (Optional[bool], optional): Option to process Demographic Data. Defaults to True.
        geocode (Optional[bool], optional): Option to process Geocode Data (note that this can take over 5 minutes). Defaults to False.
        cache (Optional[bool], optional): Option to use cached data instead of re-running the processing script. If `cache=False`, the script runs again and the old cache is overwritten. Defaults to True.
        cache_location (Optional[str], optional): Filepath of where to save and access cache. Defaults to LOAD_CACHE.

    Returns:
        pd.DataFrame: Processed DataFrame
    """

    ##########################################################
    # Load data from cache
    ##########################################################
    try:
        if cache:
            return read_general(cache_location)
    except Exception:
        print('Cache does not exist. Set cache=False to create the cache.')

    ##########################################################
    # Load data from file
    ##########################################################

    # Import pond data

    if isinstance(filepath, pd.DataFrame):
        df = filepath
    else:
        df = read_general(filepath)

    ##########################################################
    # Dropping unnecessary columns and filtering invalid values
    ##########################################################

    # Drop unnecessary columns
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # Remove the non-numeric values

    if 'ANXIETYTOTAL' in df.columns:
        df["ANXIETYTOTAL"] = df["ANXIETYTOTAL"].apply(lambda s: _find_floats(s))
        df["ANXIETYTOTAL"] = np.where(df["ANXIETYTOTAL"] > 900, np.nan, df["ANXIETYTOTAL"])


    # Drop values where the missing data code is 9000+
    if drop_9000:
        df = df.filter_columns(col=DROP_9000, threshold=9000, comparison='>', replacement=np.nan)

    # Change string column to numeric
    if "SA_TSCORE" in df.columns:
        df["SA_TSCORE"] = pd.to_numeric(df["SA_TSCORE"])

    # Drop values where the missing data code is 999+
    if drop_999:
        df = df.filter_columns(col=DROP_999, threshold=999, comparison='>=', replacement=np.nan)
        # for col in DROP_999:
        #     if col in df.columns:
        #         old = df[col]
        #         df[col] = np.where(old >= 999, np.nan, old)

    ##########################################################
    # Add extra measure-specific columns
    ##########################################################

    # IQ
    if iq:
        # Combine different IQ measures
        df = combine_iq(df)

        # Calculate IQ gap
        df["IQ_GAP"] = df["PERF_IQ"] - df["VERB_IQ"]

    # CBCL
    if cbcl:
        # Combine CBCL (0-6 and 6-18)
        df = combine_cbcl(df)

    # Anxiety Measures
    if anxiety:
        # Combine anxiety measures (Spence, RCADS)
        df = combine_anxiety(df)

    # Language
    if language:
        # Combine language measures (OWLS, PLS)
        df = combine_language(df)

    # SCQ
    if scq:       
        # Calculate SCQ subdomains
        df = add_scq_subdomains(df)

    # RBS
    if rbs:
        # Calculate RBS-R Subscales
        df = add_rbs_subscales(df)

    # Demographics
    if demographics:
        # Add columns relating to co-occurring ASD, ADHD, OCD and TD Diagnoses
        df = add_diagnoses_columns(df)

        # Adds sum, max, min and mean columns to combine CAREGIVER 1 and 2 demographic data.
        df = add_combine_cargiver_columns(df)

    # Geocode
    if geocode:
        # Adds geocode data. Determines based on postal code what public health unit participant belongs in. Also merges census data.
        df = add_geocode_columns(df)


    ##########################################################
    # Rename columns
    ##########################################################

    # Rename sex/gender columns
    df.rename(
        columns={
            "SEX": "SEX_DEMO",
            "NSI_SEX": "SEX",
            "SEX_STD": "SEX_DEMO_STD",
            "NSI_SEX_STD": "SEX_STD",
        },
        inplace=True,
    )

    df.rename(columns=ETHNICITY_MAPPING, inplace=True)

    ##########################################################
    # Drop Duplicates
    ##########################################################

    # Drop Exact Duplicates
    df = df.drop_duplicates()

    # Drop duplicates from multiple RCADS test
    df = drop_duplicate_ids(df)

    ##########################################################
    # Create multi-level columns
    ##########################################################

    if multicols:
        # Get dataframe columns
        df_columns = (
            df.columns.to_frame().reset_index(drop=True).rename(columns={0: "Field"})
        )

        # Get dictionary with category mappings
        measure_mapping_dict = (
            read_general(measure_mappings)
            .melt(var_name="Category", value_name="Field")
            .dropna()
            .set_index("Field")
            .to_dict()["Category"]
        )

        # Categorize fields and replace columns with multi-index
        df_columns["Category"] = [
            measure_mapping_dict[i] if i in measure_mapping_dict.keys() else "None" for i in df_columns["Field"]
        ]
        df.columns = pd.MultiIndex.from_frame(df_columns[["Category", "Field"]])

        # Set index to participant number
        df["Index"] = df["Participant_Data"]["SUBJECT"]
        df.set_index("Index", inplace=True)

    else:
        df["Index"] = df["SUBJECT"]
        df.set_index("Index", inplace=True)

    ##########################################################
    # Save to cache if running data
    ##########################################################

    # Save df as cache
    df.to_pickle(cache_location)

    return df
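
Example:

A hypothetical call (the options shown are illustrative; the actual defaults are MERGE_CACHE and LOAD_CACHE as in the source above):

from pondtools.load import load_data  # assumed import path

df = load_data(
    cache=False,    # re-run the processing pipeline and overwrite the cache
    geocode=False,  # skip the slow geocoding step
)

# With multicols=True (the default), the columns form a (Category, Field) MultiIndex,
# so a whole category can be selected at the top level.
subjects = df["Participant_Data"]["SUBJECT"]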

load_subset(subset=['Participant_Data', 'SCQ', 'IQ_Summary'], filepath='DATA_FILEPATH', field_cat='FIELD_CAT', drop_upper_level=True, cache=True)

Reads and loads a DataFrame restricted to the specified column categories.

Parameters:

subset (List[str]): Top-level column categories to keep. Defaults to ["Participant_Data", "SCQ", "IQ_Summary"].
filepath (Optional[str]): Filepath to excel file. Defaults to 'DATA_FILEPATH'.
field_cat (Optional[str]): Excel file with categorization of fields. Defaults to 'FIELD_CAT'.
cache (Optional[bool]): Option passed to load_data to use cached data. Defaults to True.

Returns:

pd.DataFrame: Loaded DataFrame

Source code in pondtools\load.py
def load_subset(
    subset: List[str] = ["Participant_Data", "SCQ", "IQ_Summary"],
    filepath: Optional[str] = "DATA_FILEPATH",
    field_cat: Optional[str] = "FIELD_CAT",
    drop_upper_level: Optional[bool] = True,
    cache: Optional[bool] = True,
) -> pd.DataFrame:
    """
    Reads and loads a DataFrame restricted to the specified column categories.

    Args:
        subset: Top-level column categories to keep. Defaults to ["Participant_Data", "SCQ", "IQ_Summary"].
        filepath: Filepath to excel file. Defaults to 'DATA_FILEPATH'.
        field_cat: Excel file with categorization of fields. Defaults to 'FIELD_CAT'.
        cache: Option passed to load_data to use cached data. Defaults to True.
    Returns:
        pd.DataFrame: Loaded DataFrame
    """

    df = load_data(filepath, field_cat, cache=cache)

    df = df[subset]
    df.columns = df.columns.droplevel("Category")
    return df
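
Example:

A hypothetical call (illustrative only; note that the defaults for filepath and field_cat are the literal strings 'DATA_FILEPATH' and 'FIELD_CAT', so in practice explicit paths are usually passed):

from pondtools.load import load_subset  # assumed import path

df_sub = load_subset(subset=["Participant_Data", "SCQ"], cache=True)

# The "Category" column level is dropped, so fields are addressed directly,
# e.g. df_sub["SUBJECT"].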

merge_data(pond_export, extra_files=[], merge_extras=True, save=False, save_location=MERGE_CACHE, convert_datetime=True, merge_col='Subject')

Merge official pond export with extra files (typically unofficial exports)

Parameters:

pond_export (str): Filepath of the official POND export. Required.
extra_files (List[str]): List of filepath(s) of the extra files (e.g., unofficial POND exports to merge in). Defaults to [].
merge_extras (Optional[bool]): Option to merge extra files. Defaults to True.
save (Optional[bool]): Option to save the merged file to the pondtools cache. Defaults to False.
save_location (Optional[str]): Filepath where the merged file is saved. Defaults to MERGE_CACHE.
convert_datetime (Optional[bool]): Option to convert columns to datetime type. Defaults to True.
merge_col (Optional[str]): Name of the index/subject column that the extra files are merged on. Defaults to 'Subject'.

Returns:

pd.DataFrame: Merged DataFrame

Source code in pondtools\load.py
def merge_data(
    pond_export: str,
    extra_files: List[str] = [],
    merge_extras: Optional[bool] = True,
    save: Optional[bool] = False,
    save_location: Optional[str] = MERGE_CACHE,
    convert_datetime: Optional[bool] = True,
    merge_col: Optional[str] = "Subject",
) -> pd.DataFrame:
    """
    Merge official pond export with extra files (typically unofficial exports)

    Args:
        pond_export (str): Filepath of the official POND export
        extra_files (List[str]): List of filepath(s) of the extra files (e.g., unofficial POND exports you want to merge with)
        merge_extras (Optional[bool], optional): Option to merge extra files. Defaults to True.
        save (Optional[bool], optional): Option to save file with the cache in pond_tool module. Defaults to False.
        save_location (Optional[str], optional): Filepath to where you want to save merged file. Defaults to MERGE_CACHE.
        convert_datetime (Optional[bool], optional): Option to convert columns to datetime type. Defaults to True.
        merge_col (Optional[str], optional): Name of index/subject column that your extra files are merging to. Defaults to 'Subject'.

    Returns:
        pd.DataFrame: Merged DataFrame
    """
    # Load pond export
    df = read_general(pond_export)

    # Merge extra files
    if merge_extras and len(extra_files) != 0:
        if isinstance(extra_files, str):
            extra_files = [extra_files]
        for i, file in enumerate(extra_files):
            try:
                df_file = pd.read_csv(file)
            except Exception:
                df_file = pd.read_excel(file)
            df = df.merge(
                df_file,
                how="left",
                left_on="SUBJECT",
                right_on=merge_col,
                suffixes=["", f"_{i}"],
            )

    # Convert datetime columns
    if convert_datetime:
        df = convert_datetime_cols(df)

    # Save file in cache
    if save:
        # tdate = datetime.today().strftime("%Y-%m-%d")
        # df.to_pickle(get_filepath(fr"cache/{tdate}-POND_Merged_Data.pkl"))
        df.to_pickle(save_location)
    return df
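
Example:

An illustrative call with placeholder file paths (merge_col must match the subject column in the extra files):

from pondtools.load import merge_data  # assumed import path

merged = merge_data(
    pond_export="pond_export.xlsx",        # placeholder path to the official export
    extra_files=["extra_measures.csv"],    # placeholder path(s) to unofficial exports
    merge_col="Subject",
    save=False,
)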

print_duplicate_ids(df, id_col='SUBJECT', return_indexes=True)

Prints the subjects with duplicate IDs along with the date or instance columns that differ.

Parameters:

df (DataFrame): DataFrame. Required.
id_col (Optional[str]): Name of subject column. Defaults to 'SUBJECT'.
return_indexes (Optional[bool]): Option to return the duplicated subject IDs. Defaults to True.
Source code in pondtools\load.py
@pf.register_dataframe_method
def print_duplicate_ids(
    df: pd.DataFrame, 
    id_col: Optional[str] = "SUBJECT",
    return_indexes: Optional[bool] = True
) -> Optional[List]:
    """
    Prints the subjects with duplicate IDs along with the date or instance columns that differ.

    Args:
        df (pd.DataFrame): DataFrame
        id_col (Optional[str], optional): Name of subject column. Defaults to 'SUBJECT'.
        return_indexes (Optional[bool], optional): Option to return the duplicated subject IDs. Defaults to True.

    Returns:
        Optional[List]: List of duplicated subject IDs if return_indexes is True.
    """
    df = df.copy()
    vc = df[id_col].value_counts()
    duplicate_subjects = (vc[vc > 1]).index.to_list()

    for subject in duplicate_subjects:
        df_subject = df.query(f"{id_col} == {subject}").get_duplicate_columns(
            duplicate=False
        )
        print(subject)
        print(df_subject.search_columns(["date", "doc", "instance"]))
        print("\n")

    if return_indexes:
        return duplicate_subjects
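
Example:

An illustrative call (assumes pondtools.load has been imported so the accessor is registered, and df is a raw export with a SUBJECT column):

# Prints the differing date/instance columns for each duplicated subject
# and returns the list of duplicated subject IDs.
dup_ids = df.print_duplicate_ids(id_col="SUBJECT", return_indexes=True)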