Load
drop_duplicate_ids(df, id_col='SUBJECT', sort_col=None, ascending=True)
Drops duplicate IDs according to a set of criteria.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df | DataFrame | DataFrame | required |
id_col | Optional[str] | Name of ID column. Defaults to "SUBJECT". | 'SUBJECT' |
sort_col | Optional[str] | Name of column used to sort values. Defaults to None. If None, it will look at date columns with differences and pick the oldest or newest value. | None |
ascending | Optional[bool] | Option to sort in ascending order (or descending if False). Defaults to True. | True |
Returns:
Type | Description |
---|---|
DataFrame | pd.DataFrame: DataFrame where duplicated IDs are dropped. |
Source code in pondtools\load.py
@pf.register_dataframe_method
def drop_duplicate_ids(
df: pd.DataFrame,
id_col: Optional[str] = "SUBJECT",
sort_col: Optional[str] = None,
ascending: Optional[bool] = True,
) -> pd.DataFrame:
"""
    Drops duplicate IDs according to a set of criteria.
Args:
df (pd.DataFrame): DataFrame
id_col (Optional[str], optional): Name of ID column. Defaults to "SUBJECT".
sort_col (Optional[str], optional): Name of column used to sort values. Defaults to None. If None, it will look at date columns with differences and pick the oldest or newest value.
ascending (Optional[bool], optional): Option to sort in ascending order (or descending if False). Defaults to True.
Returns:
        pd.DataFrame: DataFrame where duplicated IDs are dropped.
"""
df = df.copy().drop_duplicates()
vc = df[id_col].value_counts()
duplicate_subjects = (vc[vc > 1]).index.to_list()
    if sort_col is not None:
return df.sort_values(sort_col, ascending=ascending).drop_duplicates(subset=[id_col], keep='first')
else:
indexes_to_drop = []
for subject in duplicate_subjects:
df_subject = (
df.query(f"{id_col} == {subject}")
.get_duplicate_columns(duplicate=False)
)
df_subject_dates = df_subject.search_columns(["date", "doc", "instance"])
indexes_original = set(df_subject.index.to_list())
# If no different date columns, keep instance with the least nulls
            if df_subject_dates.shape[0] == 0 or df_subject_dates.shape[1] == 0:
keep = df_subject.notnull().sum(axis=1).sort_values().index[-1]
            # If different date columns do exist, keep the earliest instance
else:
keep = df_subject_dates.sort_values(df_subject_dates.columns[0], axis=0, ascending=ascending).index[0]
indexes_to_drop.extend(list(indexes_original - set([keep])))
return df.drop(index=indexes_to_drop)
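Since `drop_duplicate_ids` is registered as a DataFrame method via `pandas_flavor`, it can be called directly on any DataFrame once `pondtools` has been imported. A minimal sketch follows; the toy data and column names are illustrative, and the no-`sort_col` mode also relies on the other registered pondtools helpers (`get_duplicate_columns`, `search_columns`) being available after import:

```python
import pandas as pd
import pondtools  # assumed to run the pandas_flavor registrations in pondtools.load

# Toy data (illustrative): subject 1001 appears twice with different visit dates
df = pd.DataFrame({
    "SUBJECT": [1001, 1001, 1002],
    "VISIT_DATE": pd.to_datetime(["2020-01-01", "2021-06-01", "2020-03-15"]),
    "SCORE": [10, 12, 7],
})

# Explicit sort column: keep the earliest visit for each duplicated subject
deduped = df.drop_duplicate_ids(id_col="SUBJECT", sort_col="VISIT_DATE", ascending=True)

# No sort column: the method inspects date/instance columns that differ and
# keeps the oldest instance (ascending=True) or the newest (ascending=False)
deduped_auto = df.drop_duplicate_ids(id_col="SUBJECT")
```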
get_duplicate_ids(df, id_col='SUBJECT')
Returns a DataFrame of the IDs that are duplicated.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df | DataFrame | DataFrame | required |
id_col | Optional[str] | Name of subject column. Defaults to 'SUBJECT'. | 'SUBJECT' |
Returns:
Type | Description |
---|---|
DataFrame | pd.DataFrame: DataFrame with duplicated IDs |
Source code in pondtools\load.py
@pf.register_dataframe_method
def get_duplicate_ids(
df: pd.DataFrame,
id_col: Optional[str] = "SUBJECT",
) -> pd.DataFrame:
"""
    Returns a DataFrame of the IDs that are duplicated.
Args:
df (pd.DataFrame): DataFrame
id_col (Optional[str], optional): Name of subject column. Defaults to 'SUBJECT'.
Returns:
pd.DataFrame: DataFrame with Duplicated IDs
"""
df = df.copy()
vc = df[id_col].value_counts()
duplicate_subjects = (vc[vc > 1]).index.to_list()
return df.query(f'{id_col}.isin(@duplicate_subjects)', engine='python')
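A small sketch (toy data, illustrative names) for inspecting which rows share an ID before deciding how to deduplicate:

```python
import pandas as pd
import pondtools  # assumed to register get_duplicate_ids as a DataFrame method

df = pd.DataFrame({"SUBJECT": [1001, 1001, 1002], "SCORE": [10, 12, 7]})

# Returns both rows for subject 1001, since that ID occurs more than once
dupes = df.get_duplicate_ids(id_col="SUBJECT")
```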
load_data(filepath=WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/cache/POND_Merged_Data.pkl'), measure_mappings=WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/mapping/MeasureMappings.xlsx'), multicols=True, drop_9000=True, drop_999=True, iq=True, cbcl=True, anxiety=True, language=True, scq=True, rbs=True, demographics=True, geocode=False, cache=True, cache_location=WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/cache/LoadedCachedData.pkl'))
Reads a POND export, or a POND export merged with additional files, and returns a processed DataFrame.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
filepath | Optional[str] | Filepath of the unprocessed/raw POND export. Defaults to MERGE_CACHE. | WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/cache/POND_Merged_Data.pkl') |
measure_mappings | Optional[str] | Measure mappings for multicolumn levels. Defaults to MEASURE_MAPPINGS. | WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/mapping/MeasureMappings.xlsx') |
multicols | Optional[bool] | Option to have multilevel columns determined by the `measure_mappings`. Defaults to True. | True |
drop_9000 | Optional[bool] | Option to filter out values over 9000 (missing data codes). Defaults to True. | True |
drop_999 | Optional[bool] | Option to filter out values of 999 and above (missing data codes). Defaults to True. | True |
iq | Optional[bool] | Option to process IQ data. Defaults to True. | True |
cbcl | Optional[bool] | Option to process CBCL data. Defaults to True. | True |
anxiety | Optional[bool] | Option to process Anxiety data. Defaults to True. | True |
language | Optional[bool] | Option to process Language data. Defaults to True. | True |
scq | Optional[bool] | Option to process SCQ data. Defaults to True. | True |
rbs | Optional[bool] | Option to process RBS data. Defaults to True. | True |
demographics | Optional[bool] | Option to process Demographic data. Defaults to True. | True |
geocode | Optional[bool] | Option to process Geocode data (note that this can take more than 5 minutes). Defaults to False. | False |
cache | Optional[bool] | Option to use cached data instead of re-running the processing script. If `cache=False`, the script runs again and the old cache is overwritten. Defaults to True. | True |
cache_location | Optional[str] | Filepath of where to save and access the cache. Defaults to LOAD_CACHE. | WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/cache/LoadedCachedData.pkl') |
Returns:
Type | Description |
---|---|
DataFrame | pd.DataFrame: Processed DataFrame |
Source code in pondtools\load.py
def load_data(
filepath: Optional[str] = MERGE_CACHE,
measure_mappings: Optional[str] = MEASURE_MAPPINGS,
multicols: Optional[bool] = True,
drop_9000: Optional[bool] = True,
drop_999: Optional[bool] = True,
iq: Optional[bool] = True,
cbcl: Optional[bool] = True,
anxiety: Optional[bool] = True,
language: Optional[bool] = True,
scq: Optional[bool] = True,
rbs: Optional[bool] = True,
demographics: Optional[bool] = True,
geocode: Optional[bool] = False,
cache: Optional[bool] = True,
cache_location: Optional[str] = LOAD_CACHE,
) -> pd.DataFrame:
"""
    Reads a POND export, or a POND export merged with additional files, and returns a processed DataFrame.
Args:
filepath (Optional[str], optional): Filepath path of unprocessed/raw POND export. Defaults to MERGE_CACHE.
measure_mappings (Optional[str], optional): Measure mappings for multicolumn levels. Defaults to MEASURE_MAPPINGS.
multicols (Optional[bool], optional): Option to have multilevel columns determined by the `measure_mappings`. Defaults to True.
        drop_9000 (Optional[bool], optional): Option to filter out values over 9000 (missing data codes). Defaults to True.
        drop_999 (Optional[bool], optional): Option to filter out values of 999 and above (missing data codes). Defaults to True.
iq (Optional[bool], optional): Option to process IQ data. Defaults to True.
        cbcl (Optional[bool], optional): Option to process CBCL data. Defaults to True.
anxiety (Optional[bool], optional): Option to process Anxiety Data. Defaults to True.
language (Optional[bool], optional): Option to process Language Data. Defaults to True.
scq (Optional[bool], optional): Option to process SCQ Data. Defaults to True.
rbs (Optional[bool], optional): Option to process RBS data. Defaults to True.
demographics (Optional[bool], optional): Option to process Demographic Data. Defaults to True.
        geocode (Optional[bool], optional): Option to process Geocode data (note that this can take more than 5 minutes). Defaults to False.
        cache (Optional[bool], optional): Option to use cached data instead of re-running the processing script. If `cache=False`, the script runs again and the old cache is overwritten. Defaults to True.
cache_location (Optional[str], optional): Filepath of where to save and access cache. Defaults to LOAD_CACHE.
Returns:
pd.DataFrame: Processed DataFrame
"""
##########################################################
# Load data from cache
##########################################################
try:
if cache:
return read_general(cache_location)
    except Exception:
        print('Cache does not exist. Set cache=False to create cache.')
##########################################################
# Load data from file
##########################################################
# Import pond data
if isinstance(filepath, pd.DataFrame):
df = filepath
else:
df = read_general(filepath)
##########################################################
# Dropping unnecessary columns and filtering invalid values
##########################################################
# Drop unnecessary columns
if "Unnamed: 0" in df.columns:
df = df.drop(columns=["Unnamed: 0"])
# Remove the non-numeric values
if 'ANXIETYTOTAL' in df.columns:
df["ANXIETYTOTAL"] = df["ANXIETYTOTAL"].apply(lambda s: _find_floats(s))
df["ANXIETYTOTAL"] = np.where(df["ANXIETYTOTAL"] > 900, np.nan, df["ANXIETYTOTAL"])
    # Drop values where the missing data code is 9000+
    if drop_9000:
        df = df.filter_columns(col=DROP_9000, threshold=9000, comparison='>', replacement=np.nan)
# Change string column to numeric
if "SA_TSCORE" in df.columns:
df["SA_TSCORE"] = pd.to_numeric(df["SA_TSCORE"])
    # Drop values where the missing data code is 999+
if drop_999:
df = df.filter_columns(col=DROP_999, threshold=999, comparison='>=', replacement=np.nan)
# for col in DROP_999:
# if col in df.columns:
# old = df[col]
# df[col] = np.where(old >= 999, np.nan, old)
##########################################################
# Add extra measure-specific columns
##########################################################
# IQ
if iq:
        # Combine different IQ measures
df = combine_iq(df)
# Calculate IQ gap
df["IQ_GAP"] = df["PERF_IQ"] - df["VERB_IQ"]
# CBCL
if cbcl:
# Combine CBCL (0-6 and 6-18)
df = combine_cbcl(df)
# Anxiety Measures
if anxiety:
# Combine anxiety measures (Spence, RCADS)
df = combine_anxiety(df)
# Language
if language:
# Combine language measures (OWLS, PLS)
df = combine_language(df)
# SCQ
if scq:
# Calculate SCQ subdomains
df = add_scq_subdomains(df)
# RBS
if rbs:
# Calculate RBS-R Subscales
df = add_rbs_subscales(df)
# Demographics
if demographics:
# Add columns relating to co-occurring ASD, ADHD, OCD and TD Diagnoses
df = add_diagnoses_columns(df)
# Adds sum, max, min and mean columns to combine CAREGIVER 1 and 2 demographic data.
df = add_combine_cargiver_columns(df)
# Geocode
if geocode:
# Adds geocode data. Determines based on postal code what public health unit participant belongs in. Also merges census data.
df = add_geocode_columns(df)
##########################################################
# Rename columns
##########################################################
# Rename sex/gender columns
df.rename(
columns={
"SEX": "SEX_DEMO",
"NSI_SEX": "SEX",
"SEX_STD": "SEX_DEMO_STD",
"NSI_SEX_STD": "SEX_STD",
},
inplace=True,
)
df.rename(columns=ETHNICITY_MAPPING, inplace=True)
##########################################################
# Drop Duplicates
##########################################################
# Drop Exact Duplicates
df = df.drop_duplicates()
# Drop duplicates from multiple RCADS test
df = drop_duplicate_ids(df)
##########################################################
# Create multi-level columns
##########################################################
if multicols:
# Get dataframe columns
df_columns = (
df.columns.to_frame().reset_index(drop=True).rename(columns={0: "Field"})
)
# Get dictionary with category mappings
measure_mapping_dict = (
read_general(measure_mappings)
.melt(var_name="Catergory", value_name="Field")
.dropna()
.set_index("Field")
.to_dict()["Catergory"]
)
# Categorize fields and replace columns with multi-index
df_columns["Category"] = [
measure_mapping_dict[i] if i in measure_mapping_dict.keys() else "None" for i in df_columns["Field"]
]
df.columns = pd.MultiIndex.from_frame(df_columns[["Category", "Field"]])
# Set index to participant number
df["Index"] = df["Participant_Data"]["SUBJECT"]
df.set_index("Index", inplace=True)
else:
df["Index"] = df["SUBJECT"]
df.set_index("Index", inplace=True)
##########################################################
# Save to cache if running data
##########################################################
# Save df as cache
df.to_pickle(cache_location)
return df
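A sketch of a typical call, assuming a merged export has already been produced (e.g., by `merge_data`) and that the function is imported from `pondtools.load`:

```python
from pondtools.load import load_data

# Load the processed DataFrame, reusing the cache if it exists and
# skipping the slow geocoding step
df = load_data(geocode=False, cache=True)

# With multicols=True (the default), columns are a (Category, Field) MultiIndex,
# so a whole measure group can be selected by its upper level
participants = df["Participant_Data"]

# Force the processing pipeline to run again and overwrite the cache
df_fresh = load_data(cache=False)
```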
load_subset(subset=['Participant_Data', 'SCQ', 'IQ_Summary'], filepath='DATA_FILEPATH', field_cat='FIELD_CAT', drop_upper_level=True, cache=True)
Reads and loads a DataFrame with the specified column categories.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
subset | List[str] | List of column categories to keep. | ['Participant_Data', 'SCQ', 'IQ_Summary'] |
filepath | Optional[str] | Filepath to excel file. | 'DATA_FILEPATH' |
field_cat | Optional[str] | Excel file with categorization of fields. | 'FIELD_CAT' |
drop_upper_level | Optional[bool] | Option to drop the upper "Category" column level. Defaults to True. | True |
cache | Optional[bool] | Option to use cached data (passed through to load_data). Defaults to True. | True |
Returns:
Type | Description |
---|---|
DataFrame | pd.DataFrame: Loaded DataFrame |
Source code in pondtools\load.py
def load_subset(
subset: List[str] = ["Participant_Data", "SCQ", "IQ_Summary"],
filepath: Optional[str] = "DATA_FILEPATH",
field_cat: Optional[str] = "FIELD_CAT",
drop_upper_level: Optional[bool] = True,
cache: Optional[bool] = True,
) -> pd.DataFrame:
"""
    Reads and loads a DataFrame with the specified column categories.
    Args:
        subset: List of column categories to keep.
        filepath: Filepath to excel file.
        field_cat: Excel file with categorization of fields.
        drop_upper_level: Option to drop the upper "Category" column level.
        cache: Option to use cached data (passed through to `load_data`).
Returns:
pd.DataFrame: Loaded DataFrame
"""
df = load_data(filepath, field_cat, cache=cache)
df = df[subset]
    if drop_upper_level:
        df.columns = df.columns.droplevel("Category")
return df
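A sketch of pulling only a few measure groups; the defaults `'DATA_FILEPATH'` and `'FIELD_CAT'` are placeholder strings, so real paths (hypothetical below) need to be supplied:

```python
from pondtools.load import load_subset

# Keep only the participant data and SCQ columns; the upper "Category"
# level is dropped, leaving ordinary single-level columns
df_small = load_subset(
    subset=["Participant_Data", "SCQ"],
    filepath="exports/POND_Merged_Data.pkl",     # hypothetical merged export
    field_cat="mapping/MeasureMappings.xlsx",    # hypothetical measure mappings
)
```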
merge_data(pond_export, extra_files=[], merge_extras=True, save=False, save_location=WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/cache/POND_Merged_Data.pkl'), convert_datetime=True, merge_col='Subject')
Merges the official POND export with extra files (typically unofficial exports).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
pond_export | str | Filepath of the official POND export | required |
extra_files | List[str] | List of filepath(s) of the extra files (e.g., unofficial POND exports you want to merge with) | [] |
merge_extras | Optional[bool] | Option to merge extra files. Defaults to True. | True |
save | Optional[bool] | Option to save the merged file to the cache in the pondtools module. Defaults to False. | False |
save_location | Optional[str] | Filepath of where to save the merged file. Defaults to MERGE_CACHE. | WindowsPath('C:/Users/nguye/Documents/Thesis/pond_tools/pondtools/resources/cache/POND_Merged_Data.pkl') |
convert_datetime | Optional[bool] | Option to convert columns to datetime type. Defaults to True. | True |
merge_col | Optional[str] | Name of the index/subject column that the extra files are merged on. Defaults to 'Subject'. | 'Subject' |
Returns:
Type | Description |
---|---|
DataFrame | pd.DataFrame: Merged DataFrame |
Source code in pondtools\load.py
def merge_data(
pond_export: str,
extra_files: List[str] = [],
merge_extras: Optional[bool] = True,
save: Optional[bool] = False,
save_location: Optional[str] = MERGE_CACHE,
convert_datetime: Optional[bool] = True,
merge_col: Optional[str] = "Subject",
) -> pd.DataFrame:
"""
    Merges the official POND export with extra files (typically unofficial exports).
Args:
pond_export (str): Filepath of the official POND export
extra_files (List[str]): List of filepath(s) of the extra files (e.g., unofficial POND exports you want to merge with)
merge_extras (Optional[bool], optional): Option to merge extra files. Defaults to True.
        save (Optional[bool], optional): Option to save the merged file to the cache in the pondtools module. Defaults to False.
save_location (Optional[str], optional): Filepath to where you want to save merged file. Defaults to MERGE_CACHE.
convert_datetime (Optional[bool], optional): Option to convert columns to datetime type. Defaults to True.
        merge_col (Optional[str], optional): Name of the index/subject column that the extra files are merged on. Defaults to 'Subject'.
Returns:
pd.DataFrame: Merged DataFrame
"""
# Load pond export
df = read_general(pond_export)
# Merge extra files
if merge_extras and len(extra_files) != 0:
if isinstance(extra_files, str):
extra_files = [extra_files]
for i, file in enumerate(extra_files):
try:
df_file = pd.read_csv(file)
except:
df_file = pd.read_excel(file)
df = df.merge(
df_file,
how="left",
left_on="SUBJECT",
right_on=merge_col,
suffixes=["", f"_{i}"],
)
# Convert datetime columns
if convert_datetime:
df = convert_datetime_cols(df)
# Save file in cache
if save:
# tdate = datetime.today().strftime("%Y-%m-%d")
# df.to_pickle(get_filepath(fr"cache/{tdate}-POND_Merged_Data.pkl"))
df.to_pickle(save_location)
return df
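A sketch of building the merged export that `load_data` later consumes (file paths below are hypothetical):

```python
from pondtools.load import merge_data

merged = merge_data(
    pond_export="exports/POND_Export.xlsx",         # hypothetical official export
    extra_files=["exports/rcads_unofficial.csv"],   # hypothetical extra file(s)
    merge_col="Subject",  # column in the extra files matched against SUBJECT
    save=True,            # write the result to MERGE_CACHE so load_data can find it
)
```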
print_duplicate_ids(df, id_col='SUBJECT', return_indexes=True)
Prints the subjects with duplicate IDs, along with the date or instance columns that differ.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df | DataFrame | DataFrame | required |
id_col | Optional[str] | Name of subject column. Defaults to 'SUBJECT'. | 'SUBJECT' |
return_indexes | Optional[bool] | Option to return the list of duplicated IDs/subjects. Defaults to True. | True |
Source code in pondtools\load.py
@pf.register_dataframe_method
def print_duplicate_ids(
df: pd.DataFrame,
id_col: Optional[str] = "SUBJECT",
return_indexes: Optional[bool] = True
) -> Optional[List]:
"""
    Prints the subjects with duplicate IDs along with the date or instance columns that differ.
Args:
df (pd.DataFrame): DataFrame
id_col (Optional[str], optional): Name of subject column. Defaults to 'SUBJECT'.
        return_indexes (Optional[bool], optional): Option to return the list of duplicated IDs/subjects. Defaults to True.
    Returns:
        Optional[List]: List of duplicated subject IDs if `return_indexes` is True, otherwise None.
    """
df = df.copy()
vc = df[id_col].value_counts()
duplicate_subjects = (vc[vc > 1]).index.to_list()
for subject in duplicate_subjects:
df_subject = df.query(f"{id_col} == {subject}").get_duplicate_columns(
duplicate=False
)
print(subject)
print(df_subject.search_columns(["date", "doc", "instance"]))
print("\n")
if return_indexes:
return duplicate_subjects
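A sketch for auditing duplicates before dropping them (toy data; numeric IDs keep the internal `query` call happy, and the method relies on the other registered pondtools helpers such as `get_duplicate_columns` and `search_columns`):

```python
import pandas as pd
import pondtools  # assumed to register print_duplicate_ids and its helpers

df = pd.DataFrame({
    "SUBJECT": [1001, 1001, 1002],
    "TEST_DATE": pd.to_datetime(["2020-01-01", "2021-06-01", "2020-03-15"]),
})

# Prints each duplicated SUBJECT together with the date/instance columns that
# differ, and returns the list of duplicated IDs
duplicated_ids = df.print_duplicate_ids(id_col="SUBJECT", return_indexes=True)
```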