From 5429b8fed70b52638c93b980fda36bfa93afba94 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Thu, 28 Apr 2022 13:02:30 +0100
Subject: [PATCH] saving data extraction updated script

---
Reviewer notes (this area is commentary and is not part of the commit message):
- Line breaks were restored from a whitespace-collapsed copy of this patch.
  The per-hunk line counts agree with the @@ headers, but leading indentation
  of context lines is a best guess -- TODO confirm against the target file.
  If context does not match, apply with `git apply --ignore-whitespace` or
  regenerate the patch with `git format-patch`.
- Last hunk: the added banner string began with '\#', which is not a valid
  Python escape (a literal backslash is printed and newer interpreters warn).
  It now begins with '\n', matching the closing banner line.
- Last hunk: the printed label 'drtye_mode' was corrected to 'drtype_mode',
  the column whose counts are actually printed below it.
- Because two added lines were edited, the post-image blob id on the `index`
  line no longer describes the resulting file.

 scripts/data_extraction.py | 61 +++++++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index be9fc9a..bc418d1 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -68,7 +68,7 @@ os.getcwd()
 arg_parser = argparse.ArgumentParser()
 arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None)
 arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None)
-arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
+arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
 arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input')
 arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output')
 arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for Data, input and output', action='store_true')
@@ -93,13 +93,13 @@ if not datadir:
     datadir = homedir + '/' + 'git/Data'
 
 if not indir:
-    indir = datadir + '/' + drug + '/input_v2'
+    indir = datadir + '/' + drug + '/input'
 
 if not outdir:
-    outdir = datadir + '/' + drug + '/output_v2'
+    outdir = datadir + '/' + drug + '/output'
 
 if make_dirs:
-    print('make_dirs is turned on, creating data dir:', datadir)
+    print('make_dirs is turned on, creating data dir (unless it already exists):', datadir)
     try:
         os.makedirs(datadir, exist_ok = True)
         print("Directory '%s' created successfully" %datadir)
@@ -1411,7 +1411,7 @@ gene_LF3['drtype_numeric'] = gene_LF3['drtype'].map(drtype_map)
 gene_LF3['drtype'].value_counts()
 gene_LF3['drtype_numeric'].value_counts()
 
-#%% Multimode: drtype
+# Multimode: drtype
 #=============================
 # Recalculation: Revised drtype
 # max(multimode)
@@ -1520,7 +1520,7 @@ gene_LF3['dst_multimode'].value_counts()
 # Now get the max from multimode
 #gene_LF3['dst_mode'] = gene_LF3.groupby('mutationinformation')['dst_noNA'].max() # this somehow is not right!
 #gene_LF3['dst_noNA'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
-gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x))
+gene_LF3['dst_mode'] = gene_LF3['dst_multimode'].apply(lambda x: np.nanmax(x)) #ML
 
 # sanity checks
 #gene_LF3['dst_noNA'].equals(gene_LF3['dst_mode'])
@@ -1969,8 +1969,55 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
           , '\n============================================================='
           , '\nPost resolving ambiguity\n'
           , ambig_muts_rev_df['mutation_info_REV'].value_counts())
+
+print('\n============================================================='
+      , '\n============================================================='
+      , '\n###############################\n'
+      , '\nNumbers for ML workflows...'
+      , '\n###############################\n'
+
+      , '\ncolumn name [drug, old]:', drug, '\n'
+      , gene_LF4[drug].value_counts()
+      , '\nTotal drug samples[old]:', gene_LF4[drug].value_counts().sum()
+      , '\nPercentages:\n'
+      , gene_LF4[drug].value_counts(normalize = True)
+      , '\n-------------------------------------------------------------'
+
+      , '\ncolumn name [drug, revised]: dst_mode\n'
+      , gene_LF4['dst_mode'].value_counts()
+      , '\nTotal drug samples[revised]:', gene_LF4['dst_mode'].value_counts().sum()
+      , '\nPercentages:\n'
+      , gene_LF4['dst_mode'].value_counts(normalize = True)
+      , '\n-------------------------------------------------------------'
+
+      , '\n-------------------------------------------------------------'
+      , '\ncolumn name: drtype_mode\n'
+      , gene_LF4['drtype_mode'].value_counts()
+      , '\nTotal drtype_mode:', gene_LF4['drtype_mode'].value_counts().sum()
+      , '\nPercentages:\n'
+      , gene_LF4['drtype_mode'].value_counts(normalize = True)
+      , '\n-------------------------------------------------------------'
+
+      , '\n-------------------------------------------------------------'
+      , '\ncolumn name [dm_om, old]: mutation_info_orig\n'
+      , gene_LF4['mutation_info_orig'].value_counts()
+      , '\nTotal mutation_info_orig:', gene_LF4['mutation_info_orig'].value_counts().sum()
+      , '\nPercentages:\n'
+      , gene_LF4['mutation_info_orig'].value_counts(normalize = True)
+      , '\n-------------------------------------------------------------'
+
+      , '\n-------------------------------------------------------------'
+      , '\ncolumn name [dm_om, revised]: mutation_info\n'
+      , gene_LF4['mutation_info'].value_counts()
+      , '\nTotal mutation_info:', gene_LF4['mutation_info'].value_counts().sum()
+      , '\nPercentages:\n'
+      , gene_LF4['mutation_info'].value_counts(normalize = True)
+      , '\n-------------------------------------------------------------'
+
+      , '\n============================================================='
+      )
 #=======================================================================
 print(u'\u2698' * 50,
       '\nEnd of script: Data extraction and writing files'
       '\n' + u'\u2698' * 50 )
-#%% end of script
\ No newline at end of file
+#%% end of script