updated script to combine dfs
This commit is contained in:
parent
841d18d10b
commit
acd0b8355b
3 changed files with 98 additions and 112 deletions
|
@ -31,6 +31,7 @@ if(is.null(drug)|is.null(gene)) {
|
||||||
#options(scipen = 999) #disabling scientific notation in R.
|
#options(scipen = 999) #disabling scientific notation in R.
|
||||||
#========================================================
|
#========================================================
|
||||||
#%% variable assignment: input and output paths & filenames
|
#%% variable assignment: input and output paths & filenames
|
||||||
|
|
||||||
gene_match = paste0(gene,'_p.')
|
gene_match = paste0(gene,'_p.')
|
||||||
cat(gene_match)
|
cat(gene_match)
|
||||||
|
|
||||||
|
@ -46,7 +47,7 @@ outdir = paste0(datadir, '/', drug, '/', 'output')
|
||||||
#===========
|
#===========
|
||||||
# input file 1: master data
|
# input file 1: master data
|
||||||
#in_filename_master = 'original_tanushree_data_v2.csv' #19K
|
#in_filename_master = 'original_tanushree_data_v2.csv' #19K
|
||||||
in_filename_master = 'mtb_gwas_meta_v3.csv' #33k
|
in_filename_master = 'mtb_gwas_meta_v6.csv' #35k
|
||||||
infile_master = paste0(datadir, '/', in_filename_master)
|
infile_master = paste0(datadir, '/', in_filename_master)
|
||||||
cat(paste0('Reading infile1: raw data', ' ', infile_master) )
|
cat(paste0('Reading infile1: raw data', ' ', infile_master) )
|
||||||
|
|
||||||
|
@ -324,7 +325,7 @@ x = sapply(snps,function(m){
|
||||||
#%%======================================================
|
#%%======================================================
|
||||||
# Writing file with calculated ORs and AFs
|
# Writing file with calculated ORs and AFs
|
||||||
cat(paste0('writing output file: '
|
cat(paste0('writing output file: '
|
||||||
, '\nFilen: ', outfile_af_or))
|
, '\nFile: ', outfile_af_or))
|
||||||
|
|
||||||
write.csv(ors_df, outfile_af_or
|
write.csv(ors_df, outfile_af_or
|
||||||
, row.names = F)
|
, row.names = F)
|
||||||
|
|
|
@ -71,7 +71,7 @@ args = arg_parser.parse_args()
|
||||||
#%% variable assignment: input and output
|
#%% variable assignment: input and output
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'pyrazinamide'
|
||||||
#gene = 'pncA'
|
#gene = 'pncA'
|
||||||
#gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
|
|
||||||
drug = args.drug
|
drug = args.drug
|
||||||
gene = args.gene
|
gene = args.gene
|
||||||
|
@ -99,7 +99,7 @@ in_filename_foldx = gene.lower() + '_foldx.csv'
|
||||||
in_filename_dssp = gene.lower() + '_dssp.csv'
|
in_filename_dssp = gene.lower() + '_dssp.csv'
|
||||||
in_filename_kd = gene.lower() + '_kd.csv'
|
in_filename_kd = gene.lower() + '_kd.csv'
|
||||||
in_filename_rd = gene.lower() + '_rd.csv'
|
in_filename_rd = gene.lower() + '_rd.csv'
|
||||||
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
|
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv'
|
||||||
in_filename_afor = gene.lower() + '_af_or.csv'
|
in_filename_afor = gene.lower() + '_af_or.csv'
|
||||||
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
|
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
|
||||||
|
|
||||||
|
@ -109,19 +109,18 @@ infile_foldx = outdir + '/' + in_filename_foldx
|
||||||
infile_dssp = outdir + '/' + in_filename_dssp
|
infile_dssp = outdir + '/' + in_filename_dssp
|
||||||
infile_kd = outdir + '/' + in_filename_kd
|
infile_kd = outdir + '/' + in_filename_kd
|
||||||
infile_rd = outdir + '/' + in_filename_rd
|
infile_rd = outdir + '/' + in_filename_rd
|
||||||
infile_snpinfo = indir + '/' + in_filename_snpinfo
|
infile_snpinfo = outdir + '/' + in_filename_snpinfo
|
||||||
infile_afor = outdir + '/' + in_filename_afor
|
infile_afor = outdir + '/' + in_filename_afor
|
||||||
infile_afor_kin = outdir + '/' + in_filename_afor_kin
|
infile_afor_kin = outdir + '/' + in_filename_afor_kin
|
||||||
|
|
||||||
|
|
||||||
print('\nInput path:', indir
|
print('\nInput path:', indir
|
||||||
, '\nOutput path:', outdir
|
, '\nOutput path:', outdir, '\n'
|
||||||
, '\nInput filename mcsm:', infile_mcsm
|
, '\nInput filename mcsm:', infile_mcsm
|
||||||
, '\nInput filename foldx:', infile_foldx
|
, '\nInput filename foldx:', infile_foldx, '\n'
|
||||||
, '\nInput filename dssp:', infile_dssp
|
, '\nInput filename dssp:', infile_dssp
|
||||||
, '\nInput filename kd:', infile_kd
|
, '\nInput filename kd:', infile_kd
|
||||||
, '\nInput filename rd', infile_rd
|
, '\nInput filename rd', infile_rd , '\n'
|
||||||
, '\nInput filename snp info:', infile_snpinfo
|
, '\nInput filename snp info:', infile_snpinfo, '\n'
|
||||||
, '\nInput filename af or:', infile_afor
|
, '\nInput filename af or:', infile_afor
|
||||||
, '\nInput filename afor kinship:', infile_afor_kin
|
, '\nInput filename afor kinship:', infile_afor_kin
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
@ -208,99 +207,69 @@ print('\nResult of Fourth merge:', combined_df.shape
|
||||||
|
|
||||||
# OR merges: TEDIOUSSSS!!!!
|
# OR merges: TEDIOUSSSS!!!!
|
||||||
|
|
||||||
#%%============================================================================
|
|
||||||
|
#%%RRRR
|
||||||
print('==================================='
|
print('==================================='
|
||||||
, '\nFifth merge: afor_df + snpinfo_df'
|
, '\nFifth merge: afor_df + afor_kin_df'
|
||||||
, '\n===================================')
|
, '\n===================================')
|
||||||
|
|
||||||
# OR combining
|
# OR combining
|
||||||
afor_df = pd.read_csv(infile_afor, sep = ',')
|
afor_df = pd.read_csv(infile_afor, sep = ',')
|
||||||
#afor_df.columns = afor_df.columns.str.lower()
|
#afor_df.columns = afor_df.columns.str.lower()
|
||||||
|
|
||||||
snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
|
|
||||||
#snpinfo_df_all.columns = snpinfo_df_all.columns.str.lower()
|
|
||||||
|
|
||||||
#afor_snpinfo_dfs = combine_dfs_with_checks(afor_df, snpinfo_df_all, my_join = i_join)
|
|
||||||
merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df_all)
|
|
||||||
afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df_all, on = merging_cols_m5, how = l_join)
|
|
||||||
|
|
||||||
# finding mutations lacking meta data
|
|
||||||
foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
|
||||||
foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
|
||||||
|
|
||||||
bar = snpinfo_df_all[~snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
|
|
||||||
bar1 = snpinfo_df_all[snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
|
|
||||||
|
|
||||||
# checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
# afor_df
|
|
||||||
if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
|
|
||||||
print('No duplicate mutations detected in afor_df')
|
|
||||||
else:
|
|
||||||
print('Dropping duplicate mutations detected in afor_df')
|
|
||||||
afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
|
|
||||||
|
|
||||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
# finding mutations lacking meta data
|
|
||||||
# FIXME: should get fixmed with JP's resolved dataset!?
|
|
||||||
print('There are', len(afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])])
|
|
||||||
, 'mutations with various or calculated that have no additional info...STRANGE'
|
|
||||||
, 'Reported to Jody on 14 july 2020 on skype!')
|
|
||||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
|
|
||||||
foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
|
||||||
foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
|
|
||||||
|
|
||||||
# snpinfo_df_all
|
|
||||||
ndups = 0
|
|
||||||
if not snpinfo_df_all['mutation'].shape[0] == snpinfo_df_all['mutation'].nunique():
|
|
||||||
ndups = snpinfo_df_all['mutation'].duplicated().sum()
|
|
||||||
print(ndups, 'duplicated muts detected in snpinfo_df_all.'
|
|
||||||
, '\nHowever these may have different nucleotide changes. Checking further...')
|
|
||||||
#expected_nrows = afor_df.shape[0] + ndups
|
|
||||||
cols_to_check = ['mutation', 'mutationinformation', 'ref_allele', 'alt_allele']
|
|
||||||
|
|
||||||
if snpinfo_df_all.duplicated(subset = cols_to_check).sum() == 0:
|
|
||||||
print('No *REAL* duplicate muts detected in snpinfo_df_all'
|
|
||||||
, '\nDim of df:', snpinfo_df_all.shape)
|
|
||||||
snpinfo_df_all = snpinfo_df_all.copy()
|
|
||||||
else:
|
|
||||||
print(snpinfo_df_all.duplicated(subset = cols_to_check).sum()
|
|
||||||
, ' Actual duplicate mutations detected in snpinfo_df_all')
|
|
||||||
dup_muts = snpinfo_df_all[['mutation', 'mutationinformation']][snpinfo_df_all.duplicated(subset = cols_to_check)]
|
|
||||||
print(len(dup_muts), 'duplicated mutation detected'
|
|
||||||
, '\nDropping duplicated mutations before merging')
|
|
||||||
snpinfo_df_all = snpinfo_df_all.drop_duplicates(subset = cols_to_check, keep = 'first')
|
|
||||||
print('Dim of df after removing duplicates:', snpinfo_df_all.shape)
|
|
||||||
|
|
||||||
|
|
||||||
if len(afor_snpinfo_dfs) == afor_df.shape[0] + ndups:
|
|
||||||
print('PASS: succesfully combined with left join')
|
|
||||||
else:
|
|
||||||
print('FAIL: unsuccessful merge'
|
|
||||||
, '\nDim of df1:', afor_df.shape
|
|
||||||
, '\nDim of df2:', snpinfo_df_all.shape)
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
print('\nResult of Fifth merge:', afor_snpinfo_dfs.shape
|
|
||||||
, '\n===================================================================')
|
|
||||||
#%%============================================================================
|
|
||||||
print('==================================='
|
|
||||||
, '\nSixth merge: fifth merge + afor_kin_df'
|
|
||||||
, '\nafor_snpinfo_dfs + afor_kin_df'
|
|
||||||
, '\n===================================')
|
|
||||||
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
|
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
|
||||||
afor_kin_df.columns = afor_kin_df.columns.str.lower()
|
afor_kin_df.columns = afor_kin_df.columns.str.lower()
|
||||||
|
|
||||||
#ors_df = combine_dfs_with_checks(afor_snpinfo_dfs, afor_kin_df, my_join = o_join)
|
|
||||||
merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
|
|
||||||
print('Dim of df1:', afor_snpinfo_dfs.shape
|
|
||||||
, '\nDim of df2:', afor_kin_df.shape
|
|
||||||
, '\nNo. of merging_cols:', len(merging_cols_m6))
|
|
||||||
|
|
||||||
ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = o_join)
|
|
||||||
|
|
||||||
# Dropping unncessary columns
|
merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df)
|
||||||
cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
|
|
||||||
|
print('Dim of afor_df:', afor_df.shape
|
||||||
|
, '\nDim of afor_kin_df:', afor_kin_df.shape)
|
||||||
|
|
||||||
|
# finding if ALL afor_kin_df muts are present in afor_df
|
||||||
|
# i.e all kinship muts should be PRESENT in mycalcs_present
|
||||||
|
if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]:
|
||||||
|
print('PASS: ALL or_kinship muts are present in my or list')
|
||||||
|
else:
|
||||||
|
nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])])
|
||||||
|
nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]
|
||||||
|
print('FAIL:', nf_muts, 'muts present in afor_kin_df NOT present in my or list'
|
||||||
|
, '\nsee "nf_muts_df" created containing not found(nf) muts')
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
|
||||||
|
# Now checking how many afor_df muts are NOT present in afor_kin_df
|
||||||
|
common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])])
|
||||||
|
extra_muts_myor = afor_kin_df.shape[0] - common_muts
|
||||||
|
|
||||||
|
print('=========================================='
|
||||||
|
, '\nmy or calcs', extra_muts_myor, 'extra mutation\n'
|
||||||
|
, '\n==========================================')
|
||||||
|
|
||||||
|
print('Expected cals for merging with outer_join...')
|
||||||
|
|
||||||
|
expected_rows = afor_df.shape[0] + extra_muts_myor
|
||||||
|
expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5)
|
||||||
|
|
||||||
|
ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join)
|
||||||
|
|
||||||
|
if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols:
|
||||||
|
print('PASS: OR dfs successfully combined! PHEWWWW!')
|
||||||
|
else:
|
||||||
|
print('FAIL: could not combine OR dfs'
|
||||||
|
, '\nCheck expected rows and cols calculation and join type')
|
||||||
|
|
||||||
|
print('Dim of merged ors_df:', ors_df.shape)
|
||||||
|
#%%============================================================================
|
||||||
|
# formatting ors_df
|
||||||
|
|
||||||
|
ors_df.columns
|
||||||
|
|
||||||
|
|
||||||
|
# Dropping unncessary columns: already removed in ealier preprocessing
|
||||||
|
#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ]
|
||||||
|
cols_to_drop = ['n_miss']
|
||||||
print('Dropping', len(cols_to_drop), 'columns:\n'
|
print('Dropping', len(cols_to_drop), 'columns:\n'
|
||||||
, cols_to_drop)
|
, cols_to_drop)
|
||||||
ors_df.drop(cols_to_drop, axis = 1, inplace = True)
|
ors_df.drop(cols_to_drop, axis = 1, inplace = True)
|
||||||
|
@ -314,14 +283,16 @@ column_order = ['mutation'
|
||||||
, 'wild_type'
|
, 'wild_type'
|
||||||
, 'position'
|
, 'position'
|
||||||
, 'mutant_type'
|
, 'mutant_type'
|
||||||
, 'chr_num_allele'
|
#, 'chr_num_allele' #old
|
||||||
, 'ref_allele'
|
, 'ref_allele'
|
||||||
, 'alt_allele'
|
, 'alt_allele'
|
||||||
, 'mut_info'
|
, 'mut_info_f1'
|
||||||
|
, 'mut_info_f2'
|
||||||
, 'mut_type'
|
, 'mut_type'
|
||||||
, 'gene_id'
|
, 'gene_id'
|
||||||
, 'gene_number'
|
#, 'gene_number' #old
|
||||||
, 'mut_region'
|
, 'gene_name'
|
||||||
|
#, 'mut_region'
|
||||||
#, 'reference_allele'
|
#, 'reference_allele'
|
||||||
#, 'alternate_allele'
|
#, 'alternate_allele'
|
||||||
, 'chromosome_number'
|
, 'chromosome_number'
|
||||||
|
@ -347,13 +318,15 @@ column_order = ['mutation'
|
||||||
, 'zval_logistic'
|
, 'zval_logistic'
|
||||||
, 'logl_h1_kin'
|
, 'logl_h1_kin'
|
||||||
, 'l_remle_kin'
|
, 'l_remle_kin'
|
||||||
, 'wt_3let'
|
#, 'wt_3let' # old
|
||||||
, 'mt_3let'
|
#, 'mt_3let' # old
|
||||||
#, 'symbol'
|
#, 'symbol'
|
||||||
, 'n_miss']
|
#, 'n_miss'
|
||||||
|
]
|
||||||
|
|
||||||
if len(column_order) == ors_df.shape[1] == len(DataFrame(column_order).isin(ors_df.columns)):
|
if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()):
|
||||||
print('PASS: Column order generated for all columns in df', len(column_order), 'columns'
|
print('PASS: Column order generated for all:', len(column_order), 'columns'
|
||||||
|
, '\nColumn names match, safe to reorder columns'
|
||||||
, '\nApplying column order to df...' )
|
, '\nApplying column order to df...' )
|
||||||
ors_df_ordered = ors_df[column_order]
|
ors_df_ordered = ors_df[column_order]
|
||||||
else:
|
else:
|
||||||
|
@ -366,15 +339,15 @@ print('\nResult of Sixth merge:', ors_df_ordered.shape
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
print('==================================='
|
print('==================================='
|
||||||
, '\nSeventh merge: Fourth + Sixth merge'
|
, '\nSixth merge: Fourth + Fifth merge'
|
||||||
, '\ncombined_df + ors_df_ordered'
|
, '\ncombined_df + ors_df_ordered'
|
||||||
, '\n===================================')
|
, '\n===================================')
|
||||||
|
|
||||||
#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
|
#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
|
||||||
merging_cols_m7 = detect_common_cols(combined_df, ors_df_ordered)
|
merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered)
|
||||||
print('Dim of df1:', combined_df.shape
|
print('Dim of df1:', combined_df.shape
|
||||||
, '\nDim of df2:', ors_df_ordered.shape
|
, '\nDim of df2:', ors_df_ordered.shape
|
||||||
, '\nNo. of merging_cols:', len(merging_cols_m7))
|
, '\nNo. of merging_cols:', len(merging_cols_m6))
|
||||||
|
|
||||||
print('Checking mutations in the two dfs:'
|
print('Checking mutations in the two dfs:'
|
||||||
, '\nmuts in df1 but NOT in df2:'
|
, '\nmuts in df1 but NOT in df2:'
|
||||||
|
@ -385,13 +358,13 @@ print('Checking mutations in the two dfs:'
|
||||||
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
|
#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) )
|
||||||
|
|
||||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = l_join)
|
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join)
|
||||||
#combined_df_all.shape
|
combined_df_all.shape
|
||||||
|
|
||||||
# FIXME: DIM
|
# FIXME: DIM
|
||||||
# only with left join!
|
# only with left join!
|
||||||
outdf_expected_rows = len(combined_df)
|
outdf_expected_rows = len(combined_df)
|
||||||
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m7)
|
outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6)
|
||||||
|
|
||||||
#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:
|
#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:
|
||||||
if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
|
if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
|
||||||
|
@ -426,8 +399,6 @@ combined_df_all.to_csv(outfile_comb, index = False)
|
||||||
print('\nFinished writing file:'
|
print('\nFinished writing file:'
|
||||||
, '\nNo. of rows:', combined_df_all.shape[0]
|
, '\nNo. of rows:', combined_df_all.shape[0]
|
||||||
, '\nNo. of cols:', combined_df_all.shape[1])
|
, '\nNo. of cols:', combined_df_all.shape[1])
|
||||||
|
|
||||||
|
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% incase you FIX the the function: combine_dfs_with_checks
|
#%% incase you FIX the the function: combine_dfs_with_checks
|
||||||
#def main():
|
#def main():
|
||||||
|
|
|
@ -261,6 +261,20 @@ else:
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
del(df_ncols, ncols_add)
|
del(df_ncols, ncols_add)
|
||||||
|
|
||||||
|
#%% now adding mutation style = <gene>_p.abc1cde
|
||||||
|
dfm2_mis['mutation'] = gene.lower() + '_' + dfm2_mis['mut_info_f2'].astype(str)
|
||||||
|
# convert to lowercase
|
||||||
|
dfm2_mis['mutation'] = dfm2_mis['mutation'].str.lower()
|
||||||
|
|
||||||
|
# quick sanity check
|
||||||
|
check = dfm2_mis['mutation'].value_counts().value_counts() == dfm2_mis['mut_info_f2'].value_counts().value_counts()
|
||||||
|
|
||||||
|
if check.all():
|
||||||
|
print('PASS: added column "mutation" containing mutation format: <gene>_p.abc1cde')
|
||||||
|
else:
|
||||||
|
print('FAIL: could not add "mutation" column!')
|
||||||
|
sys.exit()
|
||||||
#%% Calculating OR from beta coeff
|
#%% Calculating OR from beta coeff
|
||||||
print('Calculating OR...')
|
print('Calculating OR...')
|
||||||
df_ncols = dfm2_mis.shape[1]
|
df_ncols = dfm2_mis.shape[1]
|
||||||
|
@ -364,7 +378,7 @@ print('Reordering', dfm2_mis.shape[1], 'columns'
|
||||||
|
|
||||||
#dfm2_mis.columns
|
#dfm2_mis.columns
|
||||||
|
|
||||||
column_order = [#'mutation',
|
column_order = ['mutation',
|
||||||
'mutationinformation',
|
'mutationinformation',
|
||||||
'wild_type',
|
'wild_type',
|
||||||
'position',
|
'position',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue