From f27c223bdd3bec5d46a40ca2d2fd53f66a853190 Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall
Date: Tue, 14 Jul 2020 14:09:42 +0100
Subject: [PATCH] resolving merge conflicts due to shoddy data

---
 scripts/combining_FIXME.py | 179 +++++++++++++++
 scripts/combining_dfs.py   | 454 +++++++++++++++++++++++++++++++++++++
 2 files changed, 633 insertions(+)
 create mode 100755 scripts/combining_FIXME.py
 create mode 100755 scripts/combining_dfs.py

diff --git a/scripts/combining_FIXME.py b/scripts/combining_FIXME.py
new file mode 100755
index 0000000..030dea6
--- /dev/null
+++ b/scripts/combining_FIXME.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug 6 12:56:03 2019
+
+@author: tanu
+'''
+# FIXME: change filename 2 (mcsm normalised data)
+# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
+#=======================================================================
+# Task: combine 2 dfs on common cols by detecting them
+# includes sanity checks
+
+#=======================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+import numpy as np
+import re
+#from varname import nameof
+
+#%% end of variable assignment for input and output files
+#=======================================================================
+#%% function/method to combine dfs
+
+def detect_common_cols (df1, df2):
+    """
+    Detect common cols
+
+    @param df1: df
+    @type df1: pandas df
+
+    @param df2: df
+    @type df2: pandas df
+
+    @return: common cols
+    @type: list
+    """
+    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
+    print('Length of common_cols:', len(common_cols)
+          , '\nMerging column/s:', common_cols
+          , '\n---------------------------------------------------------------'
+          , '\nType:', type(common_cols)
+          , '\n\ndtypes in merging columns:\n', df1[common_cols].dtypes
+          , '\n---------------------------------------------------------------')
+
+    return common_cols
+
+#%% Function to combine 2 dfs by detecting common cols and performing
+# sanity checks on the output df
+def combine_dfs_with_checks(df1, df2, my_join = 'outer'):
+    """
+    Combine 2 dfs by finding merging columns automatically
+
+    @param df1: data frame
+    @type df1: pandas df
+
+    @param df2: data frame
+    @type df2: pandas df
+
+    @param my_join: join type for merging
+    @type my_join: string
+
+    @return: combined_df
+    @type: pandas df
+    """
+
+    print('Finding common cols and merging cols:'
+          , '\n=========================================================')
+
+    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
+    print('Length of common_cols:', len(common_cols)
+          , '\nmerging column/s:', common_cols
+          , '\ntype:', type(common_cols))
+
+    #print('\ndtypes in merging columns:\n', df1[common_cols].dtypes)
+
+    print('selecting consistent dtypes for merging (object i.e. string)')
+    #merging_cols = df1[common_cols].select_dtypes(include = [object]).columns.tolist()
+    #merging_cols = df1[common_cols].select_dtypes(include = ['int64']).columns.tolist()
+    merging_cols = common_cols.copy()
+
+    nmerging_cols = len(merging_cols)
+    print('Length of merging cols:', nmerging_cols
+          , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
+          , '\n=========================================================')
+
+    #========================
+    # merge 1 (combined_df)
+    # concatenating 2 dfs:
+    # df1, df2
+    #========================
+    # checking cross-over of mutations in the two dfs to merge
+    ndiff_1 = df1[merging_cols].squeeze().isin(df2[merging_cols].squeeze()).sum()
+    ndiff1 = df1.shape[0] - ndiff_1
+    print('There are', ndiff1, 'unmatched mutations in left df')
+
+    #missing_mutinfo = df1[~df1['mutationinformation'].isin(df2['mutationinformation'])]
+    #missing_mutinfo.to_csv('infoless_muts.csv')
+
+    ndiff_2 = df2[merging_cols].squeeze().isin(df1[merging_cols].squeeze()).sum()
+    ndiff2 = df2.shape[0] - ndiff_2
+    print('There are', ndiff2, 'unmatched mutations in right df')
+
+    #comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
+    #comm_vals_count = len(comm_vals)
+    #print('Length of common values:', comm_vals_count, '\ntype:', type(comm_vals_count))
+
+    #========================
+    # merging dfs & sanity checks
+    #========================
+    fail = False
+    print('combining with:', my_join)
+    comb_df = pd.merge(df1, df2, on = merging_cols, how = my_join)
+
+    expected_cols = df1.shape[1] + df2.shape[1] - nmerging_cols
+
+    if my_join == 'right':
+        df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
+        expected_rows = df2_nd.shape[0]
+
+    if my_join == 'left':
+        expected_rows = df1.shape[0]
+
+    #if my_join == 'inner':
+    #    expected_rows = comm_vals_count
+
+    #if my_join == 'outer':
+    #    df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
+    #    df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
+    #    expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - comm_vals_count
+
+    if my_join in ('inner', 'outer') and len(merging_cols) > 1:
+        #comm_vals = np.intersect1d(df1['mutationinformation'], df2['mutationinformation'])
+        print('Length of merging_cols > 1, therefore omitting row checks')
+        combined_df = comb_df.copy()
+        expected_rows = len(combined_df)
+
+    else:
+        comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
+        print('Length of merging_cols == 1, calculating expected rows in merged_df')
+        combined_df = comb_df.drop_duplicates(subset = merging_cols, keep = 'first')
+        if my_join == 'inner':
+            expected_rows = len(comm_vals)
+        if my_join == 'outer':
+            df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
+            df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
+            expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - len(comm_vals)
+
+    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
+        print('PASS: successfully combined dfs with:', my_join, 'join')
+    else:
+        print('FAIL: combined_df\'s expected rows and cols not matched')
+        fail = True
+        print('\nExpected no. of rows:', expected_rows
+              , '\nGot:', len(combined_df)
+              , '\nExpected no. of cols:', expected_cols
+              , '\nGot:', len(combined_df.columns))
+    if fail:
+        sys.exit()
+
+    #if clean:
+    #foo = combined_df.filter(regex = r'.*_x|_y', axis = 1)
+    #print(foo.columns)
+    #print('Detected duplicate cols with suffix: _x _y'
+    #      , '\nDropping duplicate cols and cleaning')
+
+    # drop cols with suffix '_y' and then strip the '_x' suffix from the rest
+    combined_df_clean = combined_df.drop(combined_df.filter(regex = r'_y$').columns, axis = 1)
+    combined_df_clean.rename(columns = lambda x: re.sub('_x$', '', x), inplace = True)
+
+    return combined_df_clean
+
+#%% end of function
+#=======================================================================
diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
new file mode 100755
index 0000000..d5253b8
--- /dev/null
+++ b/scripts/combining_dfs.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug 6 12:56:03 2019
+
+@author: tanu
+'''
+#=======================================================================
+# Task: combining all dfs into a single one
+
+# Input: 8 dfs
+#1) gene.lower() + '_complex_mcsm_norm.csv'
+#2) gene.lower() + '_foldx.csv'
+#3) gene.lower() + '_dssp.csv'
+#4) gene.lower() + '_kd.csv'
+#5) gene.lower() + '_rd.csv'
+#6) 'ns' + gene.lower() + '_snp_info.csv'
+#7) gene.lower() + '_af_or.csv'
+#8) gene.lower() + '_af_or_kinship.csv'
+
+# combining order
+#Merge1 = 1 + 2
+
+#Merge2 = 3 + 4
+#Merge3 = Merge2 + 5
+
+#Merge4 = Merge1 + Merge3
+
+#Merge5 = 6 + 7
+#Merge6 = Merge5 + 8
+
+#Merge7 = Merge4 + Merge6
+
+# Output: single csv of all 8 dfs combined
+# useful link
+# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
+#=======================================================================
+#%% load packages
+import sys, os
+import pandas as pd
+from pandas import DataFrame
+import numpy as np
+#from varname import nameof
+import argparse
+#=======================================================================
+#%% specify input and curr dir
+homedir = os.path.expanduser('~')
+
+# set working dir
+os.getcwd()
+os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+os.getcwd()
+
+# FIXME: local imports
+#from combining import combine_dfs_with_checks
+from combining_FIXME import detect_common_cols
+#=======================================================================
+#%% command line args
+#arg_parser = argparse.ArgumentParser()
+#arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-g', '--gene', help = 'gene name', default = 'pncA') # case sensitive
+
+#arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
+#arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + input')
+#arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + output')
+
+#arg_parser.add_argument('--debug', action = 'store_true', help = 'Debug Mode')
+
+#args = arg_parser.parse_args()
+#=======================================================================
+#%% variable assignment: input and output
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = gene + '_p.'
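+
+# Illustrative sketch (not part of the pipeline): detect_common_cols, imported
+# above, simply intersects two frames' column labels. With the hypothetical
+# frames below it would return ['mutationinformation']:
+#   df_a = pd.DataFrame(columns = ['mutationinformation', 'duet_scaled'])
+#   df_b = pd.DataFrame(columns = ['mutationinformation', 'ddg'])
+#   detect_common_cols(df_a, df_b) # -> ['mutationinformation']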
+
+#drug = args.drug
+#gene = args.gene
+#datadir = args.datadir
+#indir = args.input_dir
+#outdir = args.output_dir
+
+# initialise to None so the defaults below kick in while argparse is disabled
+datadir = None
+indir = None
+outdir = None
+#%%=======================================================================
+#==============
+# directories
+#==============
+if not datadir:
+    datadir = homedir + '/' + 'git/Data'
+
+if not indir:
+    indir = datadir + '/' + drug + '/input'
+
+if not outdir:
+    outdir = datadir + '/' + drug + '/output'
+
+#=======
+# input
+#=======
+in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
+in_filename_foldx = gene.lower() + '_foldx.csv'
+in_filename_dssp = gene.lower() + '_dssp.csv'
+in_filename_kd = gene.lower() + '_kd.csv'
+in_filename_rd = gene.lower() + '_rd.csv'
+in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
+in_filename_afor = gene.lower() + '_af_or.csv'
+in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'
+
+infile_mcsm = outdir + '/' + in_filename_mcsm
+infile_foldx = outdir + '/' + in_filename_foldx
+infile_dssp = outdir + '/' + in_filename_dssp
+infile_kd = outdir + '/' + in_filename_kd
+infile_rd = outdir + '/' + in_filename_rd
+infile_snpinfo = indir + '/' + in_filename_snpinfo
+infile_afor = outdir + '/' + in_filename_afor
+infile_afor_kin = outdir + '/' + in_filename_afor_kin
+
+print('\nInput path:', indir
+      , '\nOutput path:', outdir
+      , '\nInput filename mcsm:', infile_mcsm
+      , '\nInput filename foldx:', infile_foldx
+      , '\nInput filename dssp:', infile_dssp
+      , '\nInput filename kd:', infile_kd
+      , '\nInput filename rd:', infile_rd
+      , '\nInput filename snp info:', infile_snpinfo
+      , '\nInput filename af or:', infile_afor
+      , '\nInput filename afor kinship:', infile_afor_kin
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+out_filename_comb = gene.lower() + '_all_params.csv'
+outfile_comb = outdir + '/' + out_filename_comb
+print('Output filename:', outfile_comb
+      , '\n===================================================================')
+
+o_join = 'outer'
+l_join = 'left'
+r_join = 'right'
+i_join = 'inner'
+
+# end of variable assignment for input and output files
+#%%============================================================================
+print('==================================='
+      , '\nFirst merge: mcsm + foldx'
+      , '\n===================================')
+
+mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
+#mcsm_df.columns = mcsm_df.columns.str.lower()
+foldx_df = pd.read_csv(infile_foldx, sep = ',')
+
+#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
+merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df)
+mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = o_join)
+ncols_m1 = len(mcsm_foldx_dfs.columns)
+
+print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
+      , '\n===================================================================')
+#%%============================================================================
+print('==================================='
+      , '\nSecond merge: dssp + kd'
+      , '\n===================================')
+
+dssp_df = pd.read_csv(infile_dssp, sep = ',')
+kd_df = pd.read_csv(infile_kd, sep = ',')
+rd_df = pd.read_csv(infile_rd, sep = ',')
+
+#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
+merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
+dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = o_join)
+
+print('\n\nResult of second merge:', dssp_kd_dfs.shape
+      , '\n===================================================================')
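+
+# Hedged sanity check (a sketch, not in the original pipeline): merging on ALL
+# shared columns should yield ncols(df1) + ncols(df2) - ncols(shared), the
+# same arithmetic combine_dfs_with_checks uses for expected_cols
+ncols_m2 = len(dssp_kd_dfs.columns)
+assert ncols_m2 == len(dssp_df.columns) + len(kd_df.columns) - len(merging_cols_m2)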
+#%%============================================================================
+print('==================================='
+      , '\nThird merge: second merge + rd_df'
+      , '\ndssp_kd_dfs + rd_df'
+      , '\n===================================')
+#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
+merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df)
+dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = o_join)
+
+ncols_m3 = len(dssp_kd_rd_dfs.columns)
+
+print('\n\nResult of third merge:', dssp_kd_rd_dfs.shape
+      , '\n===================================================================')
+#%%============================================================================
+print('======================================='
+      , '\nFourth merge: first merge + third merge'
+      , '\nmcsm_foldx_dfs + dssp_kd_rd_dfs'
+      , '\n=======================================')
+#combined_df = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join)
+merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
+combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = i_join)
+
+combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
+
+if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
+    print('PASS: successfully combined 5 dfs'
+          , '\nNo. of rows combined_df:', len(combined_df)
+          , '\nNo. of cols combined_df:', len(combined_df.columns))
+else:
+    sys.exit('FAIL: check individual df merges')
+
+print('\nResult of fourth merge:', combined_df.shape
+      , '\n===================================================================')
+#%%============================================================================
+
+# OR merges: TEDIOUSSSS!!!!
+
+#%%============================================================================
+print('==================================='
+      , '\nFifth merge: afor_df + snpinfo_df'
+      , '\n===================================')
+
+# OR combining
+afor_df = pd.read_csv(infile_afor, sep = ',')
+#afor_df.columns = afor_df.columns.str.lower()
+
+snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
+#snpinfo_df_all.columns = snpinfo_df_all.columns.str.lower()
+
+#afor_snpinfo_dfs = combine_dfs_with_checks(afor_df, snpinfo_df_all, my_join = i_join)
+merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df_all)
+afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df_all, on = merging_cols_m5, how = l_join)
+
+# finding mutations lacking meta data
+foo = afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
+foo1 = afor_df[afor_df['mutation'].isin(snpinfo_df_all['mutation'])]
+
+bar = snpinfo_df_all[~snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
+bar1 = snpinfo_df_all[snpinfo_df_all['mutation'].isin(afor_df['mutation'])]
+
+# checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# afor_df
+if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
+    print('No duplicate mutations detected in afor_df')
+else:
+    print('Dropping duplicate mutations detected in afor_df')
+    afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# finding mutations lacking meta data
+# FIXME: should get fixed with JP's resolved dataset!?
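+# The ~isin() idiom below is an anti-join: it keeps only rows of afor_df whose
+# 'mutation' value never appears in snpinfo_df_all. Hypothetical example: if
+# afor_df['mutation'] held ['pnca_p.a1b', 'pnca_p.c2d'] and
+# snpinfo_df_all['mutation'] held ['pnca_p.a1b'], the count below would be 1.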
+print('There are', len(afor_df[~afor_df['mutation'].isin(snpinfo_df_all['mutation'])])
+      , 'mutations with AF and OR calculated that have no additional info...STRANGE'
+      , 'Reported to Jody on 14 July 2020 on Skype!')
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+# snpinfo_df_all
+ndups = 0
+if not snpinfo_df_all['mutation'].shape[0] == snpinfo_df_all['mutation'].nunique():
+    ndups = snpinfo_df_all['mutation'].duplicated().sum()
+    print(ndups, 'duplicated muts detected in snpinfo_df_all.'
+          , '\nHowever these may have different nucleotide changes. Checking further...')
+    #expected_nrows = afor_df.shape[0] + ndups
+cols_to_check = ['mutation', 'mutationinformation', 'ref_allele', 'alt_allele']
+
+if snpinfo_df_all.duplicated(subset = cols_to_check).sum() == 0:
+    print('No *REAL* duplicate muts detected in snpinfo_df_all'
+          , '\nDim of df:', snpinfo_df_all.shape)
+    snpinfo_df_all = snpinfo_df_all.copy()
+else:
+    print(snpinfo_df_all.duplicated(subset = cols_to_check).sum()
+          , 'actual duplicate mutations detected in snpinfo_df_all')
+    dup_muts = snpinfo_df_all[['mutation', 'mutationinformation']][snpinfo_df_all.duplicated(subset = cols_to_check)]
+    print(len(dup_muts), 'duplicated mutations detected'
+          , '\nDropping duplicated mutations before merging')
+    snpinfo_df_all = snpinfo_df_all.drop_duplicates(subset = cols_to_check, keep = 'first')
+    print('Dim of df after removing duplicates:', snpinfo_df_all.shape)
+
+if len(afor_snpinfo_dfs) == afor_df.shape[0] + ndups:
+    print('PASS: successfully combined with left join')
+else:
+    print('FAIL: unsuccessful merge'
+          , '\nDim of df1:', afor_df.shape
+          , '\nDim of df2:', snpinfo_df_all.shape)
+    sys.exit()
+
+print('\nResult of fifth merge:', afor_snpinfo_dfs.shape
+      , '\n===================================================================')
+#%%============================================================================
+print('==================================='
+      , '\nSixth merge: fifth merge + afor_kin_df'
+      , '\nafor_snpinfo_dfs + afor_kin_df'
+      , '\n===================================')
+afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
+afor_kin_df.columns = afor_kin_df.columns.str.lower()
+
+#ors_df = combine_dfs_with_checks(afor_snpinfo_dfs, afor_kin_df, my_join = o_join)
+merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)
+print('Dim of df1:', afor_snpinfo_dfs.shape
+      , '\nDim of df2:', afor_kin_df.shape
+      , '\nNo. of merging_cols:', len(merging_cols_m6))
+
+ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = o_join)
+
+# Dropping unnecessary columns
+cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol']
+print('Dropping', len(cols_to_drop), 'columns:\n'
+      , cols_to_drop)
+ors_df.drop(cols_to_drop, axis = 1, inplace = True)
+
+print('Reordering', ors_df.shape[1], 'columns'
+      , '\n===============================================')
+cols = ors_df.columns
+
+column_order = ['mutation'
+                , 'mutationinformation'
+                , 'wild_type'
+                , 'position'
+                , 'mutant_type'
+                , 'chr_num_allele'
+                , 'ref_allele'
+                , 'alt_allele'
+                , 'mut_info'
+                , 'mut_type'
+                , 'gene_id'
+                , 'gene_number'
+                , 'mut_region'
+                #, 'reference_allele'
+                #, 'alternate_allele'
+                , 'chromosome_number'
+                , 'af'
+                , 'af_kin'
+                , 'est_chisq'
+                , 'or_mychisq'
+                , 'or_fisher'
+                , 'or_logistic'
+                , 'or_kin'
+                , 'pval_chisq'
+                , 'pval_fisher'
+                , 'pval_logistic'
+                , 'pwald_kin'
+                , 'ci_low_fisher'
+                , 'ci_hi_fisher'
+                , 'ci_low_logistic'
+                , 'ci_hi_logistic'
+                , 'beta_logistic'
+                , 'beta_kin'
+                , 'se_logistic'
+                , 'se_kin'
+                , 'zval_logistic'
+                , 'logl_h1_kin'
+                , 'l_remle_kin'
+                , 'wt_3let'
+                , 'mt_3let'
+                #, 'symbol'
+                , 'n_miss']
+
+# check the generated column order covers every column in ors_df
+if len(column_order) == ors_df.shape[1] and set(column_order) == set(ors_df.columns):
+    print('PASS: column order generated for all', len(column_order), 'columns in df'
+          , '\nApplying column order to df...')
+    ors_df_ordered = ors_df[column_order]
+else:
+    print('FAIL: mismatch in no. of cols to reorder'
+          , '\nNo. of cols in df to reorder:', ors_df.shape[1]
+          , '\nNo. of cols order generated for:', len(column_order))
+    sys.exit()
+
+print('\nResult of sixth merge:', ors_df_ordered.shape
+      , '\n===================================================================')
+#%%============================================================================
+print('==================================='
+      , '\nSeventh merge: fourth + sixth merge'
+      , '\ncombined_df + ors_df_ordered'
+      , '\n===================================')
+
+#combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join)
+merging_cols_m7 = detect_common_cols(combined_df, ors_df_ordered)
+print('Dim of df1:', combined_df.shape
+      , '\nDim of df2:', ors_df_ordered.shape
+      , '\nNo. of merging_cols:', len(merging_cols_m7))
+
+print('Checking mutations in the two dfs:'
+      , '\nmuts in df1 but NOT in df2:'
+      , (~combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation'])).sum()
+      , '\nmuts in df2 but NOT in df1:'
+      , (~ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation'])).sum())
+
+#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']))
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m7, how = o_join)
+#combined_df_all.shape
+
+# FIXME: DIM
+# only with left join!
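+# Worked example of the arithmetic below (illustrative numbers only): if
+# combined_df were (424, 25) and ors_df_ordered (888, 40) with one merging
+# column, the merged frame should have 25 + 40 - 1 = 64 columns, and a clean
+# merge keeps all 424 unique mutationinformation values from combined_df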
+outdf_expected_rows = len(combined_df)
+outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m7)
+
+#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows:
+if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows:
+    print('PASS: df dimensions match'
+          , '\nDim of combined_df_all with join type:', o_join
+          , '\n', combined_df_all.shape
+          , '\n===============================================================')
+else:
+    print('FAIL: df dimension mismatch'
+          , '\nCannot generate expected dim. See details of merge performed'
+          , '\ndf1 dim:', combined_df.shape
+          , '\ndf2 dim:', ors_df_ordered.shape
+          , '\nGot:', combined_df_all.shape
+          , '\nmuts in df1 but NOT in df2:'
+          , (~combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation'])).sum()
+          , '\nmuts in df2 but NOT in df1:'
+          , (~ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation'])).sum())
+    sys.exit()
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+# nan in mutation col
+# FIXME: should get fixed with JP's resolved dataset!?
+print('No. of NAs in mutation col:', combined_df_all['mutation'].isna().sum())
+baz = combined_df_all[combined_df_all['mutation'].isna()]
+#%%============================================================================
+output_cols = combined_df_all.columns
+print('Output cols:', output_cols)
+
+#%%============================================================================
+# write csv
+print('Writing file: combined output of all params needed for plotting and ML')
+combined_df_all.to_csv(outfile_comb, index = False)
+print('\nFinished writing file:'
+      , '\nNo. of rows:', combined_df_all.shape[0]
+      , '\nNo. of cols:', combined_df_all.shape[1])
+
+#=======================================================================
+#%% in case you FIX the function: combine_dfs_with_checks
+#def main():
+
+#    print('Reading input files:')
+    #mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
+    #mcsm_df.columns = mcsm_df.columns.str.lower()
+
+    #foldx_df = pd.read_csv(infile_foldx, sep = ',')
+
+    #dssp_df = pd.read_csv(infile_dssp, sep = ',')
+    #dssp_df.columns = dssp_df.columns.str.lower()
+
+    #kd_df = pd.read_csv(infile_kd, sep = ',')
+    #kd_df.columns = kd_df.columns.str.lower()
+
+    #rd_df = pd.read_csv(infile_rd, sep = ',')
+
+#if __name__ == '__main__':
+#    main()
+#=======================================================================
+#%% end of script
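+
+# Hedged usage sketch: once combine_dfs_with_checks (combining_FIXME.py) is
+# fixed, each merge above could collapse into a single checked call, e.g.:
+#from combining_FIXME import combine_dfs_with_checks
+#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = 'outer')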