diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index ba40b46..50ff6ee 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -54,6 +54,7 @@ os.getcwd() # FIXME: local imports #from combining import combine_dfs_with_checks from combining_FIXME import detect_common_cols +from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE! #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() @@ -155,6 +156,8 @@ ncols_m1 = len(mcsm_foldx_dfs.columns) print('\n\nResult of first merge:', mcsm_foldx_dfs.shape , '\n===================================================================') +mcsm_foldx_dfs[merging_cols_m1].apply(len) +mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs) #%%============================================================================ print('===================================' , '\nSecond merge: dssp + kd' @@ -183,6 +186,8 @@ ncols_m3 = len(dssp_kd_rd_dfs.columns) print('\n\nResult of Third merge:', dssp_kd_rd_dfs.shape , '\n===================================================================') +dssp_kd_rd_dfs[merging_cols_m3].apply(len) +dssp_kd_rd_dfs[merging_cols_m3].apply(len) == len(dssp_kd_rd_dfs) #%%============================================================================ print('=======================================' , '\nFourth merge: First merge + Third merge' @@ -203,12 +208,14 @@ else: print('\nResult of Fourth merge:', combined_df.shape , '\n===================================================================') +combined_df[merging_cols_m4].apply(len) +combined_df[merging_cols_m4].apply(len) == len(combined_df) #%%============================================================================ # OR merges: TEDIOUSSSS!!!! -#%%RRRR +#%% print('===================================' , '\nFifth merge: afor_df + afor_kin_df' , '\n===================================') @@ -220,8 +227,6 @@ afor_df = pd.read_csv(infile_afor, sep = ',') afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',') afor_kin_df.columns = afor_kin_df.columns.str.lower() - - merging_cols_m5 = detect_common_cols(afor_df, afor_kin_df) print('Dim of afor_df:', afor_df.shape @@ -244,7 +249,7 @@ common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])]) extra_muts_myor = afor_kin_df.shape[0] - common_muts print('==========================================' - , '\nmy or calcs', extra_muts_myor, 'extra mutation\n' + , '\nmy or calcs has', extra_muts_myor, 'extra mutations' , '\n==========================================') print('Expected cals for merging with outer_join...') @@ -261,12 +266,13 @@ else: , '\nCheck expected rows and cols calculation and join type') print('Dim of merged ors_df:', ors_df.shape) + +ors_df[merging_cols_m5].apply(len) +ors_df[merging_cols_m5].apply(len) == len(ors_df) #%%============================================================================ # formatting ors_df - ors_df.columns - # Dropping unncessary columns: already removed in ealier preprocessing #cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] cols_to_drop = ['n_miss'] @@ -324,7 +330,7 @@ column_order = ['mutation' #, 'n_miss' ] -if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all()): +if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): print('PASS: Column order generated for all:', len(column_order), 'columns' , '\nColumn names match, safe to reorder columns' , '\nApplying column order to df...' ) @@ -357,10 +363,35 @@ print('Checking mutations in the two dfs:' #print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) ) -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join) combined_df_all.shape +# populate mut_info_f1 +combined_df_all['mut_info_f1'].isna().sum() +combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type'] +combined_df_all['mut_info_f1'].isna().sum() + +# populate mut_type +combined_df_all['mut_type'].isna().sum() +#mut_type_word = combined_df_all['mut_type'].value_counts() +mut_type_word = 'missense' # FIXME, should be derived +combined_df_all['mut_type'].fillna(mut_type_word, inplace = True) +combined_df_all['mut_type'].isna().sum() + +# populate gene_id +combined_df_all['gene_id'].isna().sum() +#gene_id_word = combined_df_all['gene_id'].value_counts() +gene_id_word = 'Rv2043c' # FIXME, should be derived +combined_df_all['gene_id'].fillna(gene_id_word, inplace = True) +combined_df_all['gene_id'].isna().sum() + +# populate gene_name +combined_df_all['gene_name'].isna().sum() +combined_df_all['gene_name'].value_counts() +combined_df_all['gene_name'].fillna(gene, inplace = True) +combined_df_all['gene_name'].isna().sum() + + # FIXME: DIM # only with left join! outdf_expected_rows = len(combined_df) @@ -383,11 +414,52 @@ else: , '\nmuts in df2 but NOT in df1:' , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum()) sys.exit() -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# nan in mutation col -# FIXME: should get fixmed with JP's resolved dataset!? -combined_df_all['mutation'].isna().sum() -baz = combined_df_all[combined_df_all['mutation'].isna()] + + +#%% IMPORTANT: check if mutation related info is all populated after this merge +# FIXME: should get fixed with JP's resolved dataset!? +check_nan = combined_df_all.isna().sum(axis = 0) +# relevant mut cols +check_mut_cols = merging_cols_m5 + merging_cols_m6 + +count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) + +if (count_na_mut_cols['na_count'].sum() > 0).any(): + # FIXME: static override, generate 'mutation' from variable + na_muts_n = combined_df_all['mutation'].isna().sum() + baz = combined_df_all[combined_df_all['mutation'].isna()] + baz = baz[check_mut_cols] + print(na_muts_n, 'mutations have missing \'mutation\' info.' + , '\nFetching these from reference dict...') + +lookup_dict = dict() +for k, v in oneletter_aa_dict.items(): + lookup_dict[k] = v['three_letter_code_lower'] + print(lookup_dict) + wt_3let = combined_df_all['wild_type'].map(lookup_dict).str.capitalize() + #print(wt_3let) + pos = combined_df_all['position'].astype(str) + #print(pos) + mt_3let = combined_df_all['mutant_type'].map(lookup_dict).str.capitalize() + #print(mt_3let) + baz['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let + print(combined_df_all['mutation']) + +# populate mut_info_f2 +combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) + +#%% merge +#merging_cols_m7 = detect_common_cols(combined_df_all, baz) + +baz2 = baz[['mutationinformation', 'mut_info_f2']] +baz2 = baz2.drop_duplicates() +merging_cols_m7 = detect_common_cols(combined_df_all, baz2) + +combined_df_all2 = pd.merge(combined_df_all, baz2 + #, on = merging_cols_m7 + , on = 'mutationinformation' + , how = o_join) + #%%============================================================================ output_cols = combined_df_all.columns print('Output cols:', output_cols) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 2f3531a..3f72b1b 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -45,8 +45,6 @@ Created on Tue Aug 6 12:56:03 2019 #5. chain #6. wild_pos #7. wild_chain_pos - - #======================================================================= #%% load libraries import os, sys diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index b1a00ce..17451fb 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -104,7 +104,7 @@ or_df.columns #%% snp_info file: master and gene specific ones # gene info -info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 10 +info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #447, 11 #info_df2 = pd.read_csv(gene_info, sep = ',', header = 0) #447 10 mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100 print('*****RESULT*****' @@ -212,7 +212,7 @@ else: #PENDING Jody's reply # !!!!!!!! -# drop nan from dfm2_mis as these are not useful +# drop nan from dfm2_mis as these are not useful and JP confirmed the same print('Dropping NAs before further processing...') dfm2_mis = dfm2[dfm2['mut_type'].notnull()] # !!!!!!!!