From e4608342a4324cade4eb6ef78dd9fc25141f7e68 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 8 Sep 2020 17:13:02 +0100 Subject: [PATCH] various changes --- scripts/combining_dfs.py | 224 +++++++++++++++------- scripts/data_extraction.py | 41 ++-- scripts/plotting/combining_dfs_plotting.R | 29 ++- 3 files changed, 199 insertions(+), 95 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 622e8e5..be62177 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -55,6 +55,7 @@ os.getcwd() #from combining import combine_dfs_with_checks from combining_FIXME import detect_common_cols from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE! +from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE! #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() @@ -79,6 +80,21 @@ gene = args.gene datadir = args.datadir indir = args.input_dir outdir = args.output_dir + +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) #%%======================================================================= #============== # directories @@ -214,7 +230,9 @@ combined_df[merging_cols_m4].apply(len) == len(combined_df) # OR merges: TEDIOUSSSS!!!! - +del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd) +del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4) +del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd) #%% print('===================================' , '\nFifth merge: afor_df + afor_kin_df' @@ -235,7 +253,7 @@ print('Dim of afor_df:', afor_df.shape # finding if ALL afor_kin_df muts are present in afor_df # i.e all kinship muts should be PRESENT in mycalcs_present if len(afor_kin_df[afor_kin_df['mutation'].isin(afor_df['mutation'])]) == afor_kin_df.shape[0]: - print('PASS: ALL or_kinship muts are present in my or list') + print('PASS: ALL', len(afor_kin_df), 'or_kinship muts are present in my or list') else: nf_muts = len(afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])]) nf_muts_df = afor_kin_df[~afor_kin_df['mutation'].isin(afor_df['mutation'])] @@ -246,10 +264,10 @@ else: # Now checking how many afor_df muts are NOT present in afor_kin_df common_muts = len(afor_df[afor_df['mutation'].isin(afor_kin_df['mutation'])]) -extra_muts_myor = afor_kin_df.shape[0] - common_muts +extra_muts_myor = afor_kin_df.shape[0] - common_muts print('==========================================' - , '\nmy or calcs has', extra_muts_myor, 'extra mutations' + , '\nmy or calcs has', common_muts, 'present in af_or_kin_df' , '\n==========================================') print('Expected cals for merging with outer_join...') @@ -257,10 +275,15 @@ print('Expected cals for merging with outer_join...') expected_rows = afor_df.shape[0] + extra_muts_myor expected_cols = afor_df.shape[1] + afor_kin_df.shape[1] - len(merging_cols_m5) + +afor_df['mutation'] +afor_kin_df['mutation'] + ors_df = pd.merge(afor_df, afor_kin_df, on = merging_cols_m5, how = o_join) if ors_df.shape[0] == expected_rows and ors_df.shape[1] == expected_cols: - print('PASS: OR dfs successfully combined! PHEWWWW!') + print('PASS but with duplicate muts: OR dfs successfully combined! PHEWWWW!' + , '\nDuplicate muts present but with different \'ref\' and \'alt\' alleles') else: print('FAIL: could not combine OR dfs' , '\nCheck expected rows and cols calculation and join type') @@ -269,12 +292,12 @@ print('Dim of merged ors_df:', ors_df.shape) ors_df[merging_cols_m5].apply(len) ors_df[merging_cols_m5].apply(len) == len(ors_df) + #%%============================================================================ # formatting ors_df ors_df.columns # Dropping unncessary columns: already removed in ealier preprocessing -#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] cols_to_drop = ['n_miss'] print('Dropping', len(cols_to_drop), 'columns:\n' , cols_to_drop) @@ -327,7 +350,6 @@ column_order = ['mutation' #, 'wt_3let' # old #, 'mt_3let' # old #, 'symbol' - #, 'n_miss' ] if ( (len(column_order) == ors_df.shape[1]) and (DataFrame(column_order).isin(ors_df.columns).all().all())): @@ -343,6 +365,61 @@ else: print('\nResult of Sixth merge:', ors_df_ordered.shape , '\n===================================================================') +#%% +ors_df_ordered.shape +check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] + +# populating 'nan' info +lookup_dict = dict() +for k, v in low_3letter_dict.items(): + lookup_dict[k] = v['one_letter_code'] + #print(lookup_dict) + + wt = ors_df_ordered['mutation'].str.extract(wt_regex).squeeze() + #print(wt) + ors_df_ordered['wild_type'] = wt.map(lookup_dict) + + ors_df_ordered['position'] = ors_df_ordered['mutation'].str.extract(pos_regex) + + mt = ors_df_ordered['mutation'].str.extract(mut_regex).squeeze() + ors_df_ordered['mutant_type'] = mt.map(lookup_dict) + +ors_df_ordered['mutationinformation'] = ors_df_ordered['wild_type'] + ors_df_ordered.position.map(str) + ors_df_ordered['mutant_type'] +check = ors_df_ordered[['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type']] + +# populate mut_info_f1 +ors_df_ordered['mut_info_f1'].isna().sum() +ors_df_ordered['mut_info_f1'] = ors_df_ordered['position'].astype(str) + ors_df_ordered['wild_type'] + '>' + ors_df_ordered['position'].astype(str) + ors_df_ordered['mutant_type'] +ors_df_ordered['mut_info_f1'].isna().sum() + +# populate mut_info_f2 +ors_df_ordered['mut_info_f2'] = ors_df_ordered['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) + +# populate mut_type +ors_df_ordered['mut_type'].isna().sum() +#mut_type_word = ors_df_ordered['mut_type'].value_counts() +mut_type_word = 'missense' # FIXME, should be derived +ors_df_ordered['mut_type'].fillna(mut_type_word, inplace = True) +ors_df_ordered['mut_type'].isna().sum() + +# populate gene_id +ors_df_ordered['gene_id'].isna().sum() +#gene_id_word = ors_df_ordered['gene_id'].value_counts() +gene_id_word = 'Rv2043c' # FIXME, should be derived +ors_df_ordered['gene_id'].fillna(gene_id_word, inplace = True) +ors_df_ordered['gene_id'].isna().sum() + +# populate gene_name +ors_df_ordered['gene_name'].isna().sum() +ors_df_ordered['gene_name'].value_counts() +ors_df_ordered['gene_name'].fillna(gene, inplace = True) +ors_df_ordered['gene_name'].isna().sum() + +# check numbers +ors_df_ordered['or_kin'].isna().sum() +# should be 0 +ors_df_ordered['or_mychisq'].isna().sum() + #%%============================================================================ print('===================================' , '\nSixth merge: Fourth + Fifth merge' @@ -350,79 +427,94 @@ print('===================================' , '\n===================================') #combined_df_all = combine_dfs_with_checks(combined_df, ors_df_ordered, my_join = i_join) -merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) +merging_cols_m6 = detect_common_cols(combined_df, ors_df_ordered) + +# dtype problems +if len(merging_cols_m6) > 1 and 'position'in merging_cols_m6: + print('Removing \'position\' from merging_cols_m6 to make dtypes consistent' + , '\norig length of merging_cols_m6:', len(merging_cols_m6)) + merging_cols_m6.remove('position') + print('\nlength after removing:', len(merging_cols_m6)) + print('Dim of df1:', combined_df.shape , '\nDim of df2:', ors_df_ordered.shape , '\nNo. of merging_cols:', len(merging_cols_m6)) print('Checking mutations in the two dfs:' - , '\nmuts in df1 but NOT in df2:' + , '\nmuts in df1 present in df2:' , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() - , '\nmuts in df2 but NOT in df1:' + , '\nmuts in df2 present in df1:' , ors_df_ordered['mutationinformation'].isin(combined_df['mutationinformation']).sum()) -#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df_ordered['mutationinformation']) ) - -combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m6, how = l_join) +#---------- +# merge 6 +#---------- +combined_df_all = pd.merge(combined_df, ors_df_ordered, on = merging_cols_m6, how = o_join) combined_df_all.shape -# populate mut_info_f1 -combined_df_all['mut_info_f1'].isna().sum() -combined_df_all['mut_info_f1'] = combined_df_all['position'].astype(str) + combined_df_all['wild_type'] + '>' + combined_df_all['position'].astype(str) + combined_df_all['mutant_type'] -combined_df_all['mut_info_f1'].isna().sum() - -# populate mut_type -combined_df_all['mut_type'].isna().sum() -#mut_type_word = combined_df_all['mut_type'].value_counts() -mut_type_word = 'missense' # FIXME, should be derived -combined_df_all['mut_type'].fillna(mut_type_word, inplace = True) -combined_df_all['mut_type'].isna().sum() - -# populate gene_id -combined_df_all['gene_id'].isna().sum() -#gene_id_word = combined_df_all['gene_id'].value_counts() -gene_id_word = 'Rv2043c' # FIXME, should be derived -combined_df_all['gene_id'].fillna(gene_id_word, inplace = True) -combined_df_all['gene_id'].isna().sum() - -# populate gene_name -combined_df_all['gene_name'].isna().sum() -combined_df_all['gene_name'].value_counts() -combined_df_all['gene_name'].fillna(gene, inplace = True) -combined_df_all['gene_name'].isna().sum() - - -# FIXME: DIM -# only with left join! -outdf_expected_rows = len(combined_df) +# sanity check for merge 6 +outdf_expected_rows = len(combined_df) + extra_muts_myor +unique_muts = len(combined_df) outdf_expected_cols = len(combined_df.columns) + len(ors_df_ordered.columns) - len(merging_cols_m6) -#if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all.shape[0] == outdf_expected_rows: -if combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == outdf_expected_rows: +if combined_df_all.shape[0] == outdf_expected_rows and combined_df_all.shape[1] == outdf_expected_cols and combined_df_all['mutationinformation'].nunique() == unique_muts: print('PASS: Df dimension match' - , '\nDim of combined_df_all with join type:', l_join + , '\ncombined_df_all with join type:', o_join , '\n', combined_df_all.shape , '\n===============================================================') else: print('FAIL: Df dimension mismatch' , 'Cannot generate expected dim. See details of merge performed' , '\ndf1 dim:', combined_df.shape - , '\ndf2 dim:', ors_df.shape + , '\ndf2 dim:', ors_df_ordered.shape , '\nGot:', combined_df_all.shape , '\nmuts in df1 but NOT in df2:' - , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum() + , combined_df['mutationinformation'].isin(ors_df_ordered['mutationinformation']).sum() , '\nmuts in df2 but NOT in df1:' , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum()) sys.exit() - + +# drop extra cols +all_cols = combined_df_all.columns + +#pos_cols_check = combined_df_all[['position_x','position_y']] +c = combined_df_all[['position_x','position_y']].isna().sum() +pos_col_to_drop = c.index[c>0].to_list() +cols_to_drop = pos_col_to_drop + ['wild_type_kd'] + +print('Dropping', len(cols_to_drop), 'columns:\n', cols_to_drop) +combined_df_all.drop(cols_to_drop, axis = 1, inplace = True) + +# rename position_x to position +pos_col_to_rename = c.index[c==0].to_list() +combined_df_all.shape +combined_df_all.rename(columns = { pos_col_to_rename[0]: 'position'}, inplace = True) +combined_df_all.shape + +all_cols = combined_df_all.columns + +#%% reorder cols to for convenience +first_cols = ['mutationinformation','mutation', 'wild_type', 'position', 'mutant_type'] +last_cols = [col for col in combined_df_all.columns if col not in first_cols] + +df = combined_df_all[first_cols+last_cols] #%% IMPORTANT: check if mutation related info is all populated after this merge -# FIXME: should get fixed with JP's resolved dataset!? -check_nan = combined_df_all.isna().sum(axis = 0) +# select string colnames to ensure no NA exist there +string_cols = combined_df_all.columns[combined_df_all.applymap(lambda x: isinstance(x, str)).all(0)] + +if (combined_df_all[string_cols].isna().sum(axis = 0)== 0).all(): + print('PASS: All string cols are populated with no NAs') +else: + print('FAIL: NAs detected in string cols') + print(combined_df_all[string_cols].isna().sum(axis = 0)) + sys.exit() + # relevant mut cols check_mut_cols = merging_cols_m5 + merging_cols_m6 count_na_mut_cols = combined_df_all[check_mut_cols].isna().sum().reset_index().rename(columns = {'index': 'col_name', 0: 'na_count'}) +print(check_mut_cols) if (count_na_mut_cols['na_count'].sum() > 0).any(): # FIXME: static override, generate 'mutation' from variable @@ -434,31 +526,29 @@ else: print('No missing \'mutation\' has been detected!') -lookup_dict = dict() -for k, v in oneletter_aa_dict.items(): - lookup_dict[k] = v['three_letter_code_lower'] - print(lookup_dict) - wt_3let = combined_df_all['wild_type'].map(lookup_dict) - #print(wt_3let) - pos = combined_df_all['position'].astype(str) - #print(pos) - mt_3let = combined_df_all['mutant_type'].map(lookup_dict) - #print(mt_3let) - # override the 'mutation' column - combined_df_all['mutation'] = 'pnca_p.' + wt_3let + pos + mt_3let - print(combined_df_all['mutation']) -# populate mut_info_f2 -combined_df_all['mut_info_f2'] = combined_df_all['mutation'].str.replace(gene_match.lower(), 'p.', regex = True) +#cols_to_drop = ['reference_allele', 'alternate_allele', 'symbol' ] +col_to_drop = ['wild_type_kd'] +print('Dropping', len(col_to_drop), 'columns:\n' + , col_to_drop) +combined_df_all.drop(col_to_drop, axis = 1, inplace = True) + + #%% check #cols_check = check_mut_cols + ['mut_info_f1', 'mut_info_f2'] #foo = combined_df_all[cols_check] - +foo = combined_df_all.drop_duplicates('mutationinformation') +foo2 = combined_df_all.drop_duplicates('mutation') +poo = combined_df_all[combined_df_all['mutation'].isna()] #%%============================================================================ output_cols = combined_df_all.columns #print('Output cols:', output_cols) - +#%% drop duplicates +if combined_df_all.shape[0] != outdf_expected_rows: + print('INFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele') +else: + print('combined_df_all has no duplicate muts present') #%%============================================================================ # write csv print('Writing file: combined output of all params needed for plotting and ML') diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 3f72b1b..f16e136 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -81,16 +81,16 @@ gene = args.gene gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) -nssnp_match = gene_match +'[A-Z]{3}[0-9]+[A-Z]{3}' +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' print('nsSNP for gene', gene, ':', nssnp_match) -wt_regex = gene_match.lower()+'(\w{3})' +wt_regex = gene_match.lower()+'([A-Za-z]{3})' print('wt regex:', wt_regex) -mut_regex = r'\d+(\w{3})$' +mut_regex = r'[0-9]+(\w{3})$' print('mt regex:', mut_regex) -pos_regex = r'(\d+)' +pos_regex = r'([0-9]+)' print('position regex:', pos_regex) # building cols to extract @@ -154,30 +154,29 @@ if in_filename_master == 'original_tanushree_data_v2.csv': else: core_cols = ['id' , 'sample' - , 'patient_id' - , 'strain' + #, 'patient_id' + #, 'strain' , 'lineage' , 'sublineage' - , 'country' + #, 'country' , 'country_code' , 'geographic_source' #, 'region' - , 'location' - , 'host_body_site' - , 'environment_material' - , 'host_status' - , 'host_sex' - , 'submitted_host_sex' - , 'hiv_status' - , 'HIV_status' - , 'tissue_type' - , 'isolation_source' + #, 'location' + #, 'host_body_site' + #, 'environment_material' + #, 'host_status' + #, 'host_sex' + #, 'submitted_host_sex' + #, 'hiv_status' + #, 'HIV_status' + #, 'tissue_type' + #, 'isolation_source' , resistance_col] variable_based_cols = [drug , dr_muts_col , other_muts_col] - #, resistance_col] cols_to_extract = core_cols + variable_based_cols print('Extracting', len(cols_to_extract), 'columns from master data') @@ -200,7 +199,7 @@ print('No. of NAs/column:' + '\n', meta_data.isna().sum() #%% Write check file check_file = outdir + '/' + gene.lower() + '_gwas.csv' -meta_data.to_csv(check_file) +meta_data.to_csv(check_file, index = False) print('Writing subsetted gwas data' , '\nFile', check_file , '\nDim:', meta_data.shape) @@ -215,9 +214,9 @@ print('Writing subsetted gwas data' # drug counts: complete samples for OR calcs meta_data[drug].value_counts() print('===========================================================\n' - , 'RESULT: No. of Sus and Res samples:\n', meta_data[drug].value_counts() + , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() , '\n===========================================================\n' - , 'RESULT: Percentage of Sus and Res samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 , '\n===========================================================') #%% diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 7d45d1b..38474d9 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -58,6 +58,20 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u +# quick checks +head(my_df_u[, c("mutation", "mutation2")]) + +cols_to_extract = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin") +foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract] + + +which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)) + + +baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq) +colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or") + + # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") @@ -94,6 +108,7 @@ gene_metadata <- read.csv(infile_gene_metadata , header = T) cat("Dim:", dim(gene_metadata)) + # counting NAs in AF, OR cols: if (identical(sum(is.na(my_df_u$or_mychisq)) , sum(is.na(my_df_u$pval_fisher)) @@ -230,9 +245,9 @@ if (identical(sum(is.na(merged_df3$or_kin)) if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin))) && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){ - cat('PASS: Indices match for mychisq and kin ors missing values') + cat("PASS: Indices match for mychisq and kin ors missing values") } else{ - cat('Index mismatch: mychisq and kin ors missing indices match') + cat("Index mismatch: mychisq and kin ors missing indices match") quit() } @@ -245,7 +260,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" ,"\nfilename: merged_df2_comp") if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ - print('mychisq and kin ors missing indices match. Procedding with omitting NAs') + print("mychisq and kin ors missing indices match. Procedding with omitting NAs") na_count_df2 = sum(is.na(merged_df2$af)) merged_df2_comp = merged_df2[!is.na(merged_df2$af),] # sanity check: no +-1 gymnastics @@ -262,7 +277,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ ,"\nGot no. of rows: ", nrow(merged_df2_comp)) } }else{ - print('Index mismatch for mychisq and kin ors. Aborting NA ommission') + print("Index mismatch for mychisq and kin ors. Aborting NA ommission") } #========================= @@ -272,7 +287,7 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ #========================= if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ - print('mychisq and kin ors missing indices match. Procedding with omitting NAs') + print("mychisq and kin ors missing indices match. Procedding with omitting NAs") na_count_df3 = sum(is.na(merged_df3$af)) #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way @@ -289,7 +304,7 @@ if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ ,"\nGot no. of rows: ", nrow(merged_df3_comp)) } } else{ - print('Index mismatch for mychisq and kin ors. Aborting NA ommission') + print("Index mismatch for mychisq and kin ors. Aborting NA ommission") } # alternate way of deriving merged_df3_comp @@ -347,7 +362,7 @@ merged_df3_comp_lig = merged_df3_comp[merged_df3_comp$ligand_distance<10,] if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){ print("PASS: verified merged_df3_lig") }else{ - cat(paste0('FAIL: nrow mismatch for merged_df3_lig' + cat(paste0("FAIL: nrow mismatch for merged_df3_lig" , "\nExpected:", nrow(my_df_u_lig) , "\nGot:", nrow(merged_df3_lig))) }