From ac0d14e1164fd5224d8023b506c522d5d41c1402 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 26 Apr 2022 17:03:40 +0100 Subject: [PATCH] done but getting an error when running from cmd --- scripts/data_extraction.py | 716 +++++++++++++++++++++++++++---------- 1 file changed, 521 insertions(+), 195 deletions(-) mode change 100644 => 100755 scripts/data_extraction.py diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py old mode 100644 new mode 100755 index b6f6411..b064ec1 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -314,7 +314,7 @@ print('===========================================================\n' #meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] # so downstream code doesn't change meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] -#%% DF: with dr_muts_col +#%% DF: with dr_muts_col, meta_gene_dr meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] @@ -327,7 +327,7 @@ else: , '\nshape of df2:', meta_gene_dr1.shape , '\nCheck again!') ############################################################################## -#%% DF: with other_muts_col +#%% DF: with other_muts_col, other_gene_dr meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] @@ -365,11 +365,11 @@ print('===========================================================\n' , '\n===========================================================') ############################################################################### #%% Create a copy of indices for downstream mergeing -meta_gene_all['index_orig'] = meta_gene_all.index -meta_gene_all['index_orig_copy'] = meta_gene_all.index +#meta_gene_all['index_orig'] = meta_gene_all.index +#meta_gene_all['index_orig_copy'] = meta_gene_all.index -all(meta_gene_all.index.values == meta_gene_all['index_orig'].values) -all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values) +#all(meta_gene_all.index.values == meta_gene_all['index_orig'].values) +#all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values) ############################################################################## #%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc. # Split based on semi colon dr_muts_col @@ -456,13 +456,13 @@ del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) # DF1: dr_muts_col # and remove leading white spaces #========= -col_to_split1 = dr_muts_col +#col_to_split1 = dr_muts_col print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape - , '\ncolumn name to apply tidy_split():' , col_to_split1 + , '\ncolumn name to apply tidy_split():' , dr_muts_col , '\n============================================================') # apply tidy_split() -dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') +dr_WF0 = tidy_split(meta_gene_dr, dr_muts_col, sep = ';') # remove leading white space else these are counted as distinct mutations as well dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() @@ -579,13 +579,13 @@ del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt ) # DF2: other_muts_col # and remove leading white spaces #========= -col_to_split2 = other_muts_col +#col_to_split2 = other_muts_col print ('applying second tidy split() separately on other muts df', meta_gene_other.shape - , '\ncolumn name to apply tidy_split():', col_to_split2 + , '\ncolumn name to apply tidy_split():', other_muts_col , '\n============================================================') # apply tidy_split() -other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') +other_WF1 = tidy_split(meta_gene_other, other_muts_col, sep = ';') # remove the leading white spaces in the column other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() @@ -644,7 +644,7 @@ print('\n===================================================================' , '\nPredicting total no. of rows in the curated df:', expected_rows , '\n===================================================================') -#%%another way: Add value checks for dict so you can know if its correct for LF data count below +#%% another way: Add value checks for dict so you can know if its correct for LF data count below dr_snps_vc_dict = pd.Series(dr_gene_mutsL).value_counts().to_dict() other_snps_vc_dict = pd.Series(other_gene_mutsL).value_counts().to_dict() @@ -683,7 +683,7 @@ SnpFDict = lower_dict(SnpFDict_orig) ############################################################################### # USE Vcounts to get expected curated df # resolve dm om muts funda -#%%#%% Concatenating two dfs: gene_LF0 +#%% Concatenating two dfs: gene_LF0 # equivalent of rbind in R #========== # Concatentating the two dfs: equivalent of rbind in R @@ -922,13 +922,20 @@ ambig_muts_rev_df = pd.DataFrame() changes_val = [] changes_dict = {} -for i in common_muts: - #print(i) +##BROKENNNN!!!! + +common_muts_lower = list((map(lambda x: x.lower(), common_muts))) +common_muts_lower +##BROKENNNN!!!! +#for i in common_muts: +for i in common_muts_lower: + + print(i) temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']] - + temp_df # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() - + max_info_table revised_label = max_info_table[[0]].index[0][1] # max value_count old_label = max_info_table[[1]].index[0][1] # min value_count print('\nAmbiguous mutation labels...' @@ -948,12 +955,12 @@ ambig_muts_rev_df['mutation_info_orig'].value_counts() changes_val changes_total = sum(changes_val) changes_dict -#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts -#=================== +#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts +#================== # ambiguous muts -#======================= -#dr_muts.to_csv('dr_muts.csv', header = True) -#other_muts.to_csv('other_muts.csv', header = True) +#================== +#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True) +#other_muts.XXXX_csvXXX('other_muts.csv', header = True) if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts @@ -972,19 +979,20 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: , '\n=============================================================') del(out_filename_ambig_muts) -#%% FIXME: ambig mut counts -#======================= +#%% OUTFILE 2, write file: ambiguous mut counts +#====================== # ambiguous mut counts -#======================= +#====================== out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts print('\n----------------------------------' - , '\nWriting file: ambiguous muts' + , '\nWriting file: ambiguous mut counts' , '\n----------------------------------' , '\nFilename:', outfile_ambig_mut_counts) ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) -#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks +#%% FIXME: Add sanity check to make sure you can add value_count checks +print('\nREACHED here...................>>>>>>>') #%% Resolving ambiguous muts # Merging ambiguous muts #================= @@ -995,6 +1003,12 @@ ambig_muts_rev_df.index gene_LF1.index all(ambig_muts_rev_df.index.isin(gene_LF1.index)) +any(gene_LF1.index.isin(ambig_muts_rev_df.index)) +if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)): + print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...') +else: + sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts') + #gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] @@ -1024,9 +1038,9 @@ gene_LF1['mutation_info'].value_counts() #%% PHEW! Good to go for downstream stuff #%% Add column: Mutationinformation # splitting mutation column to get mCSM style muts -#======================= +#===================================================== # Formatting df: read aa dict and pull relevant info -#======================= +#===================================================== print('Now some more formatting:' , '\nRead aa dict and pull relevant info' , '\nFormat mutations:' @@ -1040,12 +1054,13 @@ print('Now some more formatting:' ncol_mutf_add = 3 # mut split into 3 cols ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping -#=========== +#====================================================================== # Split 'mutation' column into three: wild_type, position and # mutant_type separately. Then map three letter code to one using # reference_dict imported at the beginning. -# After importing, convert to mutation to lowercase for compatibility with dict -#=========== +# After importing, convert to mutation to lowercase for +# compatibility with dict +#======================================================================= gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() print('wt regex being used:', wt_regex @@ -1054,14 +1069,15 @@ print('wt regex being used:', wt_regex mylen0 = len(gene_LF1.columns) -#======= +#========================================================= # Iterate through the dict, create a lookup dict i.e # lookup_dict = {three_letter_code: one_letter_code}. # lookup dict should be the key and the value (you want to create a column for) # Then use this to perform the mapping separetly for wild type and mutant cols. -# The three letter code is extracted using a string match match from the dataframe and then converted +# The three letter code is extracted using a string match match from the +# dataframe and then converted # to 'pandas series'since map only works in pandas series -#======= +#=========================================================== print('Adding', ncol_mutf_add, 'more cols:\n') # initialise a sub dict that is lookup dict for three letter code to 1-letter code @@ -1076,7 +1092,11 @@ for k, v in my_aa_dict.items(): mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() gene_LF1['mutant_type'] = mut.map(lookup_dict) -# extract position info from mutation column separetly using string match +#------------------- +# extract position +# info from mutation +# column separetly using string match +#------------------- #gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) @@ -1109,11 +1129,11 @@ else: # clear variables del(k, v, wt, mut, lookup_dict) -######## +########################################################################## # combine the wild_type+poistion+mutant_type columns to generate # mutationinformation (matches mCSM output field) # Remember to use .map(str) for int col types to allow string concatenation -######### +########################################################################### gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] print('Created column: mutationinformation' , '\n=====================================================================\n' @@ -1138,7 +1158,6 @@ bar = gene_LF1['position'].value_counts() #else: # print('FAIL: df could not be ordered. Check source') # sys.exit() - #%% Create a copy of mutationinformation column for downstream mergeing gene_LF1['Mut'] = gene_LF1['mutationinformation'] gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] @@ -1150,7 +1169,7 @@ gene_LF1['index_orig_copy'] = gene_LF1.index all(gene_LF1.index.values == gene_LF1['index_orig'].values) all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values) -#%% quick sanity check for position freq count +#%% Quick sanity check for position freq count # count the freq of 'other muts' samples test_df = gene_LF1.copy() test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']] @@ -1158,6 +1177,7 @@ test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'positi #test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count') #print('Revised dim of other_muts_df:',test_df.shape) test_df['scount'] = test_df['mutation'].map(SnpFDict) +test_df = test_df.sort_values(['position', 'mutationinformation']) #%% Map mutation frequency count as a column gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict) #%% Map position frequency count as a column @@ -1166,75 +1186,151 @@ z1 = z.to_dict() gene_LF1['pos_count'] = gene_LF1['position'].map(z1) #test_df2 = test_df.loc[test_df['position'] == 10] - -#%% OUTFILE 4, write file mCSM style muts -snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique()) -snps_only.head() -# assign column name -snps_only.columns = ['mutationinformation'] - -# count how many positions this corresponds to -pos_only = pd.DataFrame(gene_LF1['position'].unique()) -pos_only - -print('Checking NA in snps...')# should be 0 -if snps_only.mutationinformation.isna().sum() == 0: - print ('PASS: NO NAs/missing entries for SNPs' - , '\n===============================================================') -else: - sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') - -# write file: mCSM muts -out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv' -outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps - -print('\n----------------------------------' - , '\nWriting file: mCSM style muts' - , '\n----------------------------------' - , '\nFile:', outfile_mcsmsnps - , '\nmutation format (SNP): {WT}{MUT}' - , '\nNo. of distinct muts:', len(snps_only) - , '\nNo. of distinct positions:', len(pos_only) - , '\n=============================================================') - -snps_only.to_csv(outfile_mcsmsnps, header = False, index = False) - -print('Finished writing:', outfile_mcsmsnps - , '\nNo. of rows:', len(snps_only) - , '\nNo. of cols:', len(snps_only.columns) - , '\n=============================================================') -del(out_filename_mcsmsnps) - -#%% OUTFILE 5, write file frequency of position counts: MOVE TO THE END -metadata_pos = pd.DataFrame(gene_LF1['position']) -metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1) -metadata_pos['meta_pos_count'].value_counts() - -metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True) - -out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' -outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts -print('\n----------------------------------' - , '\nWriting file: Metadata poscounts' - , '\n----------------------------------' - , '\nFile:', outfile_metadata_poscounts - , '\n============================================================') - -metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False) -print('Finished writing:', outfile_metadata_poscounts - , '\nNo. of rows:', len(metadata_pos) - , '\nNo. of cols:', len(metadata_pos.columns) - , '\n=============================================================') -del(out_filename_metadata_poscounts) -#%% OUTFILE 6, write file _metadata: MOVE TO THE END -# ---THINK --- -#%% OUTFILE 7, write file MSA plots: MOVE TO THE END -# -- THINK --- -#%% OUTFILE 8, write file mutational position with count: MOVE TO THE END -# -- THINK --- +############################################################################### +############################################################################### #%% Add column: aa property_water +#========= +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_water} +# Do this for both wild_type and mutant as above. +#========= +print('Adding', ncol_aa_add, 'more cols:\n') + +# initialise a sub dict that is lookup dict for three letter code to aa prop +# adding two more cols +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_prop_water'] + #print(lookup_dict) + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wt_prop_water'] = wt.map(lookup_dict) + #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mut_prop_water'] = mut.map(lookup_dict) + +mylen2 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen2 == mylen1 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) + +# clear variables +del(k, v, wt, mut, lookup_dict) + #%% Add column: aa_prop_polarity +#======== +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_prop_polarity} +# Do this for both wild_type and mutant as above. +#========= +print('Adding', ncol_aa_add, 'more cols:\n') + +# initialise a sub dict that is lookup dict for three letter code to aa prop +# adding two more cols +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_prop_polarity'] + #print(lookup_dict) + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict) + #mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict) + +mylen3 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen3 == mylen2 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen1 + , '\nnew len:', mylen2) + +# clear variables +del(k, v, wt, mut, lookup_dict) #%% Add column: aa_calcprop +#======== +# iterate through the dict, create a lookup dict that i.e +# lookup_dict = {three_letter_code: aa_calcprop} +# Do this for both wild_type and mutant as above. +#========= +print('Adding', ncol_aa_add, 'more cols:\n') + +lookup_dict = dict() +for k, v in my_aa_dict.items(): + lookup_dict[k] = v['aa_calcprop'] + #print(lookup_dict) + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() + gene_LF1['wt_calcprop'] = wt.map(lookup_dict) + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() + gene_LF1['mut_calcprop'] = mut.map(lookup_dict) + +mylen4 = len(gene_LF1.columns) + +# sanity checks +print('checking if 3-letter wt&mut residue extraction worked correctly') +if wt.isna().sum() & mut.isna().sum() == 0: + print('PASS: 3-letter wt&mut residue extraction worked correctly:' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) +else: + print('FAIL: 3-letter wt&mut residue extraction failed' + , '\nNo NAs detected:' + , '\nwild-type\n', wt + , '\nmutant-type\n', mut + , '\ndim of df:', gene_LF1.shape) + +if mylen4 == mylen3 + ncol_aa_add: + print('PASS: successfully added', ncol_aa_add, 'cols' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) +else: + print('FAIL: failed to add cols:' + , '\nold length:', mylen3 + , '\nnew len:', mylen4) + +# clear variables +del(k, v, wt, mut, lookup_dict) + #%% NEW mappings: gene_LF2 # gene_LF2: copy gene_LF1 gene_LF2 = gene_LF1.copy() @@ -1267,6 +1363,8 @@ gene_LF2['total_id_ucount'] = total_id_ucount # gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) # gene_LF2['mut_id_ucount'] +gene_LF2['snp_frequency'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) + # gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount']) #%% AF for gene @@ -1275,7 +1373,7 @@ gene_LF2['total_id_ucount'] = total_id_ucount #================= gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount'] gene_LF2['maf'].head() - +############################################################################### #%% Further mappings: gene_LF3, with mutationinformation as INDEX gene_LF3 = gene_LF2.copy() @@ -1324,6 +1422,7 @@ gene_LF3['drtype_all_vals'].head() gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) gene_LF3['drtype_all_names'].head() + #--------------------------------- # Revised drtype: max(Multimode) #-------------------------------- @@ -1406,6 +1505,7 @@ gene_LF3['dst_multimode_all'] # Fill using multimode ONLY where NA in dst_multimode column gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) gene_LF3['dst_multimode'] +gene_LF3['dst_multimode'].value_counts() #---------------------------------- # Revised dst column: max of mode @@ -1422,6 +1522,11 @@ gene_LF3[drug].value_counts() #gene_LF3['dst_noNA'].value_counts() gene_LF3['dst_mode'].value_counts() +if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['dst_mode'].value_counts()[1] > gene_LF3[drug].value_counts()[1]) and gene_LF3['dst_mode'].value_counts()[0] > gene_LF3[drug].value_counts()[0]: + print('\nPASS: revised dst mode created successfully and the counts are more than col count') +else: + print('\nFAIL: revised dst mode numbers MISmatch') + foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] foo2 = foo.sort_values(['position', 'Mut']) @@ -1460,6 +1565,7 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la # Now overwrite gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) + if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()): print('\nPASS: Revised mutation_info column created successfully') gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1) @@ -1480,6 +1586,7 @@ gene_LF3['mutation_info_labels_orig'].value_counts() # %% sanity check for the revised dst gene_LF3[drug].value_counts() gene_LF3[drug].value_counts().sum() + gene_LF3['mutation_info_labels_orig'].value_counts() gene_LF3['dst_mode'].value_counts() @@ -1488,6 +1595,8 @@ gene_LF3['dst_mode'].value_counts().sum() # direct comparision gene_LF3['dst'].value_counts() gene_LF3['mutation_info_labels'].value_counts() +gene_LF3['mutation_info_labels_v1'].value_counts() + #%% Lineage gene_LF3['lineage'].value_counts() # lineage_label_numeric = {'lineage1' : 1 @@ -1509,24 +1618,52 @@ lineage_label_numeric = {'L1' : 1 , 'LBOV' : 8} lineage_label_numeric -# copy column to allow cross checks after stripping white space -gene_LF3['lineage'] = gene_LF3['lineage'].str.strip() -gene_LF3['lineage_corrupt'] = gene_LF3['lineage'] -#all(gene_LF3['lineage_corrupt'].value_counts() ==gene_LF3['lineage'].value_counts()) -gene_LF3['lineage_corrupt'].value_counts() +#%% Lineage cols selected: gene_LF3_ColsSel +gene_LF3_ColsSel = gene_LF3.copy() +gene_LF3_ColsSel.index +gene_LF3_ColsSel.columns +#gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig'] # delete -#%%tidy_split(): Lineage +# copy column to allow cross checks after stripping white space +gene_LF3_ColsSel['lineage'] = gene_LF3_ColsSel['lineage'].str.strip() +gene_LF3_ColsSel['lineage_corrupt'] = gene_LF3_ColsSel['lineage'] +all(gene_LF3_ColsSel['lineage_corrupt'].value_counts() == gene_LF3_ColsSel['lineage'].value_counts()) +gene_LF3_ColsSel['lineage_corrupt'].value_counts() + +gene_LF3_ColsSel = gene_LF3_ColsSel[['index_orig_copy', 'Mut', 'position', 'snp_frequency', 'lineage', 'lineage_corrupt']] +gene_LF3_ColsSel.columns + +#%% tidy_split(): Lineage # Create df with tidy_split: lineage -lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';') +lf_lin_split = tidy_split(gene_LF3_ColsSel, 'lineage_corrupt', sep = ';') lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip() lf_lin_split['lineage_corrupt'].value_counts() +#%% Important sanity check for tidy_split(): lineage +# Split based on semi colon dr_muts_col +search = ";" + +# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence +count_df_lin = gene_LF3_ColsSel[['lineage']] +count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I) +lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index() +lin_sc_C -# Map lineage labels to numbers to allow metrics +lin_sc_C['index_semicolon'] = (lin_sc_C['index'] + 1) * lin_sc_C['lineage_semicolon_count'] +lin_sc_C +expected_linC = lin_sc_C['index_semicolon'].sum() + gene_LF3_ColsSel['lineage'].isnull().sum() +expected_linC + +if (len(lf_lin_split) == expected_linC): + print('\nPASS: tidy_split() length match for lineage') +else: + sys.exit('\nFAIL: tidy_split() length MISMATCH. Check lineage semicolon count') +############################################################################### +#%% Map lineage labels to numbers to allow metrics lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric) lf_lin_split['lineage_numeric'].value_counts() #-------------------------------- -# Lineage_corrupt ALL values: +# Add lineage_list: ALL values: #-------------------------------- # Add all lineages for each mutation lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt'] @@ -1535,12 +1672,21 @@ lf_lin_split['lineage_corrupt_list'].value_counts() lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list) lf_lin_split['lineage_corrupt_list'].value_counts() +#-------------------------------- +# Add lineage count: ALL +#-------------------------------- +lf_lin_split['lineage_corrupt_count'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : len(list(x))) + +#-------------------------------- # Add lineage unique count +#-------------------------------- lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique()) #lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt'] lf_lin_split['lineage_corrupt_ucount'].value_counts() +#-------------------------------- # Add lineage_set +#-------------------------------- lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x))) lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x)) @@ -1551,95 +1697,275 @@ lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')[ lf_lin_split['lineage_multimode'].value_counts() # cant take max as it doesn't mean anyting! -foo = lf_lin_split[['Mut', 'lineage_ulist']] ############################################################################### -#%% Final merge: gene_LF4 with lineage_split_df -gene_LF4 = gene_LF3.copy() -gene_LF4.index -gene_LF4['index_orig_copy'] = gene_LF4['index_orig'] +#%% Select only the columns you want to merge from lf_lin_split +lf_lin_split.columns +lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count' + , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']] +lf_lin_split_ColSel.columns -lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig'] -#================================ -# Merge with gene_LF3 with -# lf_lin_split baseed on index -#================================ -# set index for lf_lin_split -lf_lin_split.index -lf_lin_split.reset_index(inplace=True) -lf_lin_split['mutationinformation'] -lf_lin_split = lf_lin_split.set_index(['index_orig_copy']) -lf_lin_split.index +lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list' : 'lineage_list_all' + , 'lineage_corrupt_count' : 'lineage_count_all' + , 'lineage_corrupt_ucount' : 'lineage_count_unique' + , 'lineage_ulist' : 'lineage_list_unique' + , 'lineage_multimode' : 'lineage_multimode'}, inplace = True) -# set index for gene_LF4 -gene_LF4.index -gene_LF4.reset_index(inplace=True) -gene_LF4.index -gene_LF4['mutationinformation'] -gene_LF4['index_orig_copy'] -gene_LF4 = gene_LF4.set_index(['index_orig_copy']) -gene_LF4.index +lf_lin_split_ColSel.columns +ncols_to_merge = len(lf_lin_split_ColSel.columns) -#------------------------- -# colum lineage_ucount: -# contribution of each distinct lineage -#------------------------- -gene_LF4['lineage_ucount'] = gene_LF4['lineage'] +#%% Final merge: gene_LF3 with lf_lin_split_ColSel: gene_LF4 +#=============================== +# merge: inner join by default +# join: left join default +# concat: outer join by default +#df1.join(df2) +#=============================== +len(lf_lin_split) +len(gene_LF3) -#------------------------- -# colum lineage list: -#------------------------- -#gene_LF4['lineage_set'] = gene_LF4['lineage'] -gene_LF4['lineage_ulist'] = gene_LF4['lineage'] - -gene_LF4['lineage_list'] = gene_LF4['lineage'] - -#------------------------- -# colum lineage_list mode: -#------------------------- -gene_LF4['lineage_mode'] = gene_LF4['lineage'] - -# merge based on indices -gene_LF4.index.nunique() -lf_lin_split.index.nunique() -all(gene_LF4.index.isin(lf_lin_split.index)) -all(lf_lin_split.index.isin(gene_LF4.index)) -gene_LF4.index -lf_lin_split.index - -if (gene_LF4.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF4.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF4.index)) ): - print('\nPass: merging lineage_ucount with gene_LF4') +# Dropping duplicates from lineage df +lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()] +lf_lin_split_U = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()] +if len(lf_lin_split_U) == len(snps_only): + print('\nPASS: Duplicate entries removed from lf_lin' + , '\nReady to start the final merge to generate gene_LF4') else: - sys.exit('\nFail: Indices mismatch, cannot merge! Quitting!') + print('\nFAIL: DF after duplicate removal numbers MISmatch ') + +gene_LF3.index +lf_lin_split_U.index -########################### -# magic merge happens here -########################### -# FIXME: add check here to see if the duplicated indices rows are actual duplicates as the cols I need should be summary cols -lf_lin_split.index.drop_duplicates(keep='first') -lf_lin_split = lf_lin_split -lf_lin_split_U = lf_lin_split[~lf_lin_split.index.duplicated(keep='first')] -lf_lin_split_U.shape +#-------------------- +# Creating gene_LF4 +#-------------------- +if all(gene_LF3.index.isin(lf_lin_split_U.index)) : + print('\nIndices match, staring final merge...') + gene_LF4 = gene_LF3.join(lf_lin_split_U) + if len(gene_LF4.columns) == len(gene_LF3.columns) + ncols_to_merge: + print('\nPASS: gene_LF4 created after successfully merging with lineage info' + , '\nShape of gene_LF4:', gene_LF4.shape) +else: + sys.exit('\nFAIL: gene_LF4 could not be created. Dfs could not be merged. Check indices or dfs length again!') -gene_LF4.loc[lf_lin_split_U.index, 'lineage_ucount'] = lf_lin_split_U['lineage_corrupt_ucount'] -gene_LF4['lineage_ucount'].value_counts() +#---------------------------------------- +# Dropping redundant cols from gene_LF4 +#---------------------------------------- +if all(gene_LF4.index == gene_LF4['Mut']) and gene_LF4['index_orig'].equals(gene_LF4['index_orig_copy']): + gene_LF4.reset_index(inplace=True) + #gene_LF4 = gene_LF4.set_index(['index_orig']) + print('\nPass Mutationinformationa and index columns checked, the duplicates can now be dropped') + gene_LF4 = gene_LF4.drop(['Mut_copy', 'index_orig_copy', 'mutation_info_v1' ], axis = 1) -#gene_LF4.loc[lf_lin_split_U.index, 'lineage_set'] = lf_lin_split_U['lineage_set'] -#gene_LF4['lineage_set'].value_counts() -gene_LF4.loc[lf_lin_split_U.index, 'lineage_ulist'] = lf_lin_split_U['lineage_ulist'] -gene_LF4['lineage_ulist'].value_counts() +#%% Final output with relevant columns +print('\nFinal output file: Dim of gene_LF4:', gene_LF4.shape) -gene_LF4.loc[lf_lin_split_U.index, 'lineage_list'] = lf_lin_split_U['lineage_corrupt_list'] -gene_LF4['lineage_list'].value_counts() +# cols_to_output = ['mutationinformation', 'id', 'sample', 'lineage', 'sublineage', +# 'country_code', 'drtype', 'pyrazinamide', 'mutation', 'drug_name', +# 'mutation_info', 'mutation_info_orig', 'wild_type', 'mutant_type', +# 'position', 'Mut', 'index_orig', 'pos_count', 'total_id_ucount', +# 'snp_frequency', 'maf', 'drtype_numeric', 'drtype_all_vals', +# 'drtype_all_names', 'drtype_multimode', 'drtype_mode', 'drtype_max', +# 'mutation_info_labels', 'dm_om_numeric', 'dm_om_numeric_orig', 'dst', +# 'dst_multimode', 'dst_multimode_all', 'dst_mode', 'lineage_list_all', +# 'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique', +# 'lineage_multimode'] +#%% OUTFILE 3, write file mCSM style muts +snps_only = pd.DataFrame(gene_LF4['mutationinformation'].unique()) +snps_only.head() +# assign column name +snps_only.columns = ['mutationinformation'] -gene_LF4.loc[lf_lin_split_U.index, 'lineage_mode'] = lf_lin_split_U['lineage_multimode'] -gene_LF4['lineage_mode'].value_counts() +# count how many positions this corresponds to +pos_only = pd.DataFrame(gene_LF4['position'].unique()) +pos_only -foo = gene_LF4[['mutationinformation', 'lineage', 'lineage_ucount' - #, 'lineage_set' - , 'lineage_ulist' - , 'lineage_mode' - , 'lineage_list']] -#%% +print('Checking NA in snps...')# should be 0 +if snps_only.mutationinformation.isna().sum() == 0: + print ('PASS: NO NAs/missing entries for SNPs' + , '\n===============================================================') +else: + sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') -#Subset relevant columns for output and put the rest of the output here \ No newline at end of file +# write file: mCSM muts +out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv' +outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps + +print('\n----------------------------------' + , '\nWriting file: mCSM style muts' + , '\n----------------------------------' + , '\nFile:', outfile_mcsmsnps + , '\nmutation format (SNP): {WT}{MUT}' + , '\nNo. of distinct muts:', len(snps_only) + , '\nNo. of distinct positions:', len(pos_only) + , '\n=============================================================') + +snps_only.to_csv(outfile_mcsmsnps, header = False, index = False) + +print('Finished writing:', outfile_mcsmsnps + , '\nNo. of rows:', len(snps_only) + , '\nNo. of cols:', len(snps_only.columns) + , '\n=============================================================') +del(out_filename_mcsmsnps) +############################################################################### +#%% OUTFILE 4, write file frequency of position count +metadata_pos = pd.DataFrame(gene_LF4['position']) +metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1) +metadata_pos['meta_pos_count'].value_counts() + +metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True) + +out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' +outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts +print('\n----------------------------------' + , '\nWriting file: Metadata poscounts' + , '\n----------------------------------' + , '\nFile:', outfile_metadata_poscounts + , '\n============================================================') + +metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False) +print('Finished writing:', outfile_metadata_poscounts + , '\nNo. of rows:', len(metadata_pos) + , '\nNo. of cols:', len(metadata_pos.columns) + , '\n=============================================================') +del(out_filename_metadata_poscounts) +############################################################################### +#%% OUTFILE 5, write file _metadata +# where each row has UNIQUE mutations NOT unique sample ids +out_filename_metadata = gene.lower() + '_metadata.csv' +outfile_metadata = outdir + '/' + out_filename_metadata +print('\n----------------------------------' + , '\nWriting file: LF formatted data' + , '\n----------------------------------' + , '\nFile:', outfile_metadata + , '\n============================================================') + +gene_LF4.to_csv(outfile_metadata, header = True, index = False) +print('Finished writing:', outfile_metadata + , '\nNo. of rows:', len(gene_LF4) + , '\nNo. of cols:', len(gene_LF4.columns) + , '\n=============================================================') +del(out_filename_metadata) +############################################################################### +#%% OUTFILE 6, write file MSA plots +#write file: mCSM style but with repitions for MSA and logo plots +all_muts_msa = pd.DataFrame(gene_LF4['mutationinformation']) +all_muts_msa.head() + +# assign column name +all_muts_msa.columns = ['mutationinformation'] + +# make sure it is string +all_muts_msa.columns.dtype + +# sort the column +all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation') + +# create an extra column with protein name +#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') +#all_muts_msa_sorted.head() + +# rearrange columns so the fasta name is the first column (required for mutate.script) +#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']] +all_muts_msa_sorted.head() + +print('Checking NA in snps...')# should be 0 +if all_muts_msa.mutationinformation.isna().sum() == 0: + print ('PASS: NO NAs/missing entries for SNPs' + , '\n===============================================================') +else: + sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') + +out_filename_msa = gene.lower() +'_all_muts_msa.csv' +outfile_msa = outdir + '/' + out_filename_msa + +print('\n----------------------------------' + , '\nWriting file: mCSM style muts for msa' + , '\n----------------------------------' + , '\nFile:', outfile_msa + , '\nmutation format (SNP): {WT}{MUT}' + , '\nNo.of lines of msa:', len(all_muts_msa)) + +all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) + +print('Finished writing:', outfile_msa + , '\nNo. of rows:', len(all_muts_msa) + , '\nNo. of cols:', len(all_muts_msa.columns) + , '\n=============================================================') + +del(out_filename_msa) +############################################################################### +#%% OUTFILE 7, write file mutational position with count +# write file for mutational positions +# count how many positions this corresponds to +pos_only = pd.DataFrame(gene_LF4['position'].unique()) +# assign column name +pos_only.columns = ['position'] +# make sure dtype of column position is int or numeric and not string +pos_only.position.dtype +pos_only['position'] = pos_only['position'].astype(int) +pos_only.position.dtype + +# sort by position value +pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True) + +out_filename_pos = gene.lower() + '_mutational_positons.csv' +outfile_pos = outdir + '/' + out_filename_pos + +print('\n----------------------------------' + , '\nWriting file: mutational positions' + , '\n----------------------------------' + , '\nFile:', outfile_pos + , '\nNo. of distinct positions:', len(pos_only_sorted) + , '\n=============================================================') + +pos_only_sorted.to_csv(outfile_pos, header = True, index = False) + +print('Finished writing:', outfile_pos + , '\nNo. of rows:', len(pos_only_sorted) + , '\nNo. of cols:', len(pos_only_sorted.columns) + , '\n=============================================================' + , '\n\n\n') + +del(out_filename_pos) +############################################################################### +#%% Quick summary output +print('============================================' + , '\nQuick summary output for', drug, 'and' , gene.lower() + , '\n============================================' + , '\nTotal samples:', total_samples + , '\nNo. of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts() + , '\n' + , '\nPercentage of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , '\n' + + , '\nNo. of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts() + , '\n' + , '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100 + , '\n' + + , '\nTotal no. of unique snps:', len(snps_only) + , '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique() + + , '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD + , '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD + + + , '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique() + , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique() + , '\nTotal no. of samples with missense muts:', len(gene_LF4) + , '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique() + , '\n') + +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('\nTotal no.of samples with ambiguous muts:', len(inspect) + #, '\nTotal no.of unique ambiguous muts:', len(common_muts) + , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique() + , '\n=============================================================' + , '\nPost resolving ambiguity\n' + , ambig_muts_rev_df['mutation_info_REV'].value_counts()) +#======================================================================= +print(u'\u2698' * 50, + '\nEnd of script: Data extraction and writing files' + '\n' + u'\u2698' * 50 ) +#%% end of script \ No newline at end of file