From 0eaff731141c8b474c149b0c1cbf000f23906b44 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 26 Mar 2020 17:58:39 +0000 Subject: [PATCH] tidy code and saving work for the day --- meta_data_analysis/data_extraction.py | 465 +++++++++--------- meta_data_analysis/dssp_df.py | 16 +- meta_data_analysis/kd_df.py | 36 +- .../mut_electrostatic_changes.py | 28 +- meta_data_analysis/rd_df.py | 43 +- meta_data_analysis/reference_dict.py | 3 +- 6 files changed, 307 insertions(+), 284 deletions(-) diff --git a/meta_data_analysis/data_extraction.py b/meta_data_analysis/data_extraction.py index 67a8962..70f3008 100755 --- a/meta_data_analysis/data_extraction.py +++ b/meta_data_analysis/data_extraction.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" +''' Created on Tue Aug 6 12:56:03 2019 @author: tanu -""" +''' # FIXME: include error checking to enure you only # concentrate on positions that have structural info? @@ -67,7 +67,8 @@ datadir = homedir + '/' + 'git/Data' in_filename = 'original_tanushree_data_v2.csv' infile = datadir + '/' + in_filename print('Input filename: ', in_filename - , '\nInput path: ', indir) + , '\nInput path: ', indir + , '\n============================================================') #======= # output @@ -76,7 +77,8 @@ print('Input filename: ', in_filename # output filenames in respective sections at the time of outputting files outdir = datadir + '/' + drug + '/' + 'output' print('Output filename: in the respective sections' - , '\nOutput path: ', outdir) + , '\nOutput path: ', outdir + , '\n=============================================================') #%%end of variable assignment for input and output files #======================================================================= @@ -101,21 +103,22 @@ del(master_data) # checks and results total_samples = meta_data['id'].nunique() -print('RESULT: Total samples:', total_samples) -print('======================================================================') +print('RESULT: Total samples:', total_samples + , '\n===========================================================') # counts NA per column meta_data.isna().sum() -print('No. of NAs/column:' + '\n', meta_data.isna().sum()) -print('======================================================================') +print('No. 
of NAs/column:' + '\n', meta_data.isna().sum() + , '\n===========================================================') + # glance meta_data.head() # equivalent of table in R # pyrazinamide counts meta_data.pyrazinamide.value_counts() -print('RESULT: Sus and Res samples:\n', meta_data.pyrazinamide.value_counts()) -print('======================================================================') +print('RESULT: Sus and Res samples:\n', meta_data.pyrazinamide.value_counts() + , '\n===========================================================') # clear variables del(indir, in_filename,infile) @@ -140,11 +143,12 @@ clean_df = meta_data.dropna(subset=['dr_mutations_pyrazinamide']) na_count = meta_data['dr_mutations_pyrazinamide'].isna().sum() if len(clean_df) == (total_samples - na_count): - print('PASS: clean_df extracted: length is', len(clean_df), - '\nNo.of NA s=', na_count, '/', total_samples) + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs =', na_count, '/', total_samples + , '\n==========================================================') else: - print('FAIL: dropping NA failed') -print('======================================================================') + print('FAIL: dropping NA failed' + , '\n==========================================================') dr_pnca_count = 0 wt = 0 @@ -169,7 +173,7 @@ print('RESULTS:') print('Total WT in dr_mutations_pyrazinamide:', wt) print('Total matches of', gene_match, 'in dr_mutations_pyrazinamide:', dr_pnca_count) print('Total samples with > 1', gene_match, 'muts in dr_mutations_pyrazinamide:', len(id2_dr) ) -print('======================================================================') +print('=================================================================') del(i, id, wt, id2_dr, clean_df, na_count, count_pnca_dr, count_wt) @@ -185,11 +189,12 @@ clean_df = meta_data.dropna(subset=['other_mutations_pyrazinamide']) na_count = meta_data['other_mutations_pyrazinamide'].isna().sum() if len(clean_df) == (total_samples - na_count): - print('PASS: clean_df extracted: length is', len(clean_df), - '\nNo.of NA s=', na_count, '/', total_samples) + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NA s=', na_count, '/', total_samples + , '\n=========================================================') else: - print('FAIL: dropping NA failed') -print('======================================================================') + print('FAIL: dropping NA failed' + , '\n=========================================================') other_pnca_count = 0 wt_other = 0 @@ -213,7 +218,7 @@ print('RESULTS:') print('Total WT in other_mutations_pyrazinamide:', wt_other) print('Total matches of', gene_match, 'in other_mutations_pyrazinamide:', other_pnca_count) print('Total samples with > 1', gene_match, 'muts in other_mutations_pyrazinamide:', len(id2_other) ) -print('======================================================================') +print('=================================================================') print('Predicting total no. 
of rows in your curated df:', dr_pnca_count + other_pnca_count ) expected_rows = dr_pnca_count + other_pnca_count @@ -224,7 +229,7 @@ del(i, id, wt_other, clean_df, na_count, id2_other, count_pnca_other, count_wt) ############ # extracting dr and other muts separately along with the common cols ############# -print('======================================================================') +print('=================================================================') print('Extracting dr_muts in a dr_mutations_pyrazinamide with other meta_data') print('gene to extract:', gene_match ) @@ -241,9 +246,9 @@ meta_data_dr = meta_data[['id' , 'pyrazinamide' , 'dr_mutations_pyrazinamide' ]] -print("expected dim should be:", len(meta_data), (len(meta_data.columns)-1) ) -print("actual dim:", meta_data_dr.shape ) -print('======================================================================') +print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) +print('actual dim:', meta_data_dr.shape + , '\n===============================================================') # Extract within this the gene of interest using string match #meta_pnca_dr = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] @@ -256,17 +261,17 @@ print('checking RESULT:', '\nactual len =', len(meta_pnca_dr) ) if len(id_dr) == len(meta_pnca_dr): - print('PASS: lengths match') + print('PASS: lengths match' + , '\n===============================================================') else: - print('FAIL: length mismatch') -print('======================================================================') + print('FAIL: length mismatch' + , '\n===============================================================') dr_id = pd.Series(dr_id) #================= # other mutations: extract pncA_p. entries #================== -print('======================================================================') print('Extracting dr_muts in a other_mutations_pyrazinamide with other meta_data') # FIXME: replace pyrazinamide with variable containing the drug name # !!! important !!! @@ -279,9 +284,9 @@ meta_data_other = meta_data[['id' , 'other_mutations_pyrazinamide' ]] -print("expected dim should be:", len(meta_data), (len(meta_data.columns)-1) ) -print("actual dim:", meta_data_other.shape ) -print('======================================================================') +print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) +print('actual dim:', meta_data_other.shape + , '\n===============================================================') # Extract within this the gene of interest using string match meta_pnca_other = meta_data_other.loc[meta_data_other.other_mutations_pyrazinamide.str.contains(gene_match, na = False)] @@ -293,10 +298,11 @@ print('checking RESULT:', '\nactual len =', len(meta_pnca_other) ) if len(id_other) == len(meta_pnca_other): - print('PASS: lengths match') + print('PASS: lengths match' + , '\n==============================================================') else: - print('FAIL: length mismatch') -print('======================================================================') + print('FAIL: length mismatch' + , '\n===============================================================') other_id = pd.Series(other_id) #%% Find common IDs @@ -323,61 +329,61 @@ print(common_ids['id'].equals(common_ids2['id2'])) # good sanity check: use it later to check pnca_sample_counts expected_pnca_samples = ( len(meta_pnca_dr) + len(meta_pnca_other) - common_mut_ids ) -print("expected no. 
of pnca samples:", expected_pnca_samples) -print('======================================================================') +print('expected no. of pnca samples:', expected_pnca_samples) +print('=================================================================') #%% write file #print(outdir) out_filename0 = gene.lower() + '_' + 'common_ids.csv' outfile0 = outdir + '/' + out_filename0 #FIXME: CHECK line len(common_ids) -print('Writing file: common ids:', - '\nFilename:', out_filename0, - '\nPath:', outdir, - '\nExpected no. of rows:', len(common_ids) ) +print('Writing file: common ids:' + , '\nFilename:', out_filename0 + , '\nPath:', outdir + , '\nExpected no. of rows:', len(common_ids) + , '\n=============================================================') common_ids.to_csv(outfile0) -print('======================================================================') del(out_filename0) - # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) -#%% Now extract "all" pncA mutations: i.e 'pncA_p.*' -print("extracting all pncA mutations from dr_ and other_ cols using string match:", gene_match) +#%% Now extract 'all' pncA mutations: i.e 'pncA_p.*' +print('extracting all pncA mutations from dr_ and other_ cols using string match:', gene_match + , '\n===============================================================') #meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match) ] meta_pnca_all = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains(gene_match, na = False) | meta_data.other_mutations_pyrazinamide.str.contains(gene_match, na = False) ] -print('======================================================================') extracted_pnca_samples = meta_pnca_all['id'].nunique() -print("RESULT: actual no. of pnca samples extracted:", extracted_pnca_samples) +print('RESULT: actual no. of pnca samples extracted:', extracted_pnca_samples) print('======================================================================') # sanity check: length of pnca samples print('Performing sanity check:') if extracted_pnca_samples == expected_pnca_samples: - print('No. of pnca samples:', len(meta_pnca_all), - '\nPASS: expected & actual no. of pnca samples match') + print('No. of pnca samples:', len(meta_pnca_all) + , '\nPASS: expected & actual no. of pnca samples match' + , '\n=========================================================') else: - print("FAIL: Debug please!") -print('======================================================================') + print('FAIL: Debug please!' + , '\n===============================================================') # count NA in pyrazinamide column pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() -print("No. of pnca samples without pza testing i.e NA in pza column:",pnca_na) +print('No. of pnca samples without pza testing i.e NA in pza column:', pnca_na) # use it later to check number of complete samples from LF data comp_pnca_samples = len(meta_pnca_all) - pnca_na -print("comp pnca samples tested for pza:", comp_pnca_samples) -print('======================================================================') +print('comp pnca samples tested for pza:', comp_pnca_samples) +print('=================================================================') # Comment: This is still dirty data since these # are samples that have pncA_p. muts, but can have others as well # since the format for mutations is mut1; mut2, etc. 
-print('This is still dirty data: samples have pncA_p. muts, but may have others as well', - '\nsince the format for mutations is mut1; mut2, etc.') -print('======================================================================') +print('This is still dirty data: samples have pncA_p. muts, but may have others as well' + , '\nsince the format for mutations is mut1; mut2, etc.' + , '\n=============================================================') #%% tidy_split():Function to split mutations on specified delimiter: ';' #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas @@ -385,7 +391,7 @@ print('======================================================================') print('Performing tidy_spllit(): to separate the mutations into indivdual rows') # define the split function def tidy_split(df, column, sep='|', keep=False): - """ + ''' Split the values of a column and expand so the new DataFrame has one split value per row. Filters rows where the column is missing. @@ -404,7 +410,7 @@ def tidy_split(df, column, sep='|', keep=False): ------- pandas.DataFrame Returns a dataframe with the same columns as `df`. - """ + ''' indexes = list() new_values = list() #df = df.dropna(subset=[column])#!!!!-----see this incase you need to uncomment based on use case @@ -428,9 +434,9 @@ def tidy_split(df, column, sep='|', keep=False): # tidy_split(): on 'dr_mutations_pyrazinamide' column and remove leading white spaces ######## col_to_split1 = 'dr_mutations_pyrazinamide' -print ('Firstly, applying tidy split on dr df:', meta_pnca_dr.shape, - '\ncolumn name:', col_to_split1) -print('======================================================================') +print ('Firstly, applying tidy split on dr df:', meta_pnca_dr.shape + , '\ncolumn name:', col_to_split1 + , '\n============================================================') # apply tidy_split() dr_WF0 = tidy_split(meta_pnca_dr, col_to_split1, sep = ';') # remove leading white space else these are counted as distinct mutations as well @@ -446,43 +452,42 @@ print('lengths after tidy split and extracting', gene_match, 'muts:' , '\nexpected len:', dr_pnca_count) if len(dr_pnca_WF0) == dr_pnca_count: - print('PASS: length of dr_pnca_WF0 match with expected length') + print('PASS: length of dr_pnca_WF0 match with expected length' + , '\n===============================================================') else: - print('FAIL: lengths mismatch') + print('FAIL: lengths mismatch' + , '\n===============================================================') -print('======================================================================') - -# count the freq of "dr_muts" samples +# count the freq of 'dr_muts' samples dr_muts_df = dr_pnca_WF0 [['id', 'dr_mutations_pyrazinamide']] -print("dim of dr_muts_df:", dr_muts_df.shape) +print('dim of dr_muts_df:', dr_muts_df.shape) # add freq column dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') #dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count') -print("revised dim of dr_muts_df:", dr_muts_df.shape) +print('revised dim of dr_muts_df:', dr_muts_df.shape) c1 = dr_muts_df.dr_sample_freq.value_counts() print('counting no. 
of sample frequency:\n', c1)
-print('======================================================================')
+print('=================================================================')

 # sanity check: length of pnca samples
 if len(dr_pnca_WF0) == c1.sum():
-    print('PASS: WF data has expected length',
-          '\nlength of dr_pnca WFO:', c1.sum() )
+    print('PASS: WF data has expected length'
+          , '\nlength of dr_pnca WF0:', c1.sum()
+          , '\n===============================================================')
 else:
-    print("FAIL: Debug please!")
-
-print('======================================================================')
+    print('FAIL: Debug please!'
+          , '\n===============================================================')

 #!!! Important !!!
-# Assign "column name" on which split was performed as an extra column
+# Assign 'column name' on which split was performed as an extra column
 # This is so you can identify if mutations are dr_type or other in the final df
 dr_df = dr_pnca_WF0.assign(mutation_info = 'dr_mutations_pyrazinamide')
-print("dim of dr_df:", dr_df.shape)
-print('======================================================================')
-print('End of tidy split() on dr_muts, and added an extra column relecting mut_category')
-print('======================================================================')
-
+print('dim of dr_df:', dr_df.shape
+      , '\n=============================================================='
+      , '\nEnd of tidy split() on dr_muts, and added an extra column reflecting mut_category'
+      , '\n===============================================================')
 #%%
 #=========
 # DF2: other_mutations_pyrazinamdie
 #=========
 ########
 # tidy_split(): on 'other_mutations_pyrazinamide' column and remove leading white spaces
 ########
 col_to_split2 = 'other_mutations_pyrazinamide'
-print ("applying second tidy split separately on df:", meta_pnca_other.shape, '\n'
-       , "column name:", col_to_split2)
-print('======================================================================')
+print ('applying second tidy split separately on df:', meta_pnca_other.shape
+       , '\ncolumn name:', col_to_split2
+       , '\n============================================================')

 # apply tidy_split()
 other_WF1 = tidy_split(meta_pnca_other, col_to_split2, sep = ';')
@@ -510,45 +515,46 @@ print('lengths after tidy split and extracting', gene_match, 'muts:',
           '\nexpected len:', other_pnca_count)

 if len(other_pnca_WF1) == other_pnca_count:
-    print('PASS: length of dr_pnca_WF0 match with expected length')
+    print('PASS: length of other_pnca_WF1 matches expected length'
+          , '\n===============================================================')
 else:
-    print('FAIL: lengths mismatch')
-
-print('======================================================================')
-# count the freq of "other muts" samples
+    print('FAIL: lengths mismatch'
+          , '\n===============================================================')
+# count the freq of 'other muts' samples
 other_muts_df = other_pnca_WF1 [['id', 'other_mutations_pyrazinamide']]
-print("dim of other_muts_df:", other_muts_df.shape)
+print('dim of other_muts_df:', other_muts_df.shape)

 # add freq column
 other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count')
-print("revised dim of other_muts_df:", other_muts_df.shape)
+print('revised dim of other_muts_df:', other_muts_df.shape)

 c2 = other_muts_df.other_sample_freq.value_counts()
 print('counting no. 
of sample frequency:\n', c2) -print('======================================================================') +print('=================================================================') # sanity check: length of pnca samples if len(other_pnca_WF1) == c2.sum(): - print('PASS: WF data has expected length', - '\nlength of other_pnca WFO:', c2.sum() ) + print('PASS: WF data has expected length' + , '\nlength of other_pnca WFO:', c2.sum() + , '\n===============================================================') else: - print("FAIL: Debug please!") -print('======================================================================') + print('FAIL: Debug please!' + , '\n===============================================================') #!!! Important !!! -# Assign "column name" on which split was performed as an extra column +# Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df other_df = other_pnca_WF1.assign(mutation_info = 'other_mutations_pyrazinamide') -print("dim of other_df:", other_df.shape) -print('======================================================================') -print('End of tidy split() on other_muts, and added an extra column relecting mut_category') -print('======================================================================') +print('dim of other_df:', other_df.shape + , '\n===============================================================' + , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' + , '\n===============================================================') #%% #========== # Concatentating the two dfs: equivalent of rbind in R #========== #!!! important !!! # change column names to allow concat: -# dr_muts.. & other_muts : "mutation" +# dr_muts.. 
& other_muts : 'mutation' print('Now concatenating the two dfs by row') dr_df.columns @@ -559,39 +565,40 @@ other_df.columns other_df.rename(columns = {'other_mutations_pyrazinamide': 'mutation'}, inplace = True) other_df.columns -print('======================================================================') -print('Now appending the two dfs:', - '\ndr_df dim:', dr_df.shape, - '\nother_df dim:', other_df.shape, - '\ndr_df length:', len(dr_df), - '\nother_df length:', len(other_df), - '\nexpected length:', len(dr_df) + len(other_df) ) -print('======================================================================') +print('=================================================================') +print('Now appending the two dfs:' + , '\ndr_df dim:', dr_df.shape + , '\nother_df dim:', other_df.shape + , '\ndr_df length:', len(dr_df) + , '\nother_df length:', len(other_df) + , '\nexpected length:', len(dr_df) + len(other_df) + , '\n=============================================================') # checking colnames before concat -print("checking colnames BEFORE concatenating the two dfs...") +print('checking colnames BEFORE concatenating the two dfs...') if (set(dr_df.columns) == set(other_df.columns)): print('PASS: column names match necessary for merging two dfs') else: - print("FAIL: Debug please!") + print('FAIL: Debug please!') # concatenate (axis = 0): Rbind pnca_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) # checking colnames and length after concat -print("checking colnames AFTER concatenating the two dfs...") +print('checking colnames AFTER concatenating the two dfs...') if (set(dr_df.columns) == set(pnca_LF0.columns)): print('PASS: column names match') else: - print("FAIL: Debug please!") + print('FAIL: Debug please!') -print("checking length AFTER concatenating the two dfs...") +print('checking length AFTER concatenating the two dfs...') if len(pnca_LF0) == len(dr_df) + len(other_df): - print("PASS:length of df after concat match") + print('PASS:length of df after concat match' + , '\n===============================================================') else: - print("FAIL: length mismatch") -print('======================================================================') + print('FAIL: length mismatch' + , '\n===============================================================') #%% ########### # This is hopefully clean data, but just double check @@ -612,53 +619,57 @@ if len(pnca_LF0) == len(pnca_LF1): print('PASS: length of pnca_LF0 and pnca_LF1 match', '\nconfirming extraction and concatenating worked correctly') else: - print('FAIL: BUT NOT FATAL!', - '\npnca_LF0 and pnca_LF1 lengths differ', - '\nsuggesting error in extraction process' - ' use pnca_LF1 for downstreama analysis') -print('======================================================================') -print('length of dfs pre and post processing...', - '\npnca WF data (unique samples in each row):', extracted_pnca_samples, - '\npnca LF data (unique mutation in each row):', len(pnca_LF1)) -print('======================================================================') + print('FAIL: BUT NOT FATAL!' + , '\npnca_LF0 and pnca_LF1 lengths differ' + , '\nsuggesting error in extraction process' + , ' use pnca_LF1 for downstreama analysis' + , '\n=========================================================') +print('length of dfs pre and post processing...' 
+ , '\npnca WF data (unique samples in each row):', extracted_pnca_samples + , '\npnca LF data (unique mutation in each row):', len(pnca_LF1) + , '\n=============================================================') + #%% # final sanity check print('Verifying whether extraction process worked correctly...') if len(pnca_LF1) == expected_rows: - print('PASS: extraction process performed correctly', - '\nexpected length:', expected_rows, - '\ngot:', len(pnca_LF1), - '\nRESULT: Total no. of mutant sequences for logo plot:', len(pnca_LF1) ) - + print('PASS: extraction process performed correctly' + , '\nexpected length:', expected_rows + , '\ngot:', len(pnca_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(pnca_LF1) + , '\n=========================================================') else: - print('FAIL: extraction process has bugs', - '\nexpected length:', expected_rows, - '\ngot:', len(pnca_LF1), - '\Debug please') + print('FAIL: extraction process has bugs' + , '\nexpected length:', expected_rows + , '\ngot:', len(pnca_LF1) + , ', \Debug please' + , '\n=========================================================') #%% print('Perfmorning some more sanity checks...') # From LF1: # no. of unique muts distinct_muts = pnca_LF1.mutation.value_counts() -print("Distinct mutations:", len(distinct_muts)) +print('Distinct mutations:', len(distinct_muts)) # no. of samples contributing the unique muta pnca_LF1.id.nunique() -print("No.of samples contributing to distinct muts:", pnca_LF1.id.nunique() ) +print('No.of samples contributing to distinct muts:', pnca_LF1.id.nunique() ) # no. of dr and other muts mut_grouped = pnca_LF1.groupby('mutation_info').mutation.nunique() -print("No.of unique dr and other muts:", pnca_LF1.groupby('mutation_info').mutation.nunique() ) +print('No.of unique dr and other muts:', pnca_LF1.groupby('mutation_info').mutation.nunique() ) # sanity check if len(distinct_muts) == mut_grouped.sum() : - print("PASS:length of LF1 is as expected ") + print('PASS:length of LF1 is as expected' + , '\n===============================================================') else: - print('FAIL: Mistmatch in count of muts', - '\nexpected count:', len(distinct_muts), - '\nactual count:', mut_grouped.sum(), - '\nmuts should be distinct within dr* and other* type', - '\ninspecting ...') + print('FAIL: Mistmatch in count of muts' + , '\nexpected count:', len(distinct_muts) + , '\nactual count:', mut_grouped.sum() + , '\nmuts should be distinct within dr* and other* type' + , '\ninspecting ...' 
+          , '\n=========================================================')
     muts_split = list(pnca_LF1.groupby('mutation_info'))
     dr_muts = muts_split[0][1].mutation
     other_muts = muts_split[1][1].mutation
@@ -670,36 +681,40 @@
 # sanity check: There should not be any common muts
 # i.e the same mutation cannot be classed as a 'drug' AND 'others'
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category')
+    print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category'
+          , '\n===============================================================')
 else:
-    print('PASS: NO ambiguous muts detected',
-          '\nMuts are unique to dr_ and other_ mutation class')
+    print('PASS: NO ambiguous muts detected'
+          , '\nMuts are unique to dr_ and other_ mutation class'
+          , '\n=========================================================')

 # inspect dr_muts and other muts
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
-    print('Finding ambiguous muts...',
-          '\n==========================================================',
-          '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum(),
-          '\nThese are:\n', dr_muts[dr_muts.isin(other_muts)],
-          '\n==========================================================',
-          '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum(),
-          '\nThese are:\n', other_muts[other_muts.isin(dr_muts)],
-          '\n==========================================================')
+    print('Finding ambiguous muts...'
+          , '\n========================================================='
+          , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum()
+          , '\nThese are:\n', dr_muts[dr_muts.isin(other_muts)]
+          , '\n========================================================='
+          , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
+          , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
+          , '\n=========================================================')
 else:
-    print('Error: ambiguous muts present, but extraction failed. Debug!')
+    print('Error: ambiguous muts present, but extraction failed. Debug!'
+          , '\n===============================================================')

-print('======================================================================')
 print('Counting no. of ambiguous muts...')
 if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique():
     common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
     print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)),
-          'list of ambiguous mutations (see below):', *common_muts, sep = '\n')
+          'list of ambiguous mutations (see below):', *common_muts
+          , '\n=========================================================', sep = '\n')
 else:
-    print('Error: ambiguous muts detected, but extraction failed. Debug!',
-          '\nNo. of ambiguous muts in dr:', len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() ),
-          '\nNo. of ambiguous muts in other:', len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()))
+    print('Error: ambiguous muts detected, but extraction failed. Debug!'
+          , '\nNo. of ambiguous muts in dr:', len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
+          , '\nNo. 
of ambiguous muts in other:', len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\n=========================================================') + #%% clear variables del(id_dr, id_other, meta_data, meta_pnca_dr, meta_pnca_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_pnca_count, pnca_LF0, pnca_na) @@ -723,18 +738,21 @@ print('Writing file: ambiguous muts', inspect = pnca_LF1[pnca_LF1['mutation'].isin(common_muts)] inspect.to_csv(outfile1) -print('Finished writing:', out_filename1, - '\nNo. of rows:', len(inspect), - '\nNo. of cols:', len(inspect.columns), - '\nNo. of rows = no. of samples with the ambiguous muts present:', dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()) -print('======================================================================') +print('Finished writing:', out_filename1 + , '\nNo. of rows:', len(inspect) + , '\nNo. of cols:', len(inspect.columns) + , '\nNo. of rows = no. of samples with the ambiguous muts present:' + , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() + , '\n=============================================================') + del(out_filename1) - #%% read aa dict and pull relevant info -print('Reading aa dict and fetching1 letter aa code', - '\nFormatting mutation in mCSM style format: {WT}{MUT}', - '\nAdding aa properties') +print('Reading aa dict and fetching1 letter aa code' + , '\nFormatting mutation in mCSM style format: {WT}{MUT}' + , '\nAdding aa properties' + , '\n============================================================') + #=========== # Split 'mutation' column into three: wild_type, position and # mutant_type separately. Then map three letter code to one using @@ -743,6 +761,7 @@ print('Reading aa dict and fetching1 letter aa code', # Second: convert to mutation to lowercase for compatibility with dict #=========== pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower() + #======= # Iterate through the dict, create a lookup dict i.e # lookup_dict = {three_letter_code: one_letter_code}. 
@@ -751,6 +770,7 @@ pnca_LF1['mutation'] = pnca_LF1.loc[:, 'mutation'].str.lower() # The three letter code is extracted using a string match match from the dataframe and then converted # to 'pandas series'since map only works in pandas series #======= + lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['one_letter_code'] @@ -764,7 +784,7 @@ pnca_LF1['position'] = pnca_LF1['mutation'].str.extract(r'(\d+)') # clear variables del(k, v, wt, mut, lookup_dict) -#print('======================================================================') + #========= # iterate through the dict, create a lookup dict that i.e # lookup_dict = {three_letter_code: aa_prop_water} @@ -785,7 +805,7 @@ for k, v in my_aa_dict.items(): # clear variables del(k, v, wt, mut, lookup_dict) -#print('======================================================================') + #======== # iterate through the dict, create a lookup dict that i.e # lookup_dict = {three_letter_code: aa_prop_polarity} @@ -806,7 +826,6 @@ for k, v in my_aa_dict.items(): # clear variables del(k, v, wt, mut, lookup_dict) -#print('======================================================================') #======== # iterate through the dict, create a lookup dict that i.e @@ -827,7 +846,6 @@ del(k, v, wt, mut, lookup_dict) # added two more cols # clear variables #del(k, v, wt, mut, lookup_dict) -#print('======================================================================') #======== # iterate through the dict, create a lookup dict that i.e @@ -847,15 +865,16 @@ for k, v in my_aa_dict.items(): # added two more cols # clear variables del(k, v, wt, mut, lookup_dict) -print('======================================================================') + ######## # combine the wild_type+poistion+mutant_type columns to generate # Mutationinformation (matches mCSM output field) # Remember to use .map(str) for int col types to allow string concatenation ######### pnca_LF1['Mutationinformation'] = pnca_LF1['wild_type'] + pnca_LF1.position.map(str) + pnca_LF1['mutant_type'] -print('Created column: Mutationinformation') -print('======================================================================') +print('Created column: Mutationinformation' + , '\n===============================================================') + #%% Write file: mCSM muts snps_only = pd.DataFrame(pnca_LF1['Mutationinformation'].unique()) snps_only.head() @@ -867,47 +886,48 @@ pos_only = pd.DataFrame(pnca_LF1['position'].unique()) print('Checking NA in snps...')# should be 0 if snps_only.Mutationinformation.isna().sum() == 0: - print ('PASS: NO NAs/missing entries for SNPs') + print ('PASS: NO NAs/missing entries for SNPs' + , '\n===============================================================') else: - print('FAIL: SNP has NA, Possible mapping issues from dict?', - '\nDebug please!') -print('======================================================================') + print('FAIL: SNP has NA, Possible mapping issues from dict?' + , '\nDebug please!' + , '\n=========================================================') out_filename2 = gene.lower() + '_' + 'mcsm_snps.csv' outfile2 = outdir + '/' + out_filename2 -print('Writing file: mCSM style muts', - '\nFilename:', out_filename2, - '\nPath:', outdir, - '\nmutation format (SNP): {WT}{MUT}', - '\nNo. of distinct muts:', len(snps_only), - '\nNo. of distinct positions:', len(pos_only)) +print('Writing file: mCSM style muts' + , '\nFilename:', out_filename2 + , '\nPath:', outdir + , '\nmutation format (SNP): {WT}{MUT}' + , '\nNo. 
of distinct muts:', len(snps_only) + , '\nNo. of distinct positions:', len(pos_only) + , '\n=============================================================') snps_only.to_csv(outfile2, header = False, index = False) -print('Finished writing:', out_filename2, - '\nNo. of rows:', len(snps_only), - '\nNo. of cols:', len(snps_only.columns)) -print('======================================================================') +print('Finished writing:', out_filename2 + , '\nNo. of rows:', len(snps_only) + , '\nNo. of cols:', len(snps_only.columns) + , '\n=============================================================') del(out_filename2) - #%% Write file: pnca_metadata (i.e pnca_LF1) # where each row has UNIQUE mutations NOT unique sample ids out_filename3 = gene.lower() + '_' + 'metadata.csv' outfile3 = outdir + '/' + out_filename3 -print('Writing file: LF formatted data', - '\nFilename:', out_filename3, - '\nPath:', outdir) +print('Writing file: LF formatted data' + , '\nFilename:', out_filename3 + , '\nPath:', outdir + , '\n============================================================') pnca_LF1.to_csv(outfile3, header = True, index = False) -print('Finished writing:', out_filename3, - '\nNo. of rows:', len(pnca_LF1), - '\nNo. of cols:', len(pnca_LF1.columns) ) -print('======================================================================') +print('Finished writing:', out_filename3 + , '\nNo. of rows:', len(pnca_LF1) + , '\nNo. of cols:', len(pnca_LF1.columns) + , '\n=============================================================') del(out_filename3) - #%% write file: mCSM style but with repitions for MSA and logo plots all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation']) all_muts_msa.head() @@ -930,11 +950,12 @@ all_muts_msa_sorted.head() print('Checking NA in snps...')# should be 0 if all_muts_msa.Mutationinformation.isna().sum() == 0: - print ('PASS: NO NAs/missing entries for SNPs') + print ('PASS: NO NAs/missing entries for SNPs' + , '\n===============================================================') else: - print('FAIL: SNP has NA, Possible mapping issues from dict?', - '\nDebug please!') -print('======================================================================') + print('FAIL: SNP has NA, Possible mapping issues from dict?' + , '\nDebug please!' + , '\n=========================================================') out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv' outfile4 = outdir + '/' + out_filename4 @@ -948,12 +969,12 @@ print('Writing file: mCSM style muts for msa', all_muts_msa_sorted.to_csv(outfile4, header = False, index = False) -print('Finished writing:', out_filename4, - '\nNo. of rows:', len(all_muts_msa), - '\nNo. of cols:', len(all_muts_msa.columns) ) -print('======================================================================') -del(out_filename4) +print('Finished writing:', out_filename4 + , '\nNo. of rows:', len(all_muts_msa) + , '\nNo. of cols:', len(all_muts_msa.columns) + , '\n=============================================================') +del(out_filename4) #%% write file for mutational positions # count how many positions this corresponds to @@ -971,22 +992,22 @@ pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True) out_filename5 = gene.lower() + '_' + 'mutational_positons.csv' outfile5 = outdir + '/' + out_filename5 -print('Writing file: mutational positions', - '\nNo. of distinct positions:', len(pos_only_sorted), - '\nFilename:', out_filename5, - '\nPath:', outdir) +print('Writing file: mutational positions' + , '\nNo. 
of distinct positions:', len(pos_only_sorted) + , '\nFilename:', out_filename5 + , '\nPath:', outdir + , '\n=============================================================') pos_only_sorted.to_csv(outfile5, header = True, index = False) -print('Finished writing:', out_filename5, - '\nNo. of rows:', len(pos_only_sorted), - '\nNo. of cols:', len(pos_only_sorted.columns) ) -print('======================================================================') +print('Finished writing:', out_filename5 + , '\nNo. of rows:', len(pos_only_sorted) + , '\nNo. of cols:', len(pos_only_sorted.columns) + , '\n=============================================================') + del(out_filename5) - - -#%% end of script +#======================================================================= print(u'\u2698' * 50, '\nEnd of script: Data extraction and writing files' '\n' + u'\u2698' * 50 ) -#%% +#%% end of script diff --git a/meta_data_analysis/dssp_df.py b/meta_data_analysis/dssp_df.py index 5a5b479..7b59cfa 100755 --- a/meta_data_analysis/dssp_df.py +++ b/meta_data_analysis/dssp_df.py @@ -55,13 +55,12 @@ indir = datadir + '/' + drug + '/' + 'output' in_filename = gene.lower() +'.dssp' infile = indir + '/' + in_filename print('Input filename:', in_filename - , '\nInput path:', indir) + , '\nInput path:', indir + , '\n============================================================') # specify PDB chain my_chain = 'A' -print('======================================================================') - #======= # output #======= @@ -70,9 +69,9 @@ out_filename = gene.lower() + '_dssp.csv' outfile = outdir + '/' + out_filename print('Output filename:', out_filename , '\nOutput path:', outdir - ,'\nOutfile: ', outfile) + , '\nOutfile: ', outfile + , '\n=============================================================') -print('======================================================================') #%% end of variable assignment for input and output files #======================================================================= # Process dssp output and extract into df @@ -96,14 +95,15 @@ dssp_df.columns #%% Write ouput csv file print('Writing file:', outfile , '\nFilename:', out_filename - , '\nPath:', outdir) + , '\nPath:', outdir + , '\n=============================================================') # write to csv dssp_df.to_csv(outfile, header=True, index = False) print('Finished writing:', out_filename , '\nNo. of rows:', len(dssp_df) - , '\nNo. of cols:', len(dssp_df.columns)) -print('======================================================================') + , '\nNo. 
of cols:', len(dssp_df.columns) + , '\n==============================================================') #%% end of script #======================================================================= diff --git a/meta_data_analysis/kd_df.py b/meta_data_analysis/kd_df.py index 0fd1ccc..2566cf8 100644 --- a/meta_data_analysis/kd_df.py +++ b/meta_data_analysis/kd_df.py @@ -56,9 +56,8 @@ indir = datadir + '/' + drug + '/' + 'input' in_filename = '3pl1.fasta.txt' infile = indir + '/' + in_filename print('Input filename:', in_filename - , '\nInput path:', indir) - -print('======================================================================') + , '\nInput path:', indir + , '\n============================================================') #======= # output @@ -67,9 +66,8 @@ outdir = datadir + '/' + drug + '/' + 'output' out_filename = gene.lower() + '_kd.csv' outfile = outdir + '/' + out_filename print('Output filename:', out_filename - , '\nOutput path:', outdir) - -print('======================================================================') + , '\nOutput path:', outdir + , '\n=============================================================') #%% end of variable assignment for input and output files #======================================================================= #%%specify window size for hydropathy profile computation @@ -96,7 +94,7 @@ print('Sequence Length:', num_residues) print('kd_values Length:',len(kd_values)) print('Window Length:', my_window) print('Window Offset:', offset) -print('======================================================================') +print('=================================================================') print('Checking:len(kd values) is as expected for the given window size & offset...') expected_length = num_residues - (my_window - offset) if len(kd_values) == expected_length: @@ -104,9 +102,8 @@ if len(kd_values) == expected_length: else: print('FAIL: length mismatch' ,'\nExpected length:', expected_length - ,'\nActual length:', len(kd_values)) - -print('======================================================================') + ,'\nActual length:', len(kd_values) + , '\n=========================================================') #%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df # which will allow easy merging of the two dfs. @@ -138,10 +135,11 @@ kd_df = pd.concat([dfSeq, dfVals], axis = 1) #============================ kd_df = kd_df.rename_axis('position') kd_df.head -print('======================================================================') +print('=================================================================') -print('position col i.e. index should be numeric') -print('======================================================================') +print('position col i.e. 
index should be numeric + , '\n===============================================================') + if kd_df.index.dtype == 'int64': print('PASS: position col is numeric' , '\ndtype is:', kd_df.index.dtype) @@ -150,19 +148,20 @@ else: , '\nConverting to numeric') kd_df.index.astype('int64') print('Checking dtype for after conversion:\n' - ,'\ndtype is:', kd_df.index.dtype) - -print('======================================================================') + , '\ndtype is:', kd_df.index.dtype + , '\n=========================================================') #%% write file print('Writing file:', out_filename , '\nFilename:', out_filename - , '\nPath:', outdir) + , '\nPath:', outdir + , '\n=============================================================') kd_df.to_csv(outfile, header = True, index = True) print('Finished writing:', out_filename , '\nNo. of rows:', len(kd_df) - , '\nNo. of cols:', len(kd_df.columns)) + , '\nNo. of cols:', len(kd_df.columns) + , '\n=============================================================') #%% plot # http://www.dalkescientific.com/writings/NBN/plotting.html @@ -176,7 +175,6 @@ xlabel('Residue Number') ylabel('Hydrophobicity') title('K&D Hydrophobicity for ' + id) show() - print('======================================================================') #%% end of script #======================================================================= diff --git a/meta_data_analysis/mut_electrostatic_changes.py b/meta_data_analysis/mut_electrostatic_changes.py index bb193f0..20ea86a 100755 --- a/meta_data_analysis/mut_electrostatic_changes.py +++ b/meta_data_analysis/mut_electrostatic_changes.py @@ -46,7 +46,8 @@ indir = datadir + '/' + drug + '/' + 'input' in_filename = 'merged_df3.csv' infile = outdir + '/' + in_filename print('Input filename: ', in_filename - , '\nInput path: ', indir) + , '\nInput path: ', indir + , '\n============================================================') #======= # output @@ -56,7 +57,8 @@ outdir = datadir + '/' + drug + '/' + 'output' out_filename = 'mut_elec_changes.txt' outfile = outdir + '/' + out_filename print('Output filename: ', out_filename - , '\nOutput path: ', outdir) + , '\nOutput path: ', outdir + , '\n============================================================') #%% end of variable assignment for input and output files #======================================================================= @@ -65,10 +67,11 @@ print('Reading input file (merged file):', infile) comb_df = pd.read_csv(infile, sep = ',') -print('Input filename: ', in_filename, - '\nPath :', outdir, - '\nNo. of rows: ', len(comb_df), - '\nNo. of cols: ', infile) +print('Input filename: ', in_filename + , '\nPath :', outdir + , '\nNo. of rows: ', len(comb_df) + , '\nNo. of cols: ', infile + , '\n============================================================') # column names list(comb_df.columns) @@ -81,15 +84,18 @@ df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first') total_muts = df.Mutationinformation.nunique() #df.Mutationinformation.count() -print('Total mutations associated with structure: ', total_muts) +print('Total mutations associated with structure: ', total_muts + , '\n===============================================================') #%% combine aa_calcprop cols so that you can count the changes as value_counts # check if all muts have been categorised print('Checking if all muts have been categorised: ') if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum(): - print('PASS: No. 
NA detected i.e all muts have aa prop associated') + print('PASS: No. NA detected i.e all muts have aa prop associated' + , '\n===============================================================') else: - print('FAIL: NAs detected i.e some muts remain unclassified') + print('FAIL: NAs detected i.e some muts remain unclassified' + , '\n===============================================================') df['wt_calcprop'].head() df['mut_calcprop'].head() @@ -151,11 +157,11 @@ print('======================\n' , '\n============================\n' , all_prop_change) -print('========================================================================' +print('=================================================================' , '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes , '\nTotal no. of muts: ', total_muts , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum() , '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum() -, '\n=========================================================================') +, '\n===================================================================') #%% end of script #======================================================================= diff --git a/meta_data_analysis/rd_df.py b/meta_data_analysis/rd_df.py index b0921b8..0074959 100755 --- a/meta_data_analysis/rd_df.py +++ b/meta_data_analysis/rd_df.py @@ -48,9 +48,8 @@ indir = datadir + '/' + drug + '/' + 'output' in_filename = '3pl1_rd.tsv' infile = indir + '/' + in_filename print('Input filename:', in_filename - , '\nInput path:', indir) - -print('======================================================================') + , '\nInput path:', indir + , '\n=============================================================') #======= # output #======= @@ -58,9 +57,9 @@ outdir = datadir + '/' + drug + '/' + 'output' out_filename = gene.lower() + '_rd.csv' outfile = outdir + '/' + out_filename print('Output filename:', out_filename - , '\nOutput path:', outdir) - -print('======================================================================') + , '\nOutput path:', outdir + , '\n=============================================================') + #%% end of variable assignment for input and output files #======================================================================= #%% Read input file @@ -69,9 +68,8 @@ print('Reading input file:', infile , '\nNo. of rows:', len(rd_data) , '\nNo. of cols:', len(rd_data.columns)) -print('Column names:', rd_data.columns) - -print('======================================================================') +print('Column names:', rd_data.columns + , '\n===============================================================') #======================== # creating position col #======================== @@ -85,19 +83,18 @@ rd_data['position'].dtype print('Extracted residue num from index and assigned as a column:' , '\ncolumn name: position' - , '\ntotal no. of cols now:', len(rd_data.columns)) -print('======================================================================') + , '\ntotal no. of cols now:', len(rd_data.columns) + , '\n=============================================================') #======================== # Renaming amino-acid # and all-atom cols #======================== print('Renaming columns:' - ,'\ncolname==> # chain:residue: wt_3letter_caps' - ,'\nYES... the column name *actually* contains a # ..!' 
- ,'\ncolname==> all-atom: rd_values') - -print('======================================================================') + , '\ncolname==> # chain:residue: wt_3letter_caps' + , '\nYES... the column name *actually* contains a # ..!' + , '\ncolname==> all-atom: rd_values' + , '\n=============================================================') rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True) print('Column names:', rd_data.columns) @@ -117,22 +114,22 @@ if len(rd_df) == len(rd_data): ,'\nNo. of cols:', len(rd_df.columns)) else: print('FAIL: no. of rows mimatch' - ,'\nExpected no. of rows:', len(rd_data) - ,'\nGot no. of rows:', len(rd_df)) - -print('======================================================================') + , '\nExpected no. of rows:', len(rd_data) + , '\nGot no. of rows:', len(rd_df) + , '\n=========================================================') #%% write file print('Writing file:' , '\nFilename:', out_filename - , '\nPath:', outdir) + , '\nPath:', outdir + , '\n=============================================================') rd_df.to_csv(outfile, header = True, index = False) -print('======================================================================') print('Finished writing:', out_filename , '\nNo. of rows:', len(rd_df) - , '\nNo. of cols:', len(rd_df.columns)) + , '\nNo. of cols:', len(rd_df.columns) + , '\n=============================================================') #%% end of script #======================================================================= diff --git a/meta_data_analysis/reference_dict.py b/meta_data_analysis/reference_dict.py index 7a6e3f6..0461523 100644 --- a/meta_data_analysis/reference_dict.py +++ b/meta_data_analysis/reference_dict.py @@ -44,7 +44,8 @@ indir = datadir + '/' + drug + 'input' in_filename = 'aa_codes.csv' infile = indir + '/' + in_filename print('Input filename:', in_filename - , '\nInput path:', indir) + , '\nInput path:', indir + , '\n============================================================') #======= # output: No output