diff --git a/scripts/combine_dfs.py b/scripts/combine_dfs.py
index 261e514..a6c78ba 100755
--- a/scripts/combine_dfs.py
+++ b/scripts/combine_dfs.py
@@ -46,10 +46,8 @@ os.getcwd()
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
-arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
-#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
-#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
+arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name', default = None) # case sensitive
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output
@@ -101,178 +99,178 @@ print('Output filename:', out_filename
 #%% function/method to combine 4 dfs
 def combine_dfs(dssp_csv, kd_csv, rd_csv, mcsm_csv, out_combined_csv):
-    """
-    Combine 4 dfs
+    """
+    Combine 4 dfs
 
-    @param dssp_df: csv file (output from dssp_df.py)
-    @type dssp_df: string
+    @param dssp_df: csv file (output from dssp_df.py)
+    @type dssp_df: string
 
-    @param kd_df: csv file (output from kd_df.py)
-    @type ks_df: string
-
-    @param rd_df: csv file (output from rd_df.py)
-    @type rd_df: string
+    @param kd_df: csv file (output from kd_df.py)
+    @type kd_df: string
+
+    @param rd_df: csv file (output from rd_df.py)
+    @type rd_df: string
 
-    # FIXME
-    @param mcsm_df: csv file (output of mcsm pipeline)CHECK}
-    @type mcsm_df: string
+    # FIXME
+    @param mcsm_df: csv file (output of mcsm pipeline) CHECK
+    @type mcsm_df: string
 
-    @param out_combined_csv: csv file output
-    @type out_combined_csv: string
-
-    @return: none, writes combined df as csv
-    """
-    #========================
-    # read input csv files to combine
-    #========================
-    dssp_df = pd.read_csv(dssp_csv, sep = ',')
-    kd_df = pd.read_csv(kd_csv, sep = ',')
-    rd_df = pd.read_csv(rd_csv, sep = ',')
-    mcsm_df = pd.read_csv(mcsm_csv, sep = ',')
+    @param out_combined_csv: csv file output
+    @type out_combined_csv: string
+
+    @return: none, writes combined df as csv
+    """
+    #========================
+    # read input csv files to combine
+    #========================
+    dssp_df = pd.read_csv(dssp_csv, sep = ',')
+    kd_df = pd.read_csv(kd_csv, sep = ',')
+    rd_df = pd.read_csv(rd_csv, sep = ',')
+    mcsm_df = pd.read_csv(mcsm_csv, sep = ',')
 
-    print('Reading input files:'
-    , '\ndssp file:', dssp_csv
-    , '\nNo. of rows:', len(dssp_df)
-    , '\nNo. of cols:', len(dssp_df.columns)
-    , '\nColumn names:', dssp_df.columns
-    , '\n==================================================================='
-    , '\nkd file:', kd_csv
-    , '\nNo. of rows:', len(kd_df)
-    , '\nNo. of cols:', len(kd_df.columns)
-    , '\nColumn names:', kd_df.columns
-    , '\n==================================================================='
-    , '\nrd file:', rd_csv
-    , '\nNo. of rows:', len(rd_df)
-    , '\nNo. of cols:', len(rd_df.columns)
-    , '\nColumn names:', rd_df.columns
-    , '\n==================================================================='
-    , '\nrd file:', mcsm_csv
-    , '\nNo. of rows:', len(mcsm_df)
-    , '\nNo. of cols:', len(mcsm_df.columns)
-    , '\nColumn names:', mcsm_df.columns
-    , '\n===================================================================')
-
-    #========================
-    # merge 1 (combined_df1)
-    # concatenating 3dfs:
-    # dssp_df, kd_df, rd_df
-    #========================
-    print('starting first merge...\n')
+    print('Reading input files:'
+    , '\ndssp file:', dssp_csv
+    , '\nNo. of rows:', len(dssp_df)
+    , '\nNo. of cols:', len(dssp_df.columns)
+    , '\nColumn names:', dssp_df.columns
+    , '\n==================================================================='
+    , '\nkd file:', kd_csv
+    , '\nNo. of rows:', len(kd_df)
+    , '\nNo. of cols:', len(kd_df.columns)
+    , '\nColumn names:', kd_df.columns
+    , '\n==================================================================='
+    , '\nrd file:', rd_csv
+    , '\nNo. of rows:', len(rd_df)
+    , '\nNo. of cols:', len(rd_df.columns)
+    , '\nColumn names:', rd_df.columns
+    , '\n==================================================================='
+    , '\nmcsm file:', mcsm_csv
+    , '\nNo. of rows:', len(mcsm_df)
+    , '\nNo. of cols:', len(mcsm_df.columns)
+    , '\nColumn names:', mcsm_df.columns
+    , '\n===================================================================')
+
+    #========================
+    # merge 1 (combined_df1)
+    # concatenating 3 dfs:
+    # dssp_df, kd_df, rd_df
+    #========================
+    print('starting first merge...\n')
 
-    # checking no. of rows
-    print('Checking if no. of rows of the 3 dfs are equal:\n'
-    , len(dssp_df) == len(kd_df) == len(rd_df)
-    , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
-    , '\n===================================================================')
+    # checking no. of rows
+    print('Checking if no. of rows of the 3 dfs are equal:\n'
+    , len(dssp_df) == len(kd_df) == len(rd_df)
+    , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
+    , '\n===================================================================')
 
-    # variables for sanity checks
-    expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
-    # beware of harcoding! used for sanity check
-    ndfs = 3
-    ncol_merge = 1
-    offset = ndfs- ncol_merge
-    expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
+    # variables for sanity checks
+    expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
+    # beware of hardcoding! used for sanity check
+    ndfs = 3
+    ncol_merge = 1
+    offset = ndfs - ncol_merge
+    expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
 
-    print('Merge 1:'
-    , '\ncombining 3dfs by commom col: position'
-    , '\nExpected nrows in combined_df:', expected_rows_df1
-    , '\nExpected ncols in combined_df:', expected_cols_df1
-    , '\nResetting the common col as the index'
-    , '\n===================================================================')
+    print('Merge 1:'
+    , '\ncombining 3 dfs by common col: position'
+    , '\nExpected nrows in combined_df:', expected_rows_df1
+    , '\nExpected ncols in combined_df:', expected_cols_df1
+    , '\nResetting the common col as the index'
+    , '\n===================================================================')
 
-    #dssp_df.set_index('position', inplace = True)
-    #kd_df.set_index('position', inplace = True)
-    #rd_df.set_index('position', inplace =True)
+    #dssp_df.set_index('position', inplace = True)
+    #kd_df.set_index('position', inplace = True)
+    #rd_df.set_index('position', inplace = True)
 
-    #combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
-    #combined_df.rename(columns = {'index':'position'})
+    #combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
+    #combined_df.rename(columns = {'index':'position'})
 
-    combined_df1 = pd.concat(
-    (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df])
-    , axis = 1, join = 'outer').reset_index()
+    combined_df1 = pd.concat(
+    (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df])
+    , axis = 1, join = 'outer').reset_index()
 
-    # sanity check
-    print('Checking dimensions of concatenated df1...')
-    if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
-        print('PASS: combined df has expected dimensions'
-        , '\nNo. of rows in combined df:', len(combined_df1)
-        , '\nNo. of cols in combined df:', len(combined_df1.columns)
-        , '\n===============================================================')
-    else:
-        print('FAIL: combined df does not have expected dimensions'
-        , '\nNo. of rows in combined df:', len(combined_df1)
-        , '\nNo. of cols in combined df:', len(combined_df1.columns)
-        , '\n===============================================================')
+    # sanity check
+    print('Checking dimensions of concatenated df1...')
+    if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
+        print('PASS: combined df has expected dimensions'
+        , '\nNo. of rows in combined df:', len(combined_df1)
+        , '\nNo. of cols in combined df:', len(combined_df1.columns)
+        , '\n===============================================================')
+    else:
+        print('FAIL: combined df does not have expected dimensions'
+        , '\nNo. of rows in combined df:', len(combined_df1)
+        , '\nNo. of cols in combined df:', len(combined_df1.columns)
+        , '\n===============================================================')
 
-    #========================
-    # merge 2 (combined_df2)
-    # concatenating 2dfs:
-    # mcsm_df, combined_df1 (result of merge1)
-    # sort the cols
-    #========================
-    print('starting second merge...\n')
-
-    # rename col 'Position' in mcsm_df to lowercase 'position'
-    # as it matches the combined_df1 colname to perfom merge
-
-    #mcsm_df.columns
-    #mcsm_df.rename(columns = {'Position':'position'}) # not working!
-    # copy 'Position' column with the correct colname
-    print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging'
-    , '\nNo. of cols before copying: ', len(mcsm_df.columns))
+    #========================
+    # merge 2 (combined_df2)
+    # concatenating 2 dfs:
+    # mcsm_df, combined_df1 (result of merge1)
+    # sort the cols
+    #========================
+    print('starting second merge...\n')
+
+    # rename col 'Position' in mcsm_df to lowercase 'position'
+    # as it matches the combined_df1 colname to perform merge
+
+    #mcsm_df.columns
+    #mcsm_df.rename(columns = {'Position':'position'}) # not working! rename returns a copy unless reassigned or inplace = True
+    # copy 'Position' column with the correct colname
+    print('First, copying \'Position\' col as \'position\' to allow merging'
+    , '\nNo. of cols before copying: ', len(mcsm_df.columns))
 
-    mcsm_df['position'] = mcsm_df['Position']
-    print('No. of cols after copying: ', len(mcsm_df.columns))
+    mcsm_df['position'] = mcsm_df['Position']
+    print('No. of cols after copying: ', len(mcsm_df.columns))
 
-    # sanity check
-    if mcsm_df['position'].equals(mcsm_df['Position']):
-        print('PASS: Copying worked correctly'
-        , '\ncopied col matches original column'
-        , '\n===============================================================')
-    else:
-        print('FAIL: copied col does not match original column'
-        , '\n================================================================')
+    # sanity check
+    if mcsm_df['position'].equals(mcsm_df['Position']):
+        print('PASS: Copying worked correctly'
+        , '\ncopied col matches original column'
+        , '\n===============================================================')
+    else:
+        print('FAIL: copied col does not match original column'
+        , '\n================================================================')
 
-    # variables for sanity checks
-    expected_rows_df2 = len(mcsm_df)
-    # beware of harcoding! used for sanity check
-    ndfs = 2
-    ncol_merge = 1
-    offset = ndfs - ncol_merge
-    expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset
+    # variables for sanity checks
+    expected_rows_df2 = len(mcsm_df)
+    # beware of hardcoding! used for sanity check
+    ndfs = 2
+    ncol_merge = 1
+    offset = ndfs - ncol_merge
+    expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset
 
-    print('Merge 2:'
-    , '\ncombining 2dfs by commom col: position'
-    , '\nExpected nrows in combined_df:', expected_rows_df2
-    , '\nExpected ncols in combined_df:', expected_cols_df2
-    , '\n===================================================================')
+    print('Merge 2:'
+    , '\ncombining 2 dfs by common col: position'
+    , '\nExpected nrows in combined_df:', expected_rows_df2
+    , '\nExpected ncols in combined_df:', expected_cols_df2
+    , '\n===================================================================')
 
-    combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
+    combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
-    # sanity check
-    print('Checking dimensions of concatenated df2...')
-    if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
-        print('PASS: combined df2 has expected dimensions'
-        , '\nNo. of rows in combined df:', len(combined_df2)
-        , '\nNo. of cols in combined df:', len(combined_df2.columns)
-        , '\n===============================================================')
-    else:
-        print('FAIL: combined df2 does not have expected dimensions'
-        , '\nNo. of rows in combined df:', len(combined_df2)
-        , '\nNo. of cols in combined df:', len(combined_df2.columns)
-        , '\n===============================================================')
-
-    #===============
-    # writing file
-    #===============
-    print('Writing file:'
-    , '\nFilename:', out_combined_csv
-#    , '\nPath:', outdir
-    , '\nExpected no. of rows:', len(combined_df2)
-    , '\nExpected no. of cols:', len(combined_df2.columns)
-    , '\n=========================================================')
+    # sanity check
+    print('Checking dimensions of concatenated df2...')
+    if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
+        print('PASS: combined df2 has expected dimensions'
+        , '\nNo. of rows in combined df:', len(combined_df2)
+        , '\nNo. of cols in combined df:', len(combined_df2.columns)
+        , '\n===============================================================')
+    else:
+        print('FAIL: combined df2 does not have expected dimensions'
+        , '\nNo. of rows in combined df:', len(combined_df2)
+        , '\nNo. of cols in combined df:', len(combined_df2.columns)
+        , '\n===============================================================')
+
+    #===============
+    # writing file
+    #===============
+    print('Writing file:'
+    , '\nFilename:', out_combined_csv
+#    , '\nPath:', outdir
+    , '\nExpected no. of rows:', len(combined_df2)
+    , '\nExpected no. of cols:', len(combined_df2.columns)
+    , '\n=========================================================')
 
-    combined_df2.to_csv(out_combined_csv, header = True, index = False)
+    combined_df2.to_csv(out_combined_csv, header = True, index = False)
 
 #%% end of function
 #=======================================================================
@@ -280,19 +278,18 @@ def combine_dfs(dssp_csv, kd_csv, rd_csv, mcsm_csv, out_combined_csv):
 #combine_dfs(infile1, infile2, infile3, infile4, outfile)
 #=======================================================================
 def main():
-    print('Combining 4 dfs:\n'
-    , in_filename1, '\n'
-    , in_filename2, '\n'
-    , in_filename3, '\n'
-    , in_filename4, '\n'
-    , 'output csv:', out_filename)
-    combine_dfs(infile1, infile2, infile3, infile4, outfile)
-    print('Finished Writing file:'
-    , '\nFilename:', out_filename
-    , '\nPath:', outdir
-##    , '\nNo. of rows:', ''
-##    , '\nNo. of cols:', ''
-    , '\n===========================================================')
+    print('Combining 4 dfs:\n'
+    , in_filename1, '\n'
+    , in_filename2, '\n'
+    , in_filename3, '\n'
+    , in_filename4, '\n'
+    , 'output csv:', out_filename)
+    combine_dfs(infile1, infile2, infile3, infile4, outfile)
+    print('Finished writing file:'
+    , '\nFilename:', outfile
+##    , '\nNo. of rows:', ''
+##    , '\nNo. of cols:', ''
+    , '\n===========================================================')
 
 if __name__ == '__main__':
     main()
diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index e734e55..578036c 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -57,8 +57,8 @@ args = arg_parser.parse_args()
 drug = args.drug
 gene = args.gene
-gene_match = gene + '_p.'
+
 # building cols to extract
 dr_muts_col = 'dr_mutations_' + drug
 other_muts_col = 'other_mutations_' + drug
@@ -80,8 +80,8 @@ datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
-#in_filename = 'original_tanushree_data_v2.csv'
-in_filename = 'mtb_gwas_v3.csv'
+in_filename = 'original_tanushree_data_v2.csv'
+#in_filename = 'mtb_gwas_v3.csv'
 infile = datadir + '/' + in_filename
 print('Input file: ', infile
 , '\n============================================================')
@@ -1028,25 +1028,25 @@ del(k, v, wt, mut, lookup_dict)
 
 ########
 # combine the wild_type+position+mutant_type columns to generate
-# Mutationinformation (matches mCSM output field)
+# mutationinformation (matches mCSM output field)
 # Remember to use .map(str) for int col types to allow string concatenation
 #########
-gene_LF1['Mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
-print('Created column: Mutationinformation'
+gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
+print('Created column: mutationinformation'
 , '\n====================================================================='
-    , gene_LF1.Mutationinformation.head(10))
+    , gene_LF1.mutationinformation.head(10))
 
 #%% Write file: mCSM muts
-snps_only = pd.DataFrame(gene_LF1['Mutationinformation'].unique())
+snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
 snps_only.head()
 # assign column name
-snps_only.columns = ['Mutationinformation']
+snps_only.columns = ['mutationinformation']
 # count how many positions this corresponds to
 pos_only = pd.DataFrame(gene_LF1['position'].unique())
 
 print('Checking NA in snps...')# should be 0
-if snps_only.Mutationinformation.isna().sum() == 0:
+if snps_only.mutationinformation.isna().sum() == 0:
     print ('PASS: NO NAs/missing entries for SNPs'
     , '\n===============================================================')
 else:
@@ -1090,27 +1090,27 @@ print('Finished writing:', out_filename3
 del(out_filename3)
 
 #%% write file: mCSM style but with repetitions for MSA and logo plots
-all_muts_msa = pd.DataFrame(gene_LF1['Mutationinformation'])
+all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation'])
 all_muts_msa.head()
 # assign column name
-all_muts_msa.columns = ['Mutationinformation']
+all_muts_msa.columns = ['mutationinformation']
 # make sure it is string
 all_muts_msa.columns.dtype
 
 # sort the column
-all_muts_msa_sorted = all_muts_msa.sort_values(by = 'Mutationinformation')
+all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
 
 # create an extra column with protein name
 all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
 all_muts_msa_sorted.head()
 
 # rearrange columns so the fasta name is the first column (required for mutate.script)
-all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'Mutationinformation']]
+all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
 all_muts_msa_sorted.head()
 
 print('Checking NA in snps...')# should be 0
-if all_muts_msa.Mutationinformation.isna().sum() == 0:
+if all_muts_msa.mutationinformation.isna().sum() == 0:
     print ('PASS: NO NAs/missing entries for SNPs'
     , '\n===============================================================')
 else:
diff --git a/scripts/dssp_df.py b/scripts/dssp_df.py
index c2d8795..3f94149 100755
--- a/scripts/dssp_df.py
+++ b/scripts/dssp_df.py
@@ -30,10 +30,8 @@ os.getcwd()
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
-#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
-arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
-arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
+arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) # case sensitive
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output
@@ -49,6 +47,8 @@ args = arg_parser.parse_args()
 drug = args.drug
 gene = args.gene
 
+gene_match = gene + '_p.'
+
 #==========
 # data dir
 #==========
@@ -147,7 +147,7 @@ def extract_chain_dssp(inputpdbfile):
     return pdbchainlist
 #=======================================================================
 #%% write csv of processed dssp output
-def dssp_to_csv(inputdsspfile, outfile, pdbchainlist):
+def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']):
     """
     Create a df from a dssp file containing ASA, RSA, SS for all chains
diff --git a/scripts/kd_df.py b/scripts/kd_df.py
index 2c3bd48..75fbcd5 100755
--- a/scripts/kd_df.py
+++ b/scripts/kd_df.py
@@ -39,10 +39,8 @@ os.getcwd()
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
-#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
-arg_parser.add_argument('-d', '--drug', help='drug name', default = 'DRUGNAME')
-arg_parser.add_argument('-g', '--gene', help='gene name', default = 'geneName')
+arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name', default = None)
 #arg_parser.add_argument('-p', '--plot', help='show plot', action='store_true')
 args = arg_parser.parse_args()
 #=======================================================================
@@ -81,8 +79,8 @@ print('Output filename:', out_filename
 #%% end of variable assignment for input and output files
 #=======================================================================
 #%% kd values from fasta file and output csv
-def kd_to_csv(inputfasta, outputkdcsv, windowsize):
-    """
+def kd_to_csv(inputfasta, outputkdcsv, windowsize = 3):
+    """
     Calculate kd (hydropathy values) from input fasta file
 
     @param inputfasta: fasta file
@@ -96,138 +94,137 @@ def kd_to_csv(inputfasta, outputkdcsv, windowsize):
     @return: none, writes kd values df as csv
     """
-    #========================
-    # read input fasta file
-    #========================
-    fh = open(inputfasta)
+    #========================
+    # read input fasta file
+    #========================
+    fh = open(inputfasta)
 
-    for record in SeqIO.parse(fh, 'fasta'):
-        id = record.id
-        seq = record.seq
-        num_residues = len(seq)
-    fh.close()
+    for record in SeqIO.parse(fh, 'fasta'):
+        id = record.id
+        seq = record.seq
+        num_residues = len(seq)
+    fh.close()
 
-    sequence = str(seq)
-    X = ProteinAnalysis(sequence)
+    sequence = str(seq)
+    X = ProteinAnalysis(sequence)
 
-    #===================
+    #===================
     # calculate KD values: same as the expasy server
     #===================
-    my_window = windowsize
-    offset = round((my_window/2)-0.5)
-    # edge weight is set to default (100%)
-
-    kd_values = (X.protein_scale(ProtParamData.kd , window = my_window))
-    # sanity checks
-    print('Sequence Length:', num_residues)
-    print('kd_values Length:',len(kd_values))
-    print('Window Length:', my_window)
-    print('Window Offset:', offset)
-    print('=================================================================')
-    print('Checking:len(kd values) is as expected for the given window size & offset...')
-    expected_length = num_residues - (my_window - offset)
-    if len(kd_values) == expected_length:
-        print('PASS: expected and actual length of kd values match')
-    else:
-        print('FAIL: length mismatch'
-        ,'\nExpected length:', expected_length
-        ,'\nActual length:', len(kd_values)
-        , '\n=========================================================')
+    my_window = windowsize
+    offset = round((my_window/2)-0.5)
+    # edge weight is set to default (100%)
+
+    kd_values = (X.protein_scale(ProtParamData.kd, window = my_window))
+    # sanity checks
+    print('Sequence Length:', num_residues)
+    print('kd_values Length:', len(kd_values))
+    print('Window Length:', my_window)
+    print('Window Offset:', offset)
+    print('=================================================================')
+    print('Checking: len(kd_values) is as expected for the given window size & offset...')
+    expected_length = num_residues - (my_window - offset)
+    if len(kd_values) == expected_length:
+        print('PASS: expected and actual length of kd values match')
+    else:
+        print('FAIL: length mismatch'
+        ,'\nExpected length:', expected_length
+        ,'\nActual length:', len(kd_values)
+        , '\n=========================================================')
 
-    #===================
-    # creating two dfs
-    #===================
-    # 1) aa sequence and 2) kd_values. Then reset index for each df
-    # which will allow easy merging of the two dfs.
+    #===================
+    # creating two dfs
+    #===================
+    # 1) aa sequence and 2) kd_values. Then reset index for each df
+    # which will allow easy merging of the two dfs.
 
-    # df1: df of aa seq with index reset to start from 1
-    # (reflective of the actual aa position in a sequence)
-    # Name column of wt as 'wild_type' to be the same name used
-    # in the file required for merging later.
-    dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
-    dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
+    # df1: df of aa seq with index reset to start from 1
+    # (reflective of the actual aa position in a sequence)
+    # Name column of wt as 'wild_type' to be the same name used
+    # in the file required for merging later.
+    dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
+    dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
 
-    # df2: df of kd_values with index reset to start from offset + 1 and
-    # subsequent matched length of the kd_values
-    dfVals = pd.DataFrame({'kd_values':kd_values})
-    dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
+    # df2: df of kd_values with index reset to start from offset + 1 and
+    # subsequent matched length of the kd_values
+    dfVals = pd.DataFrame({'kd_values':kd_values})
+    dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
 
-    # sanity checks
-    max(dfVals['kd_values'])
-    min(dfVals['kd_values'])
-
-    #===================
-    # concatenating dfs
-    #===================
-    # Merge the two on index
-    # (as these are now reflective of the aa position numbers): df1 and df2
-    # This will introduce NaN where there is missing values. In our case this
-    # will be 2 (first and last ones based on window size and offset)
+    # sanity checks
+    max(dfVals['kd_values'])
+    min(dfVals['kd_values'])
+
+    #===================
+    # concatenating dfs
+    #===================
+    # Merge the two on index
+    # (as these are now reflective of the aa position numbers): df1 and df2
+    # This will introduce NaN where there are missing values. In our case this
+    # will be 2 (first and last ones based on window size and offset)
 
-    kd_df = pd.concat([dfSeq, dfVals], axis = 1)
+    kd_df = pd.concat([dfSeq, dfVals], axis = 1)
 
-    #============================
-    # renaming index to position
-    #============================
-    kd_df = kd_df.rename_axis('position')
-    kd_df.head
+    #============================
+    # renaming index to position
+    #============================
+    kd_df = kd_df.rename_axis('position')
+    kd_df.head()
 
-    print('Checking: position col i.e. index should be numeric')
-    if kd_df.index.dtype == 'int64':
-        print('PASS: position col is numeric'
-        , '\ndtype is:', kd_df.index.dtype)
-    else:
-        print('FAIL: position col is not numeric'
-        , '\nConverting to numeric')
-        kd_df.index.astype('int64')
-        print('Checking dtype for after conversion:\n'
-        , '\ndtype is:', kd_df.index.dtype
-        , '\n=========================================================')
+    print('Checking: position col i.e. index should be numeric')
+    if kd_df.index.dtype == 'int64':
+        print('PASS: position col is numeric'
+        , '\ndtype is:', kd_df.index.dtype)
+    else:
+        print('FAIL: position col is not numeric'
+        , '\nConverting to numeric')
+        kd_df.index = kd_df.index.astype('int64')
+        print('Checking dtype after conversion:\n'
+        , '\ndtype is:', kd_df.index.dtype
+        , '\n=========================================================')
 
-    #===============
-    # writing file
-    #===============
-    print('Writing file:'
-    , '\nFilename:', outputkdcsv
-#    , '\nPath:', outdir
-    , '\nExpected no. of rows:', len(kd_df)
-    , '\nExpected no. of cols:', len(kd_df.columns)
-    , '\n=============================================================')
+    #===============
+    # writing file
+    #===============
+    print('Writing file:'
+    , '\nFilename:', outputkdcsv
+#    , '\nPath:', outdir
+    , '\nExpected no. of rows:', len(kd_df)
+    , '\nExpected no. of cols:', len(kd_df.columns)
+    , '\n=============================================================')
 
-    kd_df.to_csv(outputkdcsv, header = True, index = True)
-
-    #===============
-    # plot: optional!
-    #===============
-    # http://www.dalkescientific.com/writings/NBN/plotting.html
+    kd_df.to_csv(outputkdcsv, header = True, index = True)
+
+    #===============
+    # plot: optional!
+    #===============
+    # http://www.dalkescientific.com/writings/NBN/plotting.html
 
-    # FIXME: save fig
-    # extract just pdb if from 'id' to pass to title of plot
-    # foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
-# if doplot:
-    plot(kd_values, linewidth = 1.0)
-    #axis(xmin = 1, xmax = num_residues)
-    xlabel('Residue Number')
-    ylabel('Hydrophobicity')
-    title('K&D Hydrophobicity for ' + id)
-    show()
-
+    # FIXME: save fig
+    # extract just pdb id from 'id' to pass to title of plot
+    # foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
+# if doplot:
+    plot(kd_values, linewidth = 1.0)
+    #axis(xmin = 1, xmax = num_residues)
+    xlabel('Residue Number')
+    ylabel('Hydrophobicity')
+    title('K&D Hydrophobicity for ' + id)
+    show()
+
 #%% end of function
 #=======================================================================
 #%% call function
 #kd_to_csv(infile, outfile, windowsize = 3)
 #=======================================================================
 def main():
-    print('Running hydropathy calcs with following params\n'
-    , in_filename
-    , '\noutfile:', out_filename)
-    kd_to_csv(infile, outfile, 3)
-    print('Finished writing file:'
-    , '\nFilename:', out_filename
-    , '\nPath:', outdir
-    , '\n=============================================================')
-
+    print('Running hydropathy calcs with the following params\n'
+    , in_filename
+    , '\noutfile:', out_filename)
+    kd_to_csv(infile, outfile, 3)
+    print('Finished writing file:'
+    , '\nFilename:', outfile
+    , '\n=============================================================')
+
 if __name__ == '__main__':
-    main()
-#%% end of script
+    main()
+#%% end of script
 #=======================================================================
diff --git a/scripts/rd_df.py b/scripts/rd_df.py
index 50c84eb..6228b7b 100755
--- a/scripts/rd_df.py
+++ b/scripts/rd_df.py
@@ -31,10 +31,8 @@ os.getcwd()
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
-#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
-#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
-arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
-arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
+arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help='gene name', default = None) # case sensitive
 args = arg_parser.parse_args()
 #=======================================================================
 #%% variable assignment: input and output
@@ -72,7 +70,7 @@ print('Output filename:', out_filename
 #=======================================================================
 #%% rd values from _rd.tsv values
 def rd_to_csv(inputtsv, outputrdcsv):
-    """
+    """
     Calculate residue depth (rd) values from input tsv file
 
     @param inputtsv: tsv file downloaded from {INSERT LINK}
@@ -83,76 +81,76 @@ def rd_to_csv(inputtsv, outputrdcsv):
     @return: none, writes rd values df as csv
     """
-    #========================
-    # read downloaded tsv file
-    #========================
-    #%% Read input file
-    rd_data = pd.read_csv(inputtsv, sep = '\t')
-    print('Reading input file:', inputtsv
-    , '\nNo. of rows:', len(rd_data)
-    , '\nNo. of cols:', len(rd_data.columns))
+    #========================
+    # read downloaded tsv file
+    #========================
+    #%% Read input file
+    rd_data = pd.read_csv(inputtsv, sep = '\t')
+    print('Reading input file:', inputtsv
+    , '\nNo. of rows:', len(rd_data)
+    , '\nNo. of cols:', len(rd_data.columns))
 
-    print('Column names:', rd_data.columns
-    , '\n===============================================================')
+    print('Column names:', rd_data.columns
+    , '\n===============================================================')
-    #========================
-    # creating position col
-    #========================
-    # Extracting residue number from index and assigning
-    # the values to a column [position]. Then convert the position col to numeric.
-    rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
+    #========================
+    # creating position col
+    #========================
+    # Extracting residue number from index and assigning
+    # the values to a column [position]. Then convert the position col to numeric.
+    rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
 
-    # converting position to numeric
-    rd_data['position'] = pd.to_numeric(rd_data['position'])
-    rd_data['position'].dtype
+    # converting position to numeric
+    rd_data['position'] = pd.to_numeric(rd_data['position'])
+    rd_data['position'].dtype
 
-    print('Extracted residue num from index and assigned as a column:'
-    , '\ncolumn name: position'
-    , '\ntotal no. of cols now:', len(rd_data.columns)
-    , '\n=============================================================')
+    print('Extracted residue num from index and assigned as a column:'
+    , '\ncolumn name: position'
+    , '\ntotal no. of cols now:', len(rd_data.columns)
+    , '\n=============================================================')
 
-    #========================
-    # Renaming amino-acid
-    # and all-atom cols
-    #========================
-    print('Renaming columns:'
-    , '\ncolname==> # chain:residue: wt_3letter_caps'
-    , '\nYES... the column name *actually* contains a # ..!'
-    , '\ncolname==> all-atom: rd_values'
-    , '\n=============================================================')
+    #========================
+    # Renaming amino-acid
+    # and all-atom cols
+    #========================
+    print('Renaming columns:'
+    , '\ncolname==> # chain:residue: wt_3letter_caps'
+    , '\nYES... the column name *actually* contains a # ..!'
+    , '\ncolname==> all-atom: rd_values'
+    , '\n=============================================================')
 
-    rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
-    print('Column names:', rd_data.columns)
+    rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
+    print('Column names:', rd_data.columns)
 
-    #========================
-    # extracting df with the
-    # desired columns
-    #========================
-    print('Extracting relevant columns for writing df as csv')
+    #========================
+    # extracting df with the
+    # desired columns
+    #========================
+    print('Extracting relevant columns for writing df as csv')
 
-    rd_df = rd_data[['position','rd_values','wt_3letter_caps']]
+    rd_df = rd_data[['position','rd_values','wt_3letter_caps']]
 
-    if len(rd_df) == len(rd_data):
-        print('PASS: extracted df has expected no. of rows'
-        ,'\nExtracted df dim:'
-        ,'\nNo. of rows:', len(rd_df)
-        ,'\nNo. of cols:', len(rd_df.columns))
-    else:
-        print('FAIL: no. of rows mimatch'
-        , '\nExpected no. of rows:', len(rd_data)
-        , '\nGot no. of rows:', len(rd_df)
-        , '\n=====================================================')
+    if len(rd_df) == len(rd_data):
+        print('PASS: extracted df has expected no. of rows'
+        ,'\nExtracted df dim:'
+        ,'\nNo. of rows:', len(rd_df)
+        ,'\nNo. of cols:', len(rd_df.columns))
+    else:
+        print('FAIL: no. of rows mismatch'
+        , '\nExpected no. of rows:', len(rd_data)
+        , '\nGot no. of rows:', len(rd_df)
+        , '\n=====================================================')
 
-    #===============
-    # writing file
-    #===============
-    print('Writing file:'
-    , '\nFilename:', outputrdcsv
-#    , '\nPath:', outdir
-#    , '\nExpected no. of rows:', len(rd_df)
-#    , '\nExpected no. of cols:', len(rd_df.columns)
-    , '\n=========================================================')
+    #===============
+    # writing file
+    #===============
+    print('Writing file:'
+    , '\nFilename:', outputrdcsv
+#    , '\nPath:', outdir
+#    , '\nExpected no. of rows:', len(rd_df)
+#    , '\nExpected no. of cols:', len(rd_df.columns)
+    , '\n=========================================================')
 
-    rd_df.to_csv(outputrdcsv, header = True, index = False)
+    rd_df.to_csv(outputrdcsv, header = True, index = False)
 
 #%% end of function
 #=======================================================================
@@ -160,16 +158,15 @@ def rd_to_csv(inputtsv, outputrdcsv):
 #rd_to_csv(infile, outfile)
 #=======================================================================
 def main():
-    print('residue depth using the following params\n'
-    , in_filename
-    , '\noutfile:', out_filename)
-    rd_to_csv(infile, outfile)
-    print('Finished Writing file:'
-    , '\nFilename:', out_filename
-    , '\nPath:', outdir
-    , '\n=============================================================')
+    print('Calculating residue depth with the following params\n'
+    , in_filename
+    , '\noutfile:', out_filename)
+    rd_to_csv(infile, outfile)
+    print('Finished writing file:'
+    , '\nFilename:', outfile
+    , '\n=============================================================')
 
 if __name__ == '__main__':
-    main()
-#%% end of script
+    main()
+#%% end of script
 #=======================================================================
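
#=======================================================================
# Note on merge 1 in combine_dfs.py: the three per-position tables are
# outer-joined on a shared 'position' index, so positions missing from
# the structure still get a row (padded with NaN). A minimal sketch with
# made-up values; the 'asa' column name is a hypothetical stand-in, not
# the real dssp_df.py output.

import pandas as pd

dssp_df = pd.DataFrame({'position': [2, 3], 'asa': [120.5, 85.2]})
kd_df = pd.DataFrame({'position': [1, 2, 3], 'kd_values': [-0.4, 1.8, 2.5]})
rd_df = pd.DataFrame({'position': [2, 3], 'rd_values': [3.1, 2.7]})

combined_df1 = pd.concat(
    (df.set_index('position') for df in [dssp_df, kd_df, rd_df]),
    axis = 1, join = 'outer').reset_index()

# position 1 appears with NaN for asa/rd_values, which is why the script
# expects max(nrows of the three) rows and sum(ncols) - (ndfs - ncol_merge) cols
print(combined_df1)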
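
#=======================================================================
# Note on the '# not working!' rename in combine_dfs.py: DataFrame.rename
# returns a renamed copy by default, so calling it without reassigning
# the result (or passing inplace = True) leaves the frame untouched.
# Sketch with a toy frame:

import pandas as pd

mcsm_df = pd.DataFrame({'Position': [1, 2]})

mcsm_df.rename(columns = {'Position':'position'})            # result discarded: no effect
mcsm_df = mcsm_df.rename(columns = {'Position':'position'})  # works: reassigned
# equivalently: mcsm_df.rename(columns = {'Position':'position'}, inplace = True)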
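
#=======================================================================
# Note on the mutationinformation column in data_extraction.py: string
# concatenation with + fails on an int64 column, hence the .map(str)
# cast on position. Toy rows with hypothetical residues:

import pandas as pd

gene_LF1 = pd.DataFrame({'wild_type': ['A', 'H'],
                         'position': [102, 57],
                         'mutant_type': ['V', 'R']})

gene_LF1['mutationinformation'] = (gene_LF1['wild_type']
                                   + gene_LF1['position'].map(str)
                                   + gene_LF1['mutant_type'])
print(gene_LF1['mutationinformation'].tolist())  # ['A102V', 'H57R'], mCSM-style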
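
#=======================================================================
# Note on the window/offset bookkeeping in kd_df.py: Biopython's
# protein_scale returns one value per complete window, i.e.
# len(seq) - window + 1 values, which for the default window of 3 equals
# the script's num_residues - (my_window - offset). Sketch with a
# made-up toy sequence:

from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis

seq = 'MRALIGLAVLLPLSAQA'              # hypothetical sequence
my_window = 3
offset = round((my_window/2) - 0.5)    # 1 for a window of 3

kd_values = ProteinAnalysis(seq).protein_scale(ProtParamData.kd, window = my_window)
assert len(kd_values) == len(seq) - (my_window - offset)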
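
#=======================================================================
# Note on the position column in rd_df.py: the residue number is pulled
# out of the tsv's index labels with a regex and made numeric. Sketch
# with hypothetical 'chain:residue' style labels; expand = False keeps
# the extracted result 1-D so it assigns cleanly as a column.

import pandas as pd

rd_data = pd.DataFrame({'all-atom': [3.1, 2.7, 4.0]},
                       index = ['A:SER1', 'A:VAL2', 'A:GLY3'])

rd_data['position'] = pd.to_numeric(
    rd_data.index.str.extract(r'([0-9]+)', expand = False))
rd_data.rename(columns = {'all-atom':'rd_values'}, inplace = True)
print(rd_data)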