diff --git a/scripts/combine_dfs.py b/scripts/combine_dfs.py index ea5e1e3..261e514 100755 --- a/scripts/combine_dfs.py +++ b/scripts/combine_dfs.py @@ -7,29 +7,29 @@ Created on Tue Aug 6 12:56:03 2019 ''' # FIXME: change filename 4 (mcsm normalised data) # to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline -#============================================================================= +#======================================================================= # Task: combine 4 dfs with aa position as linking column # This is done in 2 steps: # merge 1: of 3 dfs (filenames in lowercase) # _dssp.csv # _kd.csv -# _pnca_rd.csv +# _rd.csv # merge 2: of 2 dfs -# pnca_complex_mcsm_norm.csv (!fix name) +# gene.lower() + '_complex_mcsm_norm.csv' (!fix name) # output df from merge1 # Input: 3 dfs # _dssp.csv # _kd.csv -# _pnca_rd.csv -# pnca_complex_mcsm_norm.csv (!fix name) +# _rd.csv +# gene.lower() + '_complex_mcsm_norm.csv' (!fix name) # Output: .csv of all 4 dfs combined # useful link # https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns -#============================================================================= +#======================================================================= #%% load packages import sys, os import pandas as pd @@ -46,10 +46,10 @@ os.getcwd() #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() -#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') -#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive -arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazin') -arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pn') # case sensitive +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG') +#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output @@ -68,11 +68,11 @@ datadir = homedir + '/' + 'git/Data' # input #======= indir = datadir + '/' + drug + '/' + 'output' -in_filename1 = 'pnca_dssp.csv' -in_filename2 = 'pnca_kd.csv' -in_filename3 = 'pnca_rd.csv' +in_filename1 = gene.lower() + '_dssp.csv' +in_filename2 = gene.lower() + '_kd.csv' +in_filename3 = gene.lower() + '_rd.csv' #in_filename4 = 'mcsm_complex1_normalised.csv' # FIXNAME -in_filename4 = 'pnca_complex_mcsm_norm.csv' +in_filename4 = gene.lower() + '_complex_mcsm_norm.csv' infile1 = indir + '/' + in_filename1 infile2 = indir + '/' + in_filename2 diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 451d6cf..863ab67 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -47,8 +47,10 @@ from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() -arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') -arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG') +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output paths & filenames @@ -346,7 +348,7 @@ print('expected no. of gene samples:', expected_gene_samples) print('=================================================================') #%% write file #print(outdir) -out_filename0 = gene.lower() + '_' + 'common_ids.csv' +out_filename0 = gene.lower() + '_common_ids.csv' outfile0 = outdir + '/' + out_filename0 #FIXME: CHECK line len(common_ids) @@ -741,7 +743,7 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m #dr_muts.to_csv('dr_muts.csv', header = True) #other_muts.to_csv('other_muts.csv', header = True) -out_filename1 = gene.lower() + '_' + 'ambiguous_muts.csv' +out_filename1 = gene.lower() + '_ambiguous_muts.csv' outfile1 = outdir + '/' + out_filename1 print('Writing file: ambiguous muts', '\nFilename:', out_filename1, @@ -1053,7 +1055,7 @@ else: , '\nDebug please!' , '\n=========================================================') -out_filename2 = gene.lower() + '_' + 'mcsm_snps.csv' +out_filename2 = gene.lower() + '_mcsm_snps.csv' outfile2 = outdir + '/' + out_filename2 print('Writing file: mCSM style muts' @@ -1074,7 +1076,7 @@ del(out_filename2) #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids -out_filename3 = gene.lower() + '_' + 'metadata.csv' +out_filename3 = gene.lower() + '_metadata.csv' outfile3 = outdir + '/' + out_filename3 print('Writing file: LF formatted data' , '\nFilename:', out_filename3 @@ -1117,7 +1119,7 @@ else: , '\nDebug please!' , '\n=========================================================') -out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv' +out_filename4 = gene.lower() +'_all_muts_msa.csv' outfile4 = outdir + '/' + out_filename4 print('Writing file: mCSM style muts for msa', @@ -1149,7 +1151,7 @@ pos_only.position.dtype # sort by position value pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True) -out_filename5 = gene.lower() + '_' + 'mutational_positons.csv' +out_filename5 = gene.lower() + '_mutational_positons.csv' outfile5 = outdir + '/' + out_filename5 print('Writing file: mutational positions' diff --git a/scripts/dssp_df.py b/scripts/dssp_df.py index ff97aa7..c2d8795 100755 --- a/scripts/dssp_df.py +++ b/scripts/dssp_df.py @@ -30,8 +30,10 @@ os.getcwd() #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() -arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') -arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') +#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG') +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output diff --git a/scripts/rd_df.py b/scripts/rd_df.py index 3b5ad1f..50c84eb 100755 --- a/scripts/rd_df.py +++ b/scripts/rd_df.py @@ -33,8 +33,8 @@ os.getcwd() arg_parser = argparse.ArgumentParser() #arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') #arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive -arg_parser.add_argument('-d', '--drug', help='drug name', default = 'DRUGNAME') -arg_parser.add_argument('-g', '--gene', help='gene name', default = 'geneName') +arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG') +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive args = arg_parser.parse_args() #======================================================================= #%% variable assignment: input and output