diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 15f35e0..98a4415 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -21,30 +21,22 @@ Created on Tue Aug 6 12:56:03 2019 # where each row is a separate mutation # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique -# NOTE -#drtype is renamed to 'resistance' in the 35k dataset # output files: all lower case -# 0) _common_ids.csv -# 1) _ambiguous_muts.csv -# 2) _mcsm_snps.csv -# 3) _metadata.csv -# 4) _all_muts_msa.csv -# 5) _mutational_positons.csv +# 1) _gwas.csv +# 2) _common_ids.csv +# 3) _ambiguous_muts.csv +# 4) _mcsm_snps.csv +# 5) _metadata_poscounts.csv +# 6) _metadata.csv +# 7) _all_muts_msa.csv +# 8) _mutational_positons.csv -# FIXME -## Make all cols lowercase -## change WildPos: wild_pos -## Add an extra col: wild_chain_pos -## output df: _linking_df.csv -#containing the following cols -#1. Mutationinformation -#2. wild_type -#3. position -#4. mutant_type -#5. chain -#6. wild_pos -#7. wild_chain_pos +#------------ +# NOTE +#----------- +# drtype is renamed to 'resistance' in the 35k dataset +# all colnames in the ouput files lowercase #------------- # requires @@ -66,9 +58,6 @@ os.getcwd() os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.getcwd() -# Requires -from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! -from tidy_split import tidy_split #======================================================================= #%% command line args arg_parser = argparse.ArgumentParser() @@ -77,6 +66,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') @@ -89,9 +79,10 @@ gene = args.gene datadir = args.datadir indir = args.input_dir outdir = args.output_dir +make_dirs = args.make_dirs -#drug = 'pyrazinamide' -#gene = 'pncA' +#drug = 'ethambutol' +#gene = 'embB' #%% input and output dirs and files #======= @@ -106,19 +97,46 @@ if not indir: if not outdir: outdir = datadir + '/' + drug + '/output' +if make_dirs: + print('make_dirs is turned on, creating data dir:', datadir) + try: + os.makedirs(datadir, exist_ok = True) + print("Directory '%s' created successfully" %datadir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating indir:', indir) + try: + os.makedirs(indir, exist_ok = True) + print("Directory '%s' created successfully" %indir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating outdir:', outdir) + try: + os.makedirs(outdir, exist_ok = True) + print("Directory '%s' created successfully" %outdir) + except OSError as error: + print("Directory '%s' can not be created") + # handle missing dirs here if not os.path.isdir(datadir): print('ERROR: Data directory does not exist:', datadir - , '\nPlease create and ensure gwas data is present and then rerun') + , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs') sys.exit() if not os.path.isdir(indir): print('ERROR: Input directory does not exist:', indir - , '\nPlease either create or specify indir and rerun') + , '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs') sys.exit() if not os.path.isdir(outdir): print('ERROR: Output directory does not exist:', outdir - , '\nPlease create or specify outdir and rerun') + , '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs') sys.exit() + +# Requires +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +from tidy_split import tidy_split + #======================================================================= gene_match = gene + '_p.' print('mut pattern for gene', gene, ':', gene_match) @@ -230,21 +248,17 @@ print('RESULT: Total samples:', total_samples # counts NA per column meta_data.isna().sum() print('No. of NAs/column:' + '\n', meta_data.isna().sum() - , '\n===========================================================') + , '\n===========================================================\n') #%% Write check file check_file = outdir + '/' + gene.lower() + '_gwas.csv' meta_data.to_csv(check_file, index = False) -print('Writing subsetted gwas data' +print('\n----------------------------------' + , '\nWriting subsetted gwas data:', gene + , '\n----------------------------------' , '\nFile', check_file , '\nDim:', meta_data.shape) -# glance -#meta_data.head() -#total_samples - NA pyrazinamide = ? -# 19K: 19265-6754 = 12511 -# 33K: 33681 - 23823 = 9858 - # equivalent of table in R # drug counts: complete samples for OR calcs meta_data[drug].value_counts() @@ -309,7 +323,7 @@ print('=================================================================') del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) #======== -# Second: counting mutations in dr_muts_col column +# Second: counting mutations in other_muts_col column #======== print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col) # drop na and extract a clean df @@ -355,7 +369,7 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g expected_rows = dr_gene_count + other_gene_count #del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt) -del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt ) +del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt ) #%% ############ @@ -398,7 +412,7 @@ meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_mat print('gene_snp_match in dr:', len(meta_gene_dr)) dr_id = meta_gene_dr['id'].unique() -print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) +print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id)) if len(id_dr) == len(meta_gene_dr): print('PASS: lengths match' @@ -448,7 +462,7 @@ meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contai print('gene_snp_match in other:', len(meta_gene_other)) other_id = meta_gene_other['id'].unique() -print('RESULT: No. of samples with other muts:', len(other_id)) +print('RESULT: No. of samples with other muts in', gene, ':', len(other_id)) if len(id_other) == len(meta_gene_other): print('PASS: lengths match' @@ -485,6 +499,7 @@ common_ids2.columns = ['index', 'id2'] # should be True if common_ids['id'].equals(common_ids2['id2']): print('PASS: Further cross checks on common ids') + nu_common_ids = common_ids.nunique() else: sys.exit('FAIL: Further cross checks on common ids') @@ -496,8 +511,9 @@ print('Expected no. of gene samples:', expected_gene_samples #print(outdir) out_filename_cid = gene.lower() + '_common_ids.csv' outfile_cid = outdir + '/' + out_filename_cid - -print('Writing file:' +print('\n----------------------------------' + '\nWriting file: common_ids' + '\n----------------------------------' , '\nFile:', outfile_cid , '\nNo. of rows:', len(common_ids) , '\n=============================================================') @@ -517,7 +533,7 @@ print('Extracting nsSNP match:', gene, 'mutations from cols:\n' meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ] extracted_gene_samples = meta_gene_all['id'].nunique() -print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples +print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples , '\n===================================================================') # sanity check: length of gene samples @@ -773,8 +789,7 @@ print('Length of dfs pre and post processing...' , '\ngene WF data (unique samples in each row):', extracted_gene_samples , '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\n=============================================================') - - + #%% sanity check for extraction # This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows' print('Verifying whether extraction process worked correctly...') @@ -865,7 +880,11 @@ else: , '\n=========================================================') #%% clear variables -del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na) +del(id_dr, id_other + #, meta_data + , meta_gene_dr, meta_gene_other + #, mut_grouped + , muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na) del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1) @@ -877,7 +896,9 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts -print('Writing file: ambiguous muts' +print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' , '\nFilename:', outfile_ambig_muts) inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] @@ -1163,7 +1184,9 @@ else: out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv' outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps -print('Writing file: mCSM style muts' +print('\n----------------------------------' + , '\nWriting file: mCSM style muts' + , '\n----------------------------------' , '\nFile:', outfile_mcsmsnps , '\nmutation format (SNP): {WT}{MUT}' , '\nNo. of distinct muts:', len(snps_only) @@ -1191,7 +1214,9 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T # where each row has UNIQUE mutations NOT unique sample ids out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts -print('Writing file: Metadata poscounts' +print('\n----------------------------------' + , 'Writing file: Metadata poscounts' + , '\n----------------------------------' , '\nFile:', outfile_metadata_poscounts , '\n============================================================') @@ -1205,9 +1230,12 @@ del(out_filename_metadata_poscounts) #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids -out_filename_metadata = gene.lower() + '_metadata_raw.csv' +#out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale? +out_filename_metadata = gene.lower() + '_metadata.csv' outfile_metadata = outdir + '/' + out_filename_metadata -print('Writing file: LF formatted data' +print('\n----------------------------------' + , '\nWriting file: LF formatted data' + , '\n----------------------------------' , '\nFile:', outfile_metadata , '\n============================================================') @@ -1248,10 +1276,12 @@ else: out_filename_msa = gene.lower() +'_all_muts_msa.csv' outfile_msa = outdir + '/' + out_filename_msa -print('Writing file: mCSM style muts for msa', - '\nFile:', outfile_msa, - '\nmutation format (SNP): {WT}{MUT}', - '\nNo.of lines of msa:', len(all_muts_msa)) +print('\n----------------------------------' + , '\nWriting file: mCSM style muts for msa' + , '\n----------------------------------' + , '\nFile:', outfile_msa + , '\nmutation format (SNP): {WT}{MUT}' + , '\nNo.of lines of msa:', len(all_muts_msa)) all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) @@ -1278,7 +1308,9 @@ pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True) out_filename_pos = gene.lower() + '_mutational_positons.csv' outfile_pos = outdir + '/' + out_filename_pos -print('Writing file: mutational positions' +print('\n----------------------------------' + , 'Writing file: mutational positions' + , '\n----------------------------------' , '\nFile:', outfile_pos , '\nNo. of distinct positions:', len(pos_only_sorted) , '\n=============================================================') @@ -1293,15 +1325,39 @@ print('Finished writing:', outfile_pos del(out_filename_pos) #%% quick summary output +#print('============================================' +# , '\nQuick summary output for', drug, 'and' , gene.lower() +# , '\n============================================' +# , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique() +# , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique() +# , '\nTotal no. of samples with missense muts:', len(gene_LF1) +# , '\n=============================================================' +# , '\n\n\n') + print('============================================' - , '\nQuick summary output for', gene.lower() + , '\nQuick summary output for', drug, 'and' , gene.lower() , '\n============================================' + , '\nTotal samples:', total_samples + , '\nNo. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() + , '\n' + , '\nPercentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , '\n' + , '\nTotal no.of unique dr muts:', mut_grouped[dr_muts_col] + , '\nTotal no.of unique other muts:', mut_grouped[other_muts_col] , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique() , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique() , '\nTotal no. of samples with missense muts:', len(gene_LF1) + , '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique() + , '\n' + , '\nTotal no.of samples with common_ids:', nu_common_ids['id'] + , '\nTotal no.of samples with ambiguous muts:', len(inspect) + #, '\nTotal no.of unique ambiguous muts:', len(common_muts) + , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique() , '\n=============================================================' , '\n\n\n') + + #======================================================================= print(u'\u2698' * 50, '\nEnd of script: Data extraction and writing files'