updated results summary in the data_extraction.py

This commit is contained in:
Tanushree Tunstall 2020-11-12 17:05:29 +00:00
parent f9fd74812a
commit ccdd6029be

View file

@ -21,30 +21,22 @@ Created on Tue Aug 6 12:56:03 2019
# where each row is a separate mutation # where each row is a separate mutation
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
# NOTE
#drtype is renamed to 'resistance' in the 35k dataset
# output files: all lower case # output files: all lower case
# 0) <gene>_common_ids.csv # 1) <gene>_gwas.csv
# 1) <gene>_ambiguous_muts.csv # 2) <gene>_common_ids.csv
# 2) <gene>_mcsm_snps.csv # 3) <gene>_ambiguous_muts.csv
# 3) <gene>_metadata.csv # 4) <gene>_mcsm_snps.csv
# 4) <gene>_all_muts_msa.csv # 5) <gene>_metadata_poscounts.csv
# 5) <gene>_mutational_positons.csv # 6) <gene>_metadata.csv
# 7) <gene>_all_muts_msa.csv
# 8) <gene>_mutational_positons.csv
# FIXME #------------
## Make all cols lowercase # NOTE
## change WildPos: wild_pos #-----------
## Add an extra col: wild_chain_pos # drtype is renamed to 'resistance' in the 35k dataset
## output df: <gene>_linking_df.csv # all colnames in the ouput files lowercase
#containing the following cols
#1. Mutationinformation
#2. wild_type
#3. position
#4. mutant_type
#5. chain
#6. wild_pos
#7. wild_chain_pos
#------------- #-------------
# requires # requires
@ -66,9 +58,6 @@ os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts') os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd() os.getcwd()
# Requires
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
from tidy_split import tidy_split
#======================================================================= #=======================================================================
#%% command line args #%% command line args
arg_parser = argparse.ArgumentParser() arg_parser = argparse.ArgumentParser()
@ -77,6 +66,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output') arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
@ -89,9 +79,10 @@ gene = args.gene
datadir = args.datadir datadir = args.datadir
indir = args.input_dir indir = args.input_dir
outdir = args.output_dir outdir = args.output_dir
make_dirs = args.make_dirs
#drug = 'pyrazinamide' #drug = 'ethambutol'
#gene = 'pncA' #gene = 'embB'
#%% input and output dirs and files #%% input and output dirs and files
#======= #=======
@ -106,19 +97,46 @@ if not indir:
if not outdir: if not outdir:
outdir = datadir + '/' + drug + '/output' outdir = datadir + '/' + drug + '/output'
if make_dirs:
print('make_dirs is turned on, creating data dir:', datadir)
try:
os.makedirs(datadir, exist_ok = True)
print("Directory '%s' created successfully" %datadir)
except OSError as error:
print("Directory '%s' can not be created")
print('make_dirs is turned on, creating indir:', indir)
try:
os.makedirs(indir, exist_ok = True)
print("Directory '%s' created successfully" %indir)
except OSError as error:
print("Directory '%s' can not be created")
print('make_dirs is turned on, creating outdir:', outdir)
try:
os.makedirs(outdir, exist_ok = True)
print("Directory '%s' created successfully" %outdir)
except OSError as error:
print("Directory '%s' can not be created")
# handle missing dirs here # handle missing dirs here
if not os.path.isdir(datadir): if not os.path.isdir(datadir):
print('ERROR: Data directory does not exist:', datadir print('ERROR: Data directory does not exist:', datadir
, '\nPlease create and ensure gwas data is present and then rerun') , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
sys.exit() sys.exit()
if not os.path.isdir(indir): if not os.path.isdir(indir):
print('ERROR: Input directory does not exist:', indir print('ERROR: Input directory does not exist:', indir
, '\nPlease either create or specify indir and rerun') , '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
sys.exit() sys.exit()
if not os.path.isdir(outdir): if not os.path.isdir(outdir):
print('ERROR: Output directory does not exist:', outdir print('ERROR: Output directory does not exist:', outdir
, '\nPlease create or specify outdir and rerun') , '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
sys.exit() sys.exit()
# Requires
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
from tidy_split import tidy_split
#======================================================================= #=======================================================================
gene_match = gene + '_p.' gene_match = gene + '_p.'
print('mut pattern for gene', gene, ':', gene_match) print('mut pattern for gene', gene, ':', gene_match)
@ -230,21 +248,17 @@ print('RESULT: Total samples:', total_samples
# counts NA per column # counts NA per column
meta_data.isna().sum() meta_data.isna().sum()
print('No. of NAs/column:' + '\n', meta_data.isna().sum() print('No. of NAs/column:' + '\n', meta_data.isna().sum()
, '\n===========================================================') , '\n===========================================================\n')
#%% Write check file #%% Write check file
check_file = outdir + '/' + gene.lower() + '_gwas.csv' check_file = outdir + '/' + gene.lower() + '_gwas.csv'
meta_data.to_csv(check_file, index = False) meta_data.to_csv(check_file, index = False)
print('Writing subsetted gwas data' print('\n----------------------------------'
, '\nWriting subsetted gwas data:', gene
, '\n----------------------------------'
, '\nFile', check_file , '\nFile', check_file
, '\nDim:', meta_data.shape) , '\nDim:', meta_data.shape)
# glance
#meta_data.head()
#total_samples - NA pyrazinamide = ?
# 19K: 19265-6754 = 12511
# 33K: 33681 - 23823 = 9858
# equivalent of table in R # equivalent of table in R
# drug counts: complete samples for OR calcs # drug counts: complete samples for OR calcs
meta_data[drug].value_counts() meta_data[drug].value_counts()
@ -309,7 +323,7 @@ print('=================================================================')
del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
#======== #========
# Second: counting <gene_match> mutations in dr_muts_col column # Second: counting <gene_match> mutations in other_muts_col column
#======== #========
print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col) print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)
# drop na and extract a clean df # drop na and extract a clean df
@ -355,7 +369,7 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g
expected_rows = dr_gene_count + other_gene_count expected_rows = dr_gene_count + other_gene_count
#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt) #del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt ) del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
#%% #%%
############ ############
@ -398,7 +412,7 @@ meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_mat
print('gene_snp_match in dr:', len(meta_gene_dr)) print('gene_snp_match in dr:', len(meta_gene_dr))
dr_id = meta_gene_dr['id'].unique() dr_id = meta_gene_dr['id'].unique()
print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id))
if len(id_dr) == len(meta_gene_dr): if len(id_dr) == len(meta_gene_dr):
print('PASS: lengths match' print('PASS: lengths match'
@ -448,7 +462,7 @@ meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contai
print('gene_snp_match in other:', len(meta_gene_other)) print('gene_snp_match in other:', len(meta_gene_other))
other_id = meta_gene_other['id'].unique() other_id = meta_gene_other['id'].unique()
print('RESULT: No. of samples with other muts:', len(other_id)) print('RESULT: No. of samples with other muts in', gene, ':', len(other_id))
if len(id_other) == len(meta_gene_other): if len(id_other) == len(meta_gene_other):
print('PASS: lengths match' print('PASS: lengths match'
@ -485,6 +499,7 @@ common_ids2.columns = ['index', 'id2']
# should be True # should be True
if common_ids['id'].equals(common_ids2['id2']): if common_ids['id'].equals(common_ids2['id2']):
print('PASS: Further cross checks on common ids') print('PASS: Further cross checks on common ids')
nu_common_ids = common_ids.nunique()
else: else:
sys.exit('FAIL: Further cross checks on common ids') sys.exit('FAIL: Further cross checks on common ids')
@ -496,8 +511,9 @@ print('Expected no. of gene samples:', expected_gene_samples
#print(outdir) #print(outdir)
out_filename_cid = gene.lower() + '_common_ids.csv' out_filename_cid = gene.lower() + '_common_ids.csv'
outfile_cid = outdir + '/' + out_filename_cid outfile_cid = outdir + '/' + out_filename_cid
print('\n----------------------------------'
print('Writing file:' '\nWriting file: common_ids'
'\n----------------------------------'
, '\nFile:', outfile_cid , '\nFile:', outfile_cid
, '\nNo. of rows:', len(common_ids) , '\nNo. of rows:', len(common_ids)
, '\n=============================================================') , '\n=============================================================')
@ -517,7 +533,7 @@ print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ] meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
extracted_gene_samples = meta_gene_all['id'].nunique() extracted_gene_samples = meta_gene_all['id'].nunique()
print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples
, '\n===================================================================') , '\n===================================================================')
# sanity check: length of gene samples # sanity check: length of gene samples
@ -774,7 +790,6 @@ print('Length of dfs pre and post processing...'
, '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\ngene LF data (unique mutation in each row):', len(gene_LF1)
, '\n=============================================================') , '\n=============================================================')
#%% sanity check for extraction #%% sanity check for extraction
# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows' # This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
print('Verifying whether extraction process worked correctly...') print('Verifying whether extraction process worked correctly...')
@ -865,7 +880,11 @@ else:
, '\n=========================================================') , '\n=========================================================')
#%% clear variables #%% clear variables
del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na) del(id_dr, id_other
#, meta_data
, meta_gene_dr, meta_gene_other
#, mut_grouped
, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)
del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1) del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)
@ -877,7 +896,9 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
print('Writing file: ambiguous muts' print('\n----------------------------------'
, '\nWriting file: ambiguous muts'
, '\n----------------------------------'
, '\nFilename:', outfile_ambig_muts) , '\nFilename:', outfile_ambig_muts)
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
@ -1163,7 +1184,9 @@ else:
out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv' out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
print('Writing file: mCSM style muts' print('\n----------------------------------'
, '\nWriting file: mCSM style muts'
, '\n----------------------------------'
, '\nFile:', outfile_mcsmsnps , '\nFile:', outfile_mcsmsnps
, '\nmutation format (SNP): {WT}<POS>{MUT}' , '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only) , '\nNo. of distinct muts:', len(snps_only)
@ -1191,7 +1214,9 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T
# where each row has UNIQUE mutations NOT unique sample ids # where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('Writing file: Metadata poscounts' print('\n----------------------------------'
, 'Writing file: Metadata poscounts'
, '\n----------------------------------'
, '\nFile:', outfile_metadata_poscounts , '\nFile:', outfile_metadata_poscounts
, '\n============================================================') , '\n============================================================')
@ -1205,9 +1230,12 @@ del(out_filename_metadata_poscounts)
#%% Write file: gene_metadata (i.e gene_LF1) #%% Write file: gene_metadata (i.e gene_LF1)
# where each row has UNIQUE mutations NOT unique sample ids # where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata_raw.csv' #out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?
out_filename_metadata = gene.lower() + '_metadata.csv'
outfile_metadata = outdir + '/' + out_filename_metadata outfile_metadata = outdir + '/' + out_filename_metadata
print('Writing file: LF formatted data' print('\n----------------------------------'
, '\nWriting file: LF formatted data'
, '\n----------------------------------'
, '\nFile:', outfile_metadata , '\nFile:', outfile_metadata
, '\n============================================================') , '\n============================================================')
@ -1248,10 +1276,12 @@ else:
out_filename_msa = gene.lower() +'_all_muts_msa.csv' out_filename_msa = gene.lower() +'_all_muts_msa.csv'
outfile_msa = outdir + '/' + out_filename_msa outfile_msa = outdir + '/' + out_filename_msa
print('Writing file: mCSM style muts for msa', print('\n----------------------------------'
'\nFile:', outfile_msa, , '\nWriting file: mCSM style muts for msa'
'\nmutation format (SNP): {WT}<POS>{MUT}', , '\n----------------------------------'
'\nNo.of lines of msa:', len(all_muts_msa)) , '\nFile:', outfile_msa
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo.of lines of msa:', len(all_muts_msa))
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
@ -1278,7 +1308,9 @@ pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
out_filename_pos = gene.lower() + '_mutational_positons.csv' out_filename_pos = gene.lower() + '_mutational_positons.csv'
outfile_pos = outdir + '/' + out_filename_pos outfile_pos = outdir + '/' + out_filename_pos
print('Writing file: mutational positions' print('\n----------------------------------'
, 'Writing file: mutational positions'
, '\n----------------------------------'
, '\nFile:', outfile_pos , '\nFile:', outfile_pos
, '\nNo. of distinct positions:', len(pos_only_sorted) , '\nNo. of distinct positions:', len(pos_only_sorted)
, '\n=============================================================') , '\n=============================================================')
@ -1293,15 +1325,39 @@ print('Finished writing:', outfile_pos
del(out_filename_pos) del(out_filename_pos)
#%% quick summary output #%% quick summary output
#print('============================================'
# , '\nQuick summary output for', drug, 'and' , gene.lower()
# , '\n============================================'
# , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
# , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
# , '\nTotal no. of samples with missense muts:', len(gene_LF1)
# , '\n============================================================='
# , '\n\n\n')
print('============================================' print('============================================'
, '\nQuick summary output for', gene.lower() , '\nQuick summary output for', drug, 'and' , gene.lower()
, '\n============================================' , '\n============================================'
, '\nTotal samples:', total_samples
, '\nNo. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
, '\n'
, '\nPercentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
, '\n'
, '\nTotal no.of unique dr muts:', mut_grouped[dr_muts_col]
, '\nTotal no.of unique other muts:', mut_grouped[other_muts_col]
, '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique() , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
, '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique() , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
, '\nTotal no. of samples with missense muts:', len(gene_LF1) , '\nTotal no. of samples with missense muts:', len(gene_LF1)
, '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()
, '\n'
, '\nTotal no.of samples with common_ids:', nu_common_ids['id']
, '\nTotal no.of samples with ambiguous muts:', len(inspect)
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
, '\n=============================================================' , '\n============================================================='
, '\n\n\n') , '\n\n\n')
#======================================================================= #=======================================================================
print(u'\u2698' * 50, print(u'\u2698' * 50,
'\nEnd of script: Data extraction and writing files' '\nEnd of script: Data extraction and writing files'