updated results summary in the data_extraction.py
This commit is contained in:
parent
f9fd74812a
commit
ccdd6029be
1 changed files with 114 additions and 58 deletions
|
@ -21,30 +21,22 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
# where each row is a separate mutation
|
||||
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
|
||||
|
||||
# NOTE
|
||||
#drtype is renamed to 'resistance' in the 35k dataset
|
||||
|
||||
# output files: all lower case
|
||||
# 0) <gene>_common_ids.csv
|
||||
# 1) <gene>_ambiguous_muts.csv
|
||||
# 2) <gene>_mcsm_snps.csv
|
||||
# 3) <gene>_metadata.csv
|
||||
# 4) <gene>_all_muts_msa.csv
|
||||
# 5) <gene>_mutational_positons.csv
|
||||
# 1) <gene>_gwas.csv
|
||||
# 2) <gene>_common_ids.csv
|
||||
# 3) <gene>_ambiguous_muts.csv
|
||||
# 4) <gene>_mcsm_snps.csv
|
||||
# 5) <gene>_metadata_poscounts.csv
|
||||
# 6) <gene>_metadata.csv
|
||||
# 7) <gene>_all_muts_msa.csv
|
||||
# 8) <gene>_mutational_positons.csv
|
||||
|
||||
# FIXME
|
||||
## Make all cols lowercase
|
||||
## change WildPos: wild_pos
|
||||
## Add an extra col: wild_chain_pos
|
||||
## output df: <gene>_linking_df.csv
|
||||
#containing the following cols
|
||||
#1. Mutationinformation
|
||||
#2. wild_type
|
||||
#3. position
|
||||
#4. mutant_type
|
||||
#5. chain
|
||||
#6. wild_pos
|
||||
#7. wild_chain_pos
|
||||
#------------
|
||||
# NOTE
|
||||
#-----------
|
||||
# drtype is renamed to 'resistance' in the 35k dataset
|
||||
# all colnames in the ouput files lowercase
|
||||
|
||||
#-------------
|
||||
# requires
|
||||
|
@ -66,9 +58,6 @@ os.getcwd()
|
|||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.getcwd()
|
||||
|
||||
# Requires
|
||||
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
||||
from tidy_split import tidy_split
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
@ -77,6 +66,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
|
|||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
|
||||
|
||||
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
||||
|
||||
|
@ -89,9 +79,10 @@ gene = args.gene
|
|||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
make_dirs = args.make_dirs
|
||||
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
#drug = 'ethambutol'
|
||||
#gene = 'embB'
|
||||
|
||||
#%% input and output dirs and files
|
||||
#=======
|
||||
|
@ -106,19 +97,46 @@ if not indir:
|
|||
if not outdir:
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
if make_dirs:
|
||||
print('make_dirs is turned on, creating data dir:', datadir)
|
||||
try:
|
||||
os.makedirs(datadir, exist_ok = True)
|
||||
print("Directory '%s' created successfully" %datadir)
|
||||
except OSError as error:
|
||||
print("Directory '%s' can not be created")
|
||||
|
||||
print('make_dirs is turned on, creating indir:', indir)
|
||||
try:
|
||||
os.makedirs(indir, exist_ok = True)
|
||||
print("Directory '%s' created successfully" %indir)
|
||||
except OSError as error:
|
||||
print("Directory '%s' can not be created")
|
||||
|
||||
print('make_dirs is turned on, creating outdir:', outdir)
|
||||
try:
|
||||
os.makedirs(outdir, exist_ok = True)
|
||||
print("Directory '%s' created successfully" %outdir)
|
||||
except OSError as error:
|
||||
print("Directory '%s' can not be created")
|
||||
|
||||
# handle missing dirs here
|
||||
if not os.path.isdir(datadir):
|
||||
print('ERROR: Data directory does not exist:', datadir
|
||||
, '\nPlease create and ensure gwas data is present and then rerun')
|
||||
, '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
|
||||
sys.exit()
|
||||
if not os.path.isdir(indir):
|
||||
print('ERROR: Input directory does not exist:', indir
|
||||
, '\nPlease either create or specify indir and rerun')
|
||||
, '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
|
||||
sys.exit()
|
||||
if not os.path.isdir(outdir):
|
||||
print('ERROR: Output directory does not exist:', outdir
|
||||
, '\nPlease create or specify outdir and rerun')
|
||||
, '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
|
||||
sys.exit()
|
||||
|
||||
# Requires
|
||||
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
||||
from tidy_split import tidy_split
|
||||
|
||||
#=======================================================================
|
||||
gene_match = gene + '_p.'
|
||||
print('mut pattern for gene', gene, ':', gene_match)
|
||||
|
@ -230,21 +248,17 @@ print('RESULT: Total samples:', total_samples
|
|||
# counts NA per column
|
||||
meta_data.isna().sum()
|
||||
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
||||
, '\n===========================================================')
|
||||
, '\n===========================================================\n')
|
||||
|
||||
#%% Write check file
|
||||
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
|
||||
meta_data.to_csv(check_file, index = False)
|
||||
print('Writing subsetted gwas data'
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting subsetted gwas data:', gene
|
||||
, '\n----------------------------------'
|
||||
, '\nFile', check_file
|
||||
, '\nDim:', meta_data.shape)
|
||||
|
||||
# glance
|
||||
#meta_data.head()
|
||||
#total_samples - NA pyrazinamide = ?
|
||||
# 19K: 19265-6754 = 12511
|
||||
# 33K: 33681 - 23823 = 9858
|
||||
|
||||
# equivalent of table in R
|
||||
# drug counts: complete samples for OR calcs
|
||||
meta_data[drug].value_counts()
|
||||
|
@ -309,7 +323,7 @@ print('=================================================================')
|
|||
del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
|
||||
|
||||
#========
|
||||
# Second: counting <gene_match> mutations in dr_muts_col column
|
||||
# Second: counting <gene_match> mutations in other_muts_col column
|
||||
#========
|
||||
print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)
|
||||
# drop na and extract a clean df
|
||||
|
@ -355,7 +369,7 @@ print('Predicting total no. of rows in the curated df:', dr_gene_count + other_g
|
|||
expected_rows = dr_gene_count + other_gene_count
|
||||
|
||||
#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt)
|
||||
del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other,count_wt )
|
||||
del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
|
||||
|
||||
#%%
|
||||
############
|
||||
|
@ -398,7 +412,7 @@ meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_mat
|
|||
print('gene_snp_match in dr:', len(meta_gene_dr))
|
||||
|
||||
dr_id = meta_gene_dr['id'].unique()
|
||||
print('RESULT: No. of samples with dr muts in pncA:', len(dr_id))
|
||||
print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id))
|
||||
|
||||
if len(id_dr) == len(meta_gene_dr):
|
||||
print('PASS: lengths match'
|
||||
|
@ -448,7 +462,7 @@ meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contai
|
|||
print('gene_snp_match in other:', len(meta_gene_other))
|
||||
|
||||
other_id = meta_gene_other['id'].unique()
|
||||
print('RESULT: No. of samples with other muts:', len(other_id))
|
||||
print('RESULT: No. of samples with other muts in', gene, ':', len(other_id))
|
||||
|
||||
if len(id_other) == len(meta_gene_other):
|
||||
print('PASS: lengths match'
|
||||
|
@ -485,6 +499,7 @@ common_ids2.columns = ['index', 'id2']
|
|||
# should be True
|
||||
if common_ids['id'].equals(common_ids2['id2']):
|
||||
print('PASS: Further cross checks on common ids')
|
||||
nu_common_ids = common_ids.nunique()
|
||||
else:
|
||||
sys.exit('FAIL: Further cross checks on common ids')
|
||||
|
||||
|
@ -496,8 +511,9 @@ print('Expected no. of gene samples:', expected_gene_samples
|
|||
#print(outdir)
|
||||
out_filename_cid = gene.lower() + '_common_ids.csv'
|
||||
outfile_cid = outdir + '/' + out_filename_cid
|
||||
|
||||
print('Writing file:'
|
||||
print('\n----------------------------------'
|
||||
'\nWriting file: common_ids'
|
||||
'\n----------------------------------'
|
||||
, '\nFile:', outfile_cid
|
||||
, '\nNo. of rows:', len(common_ids)
|
||||
, '\n=============================================================')
|
||||
|
@ -517,7 +533,7 @@ print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
|
|||
meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
|
||||
|
||||
extracted_gene_samples = meta_gene_all['id'].nunique()
|
||||
print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples
|
||||
print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples
|
||||
, '\n===================================================================')
|
||||
|
||||
# sanity check: length of gene samples
|
||||
|
@ -774,7 +790,6 @@ print('Length of dfs pre and post processing...'
|
|||
, '\ngene LF data (unique mutation in each row):', len(gene_LF1)
|
||||
, '\n=============================================================')
|
||||
|
||||
|
||||
#%% sanity check for extraction
|
||||
# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
|
||||
print('Verifying whether extraction process worked correctly...')
|
||||
|
@ -865,7 +880,11 @@ else:
|
|||
, '\n=========================================================')
|
||||
|
||||
#%% clear variables
|
||||
del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)
|
||||
del(id_dr, id_other
|
||||
#, meta_data
|
||||
, meta_gene_dr, meta_gene_other
|
||||
#, mut_grouped
|
||||
, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)
|
||||
|
||||
del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)
|
||||
|
||||
|
@ -877,7 +896,9 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
|
|||
|
||||
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
||||
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
||||
print('Writing file: ambiguous muts'
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: ambiguous muts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFilename:', outfile_ambig_muts)
|
||||
|
||||
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
||||
|
@ -1163,7 +1184,9 @@ else:
|
|||
out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv'
|
||||
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
|
||||
|
||||
print('Writing file: mCSM style muts'
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: mCSM style muts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_mcsmsnps
|
||||
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
||||
, '\nNo. of distinct muts:', len(snps_only)
|
||||
|
@ -1191,7 +1214,9 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T
|
|||
# where each row has UNIQUE mutations NOT unique sample ids
|
||||
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
|
||||
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
|
||||
print('Writing file: Metadata poscounts'
|
||||
print('\n----------------------------------'
|
||||
, 'Writing file: Metadata poscounts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_metadata_poscounts
|
||||
, '\n============================================================')
|
||||
|
||||
|
@ -1205,9 +1230,12 @@ del(out_filename_metadata_poscounts)
|
|||
|
||||
#%% Write file: gene_metadata (i.e gene_LF1)
|
||||
# where each row has UNIQUE mutations NOT unique sample ids
|
||||
out_filename_metadata = gene.lower() + '_metadata_raw.csv'
|
||||
#out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?
|
||||
out_filename_metadata = gene.lower() + '_metadata.csv'
|
||||
outfile_metadata = outdir + '/' + out_filename_metadata
|
||||
print('Writing file: LF formatted data'
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: LF formatted data'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_metadata
|
||||
, '\n============================================================')
|
||||
|
||||
|
@ -1248,10 +1276,12 @@ else:
|
|||
out_filename_msa = gene.lower() +'_all_muts_msa.csv'
|
||||
outfile_msa = outdir + '/' + out_filename_msa
|
||||
|
||||
print('Writing file: mCSM style muts for msa',
|
||||
'\nFile:', outfile_msa,
|
||||
'\nmutation format (SNP): {WT}<POS>{MUT}',
|
||||
'\nNo.of lines of msa:', len(all_muts_msa))
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: mCSM style muts for msa'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_msa
|
||||
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
||||
, '\nNo.of lines of msa:', len(all_muts_msa))
|
||||
|
||||
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
|
||||
|
||||
|
@ -1278,7 +1308,9 @@ pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
|
|||
out_filename_pos = gene.lower() + '_mutational_positons.csv'
|
||||
outfile_pos = outdir + '/' + out_filename_pos
|
||||
|
||||
print('Writing file: mutational positions'
|
||||
print('\n----------------------------------'
|
||||
, 'Writing file: mutational positions'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_pos
|
||||
, '\nNo. of distinct positions:', len(pos_only_sorted)
|
||||
, '\n=============================================================')
|
||||
|
@ -1293,15 +1325,39 @@ print('Finished writing:', outfile_pos
|
|||
|
||||
del(out_filename_pos)
|
||||
#%% quick summary output
|
||||
#print('============================================'
|
||||
# , '\nQuick summary output for', drug, 'and' , gene.lower()
|
||||
# , '\n============================================'
|
||||
# , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
|
||||
# , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
|
||||
# , '\nTotal no. of samples with missense muts:', len(gene_LF1)
|
||||
# , '\n============================================================='
|
||||
# , '\n\n\n')
|
||||
|
||||
print('============================================'
|
||||
, '\nQuick summary output for', gene.lower()
|
||||
, '\nQuick summary output for', drug, 'and' , gene.lower()
|
||||
, '\n============================================'
|
||||
, '\nTotal samples:', total_samples
|
||||
, '\nNo. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
|
||||
, '\n'
|
||||
, '\nPercentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
|
||||
, '\n'
|
||||
, '\nTotal no.of unique dr muts:', mut_grouped[dr_muts_col]
|
||||
, '\nTotal no.of unique other muts:', mut_grouped[other_muts_col]
|
||||
, '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
|
||||
, '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
|
||||
, '\nTotal no. of samples with missense muts:', len(gene_LF1)
|
||||
, '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()
|
||||
, '\n'
|
||||
, '\nTotal no.of samples with common_ids:', nu_common_ids['id']
|
||||
, '\nTotal no.of samples with ambiguous muts:', len(inspect)
|
||||
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
|
||||
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
|
||||
, '\n============================================================='
|
||||
, '\n\n\n')
|
||||
|
||||
|
||||
|
||||
#=======================================================================
|
||||
print(u'\u2698' * 50,
|
||||
'\nEnd of script: Data extraction and writing files'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue