updated results summary in the data_extraction.py
This commit is contained in:
parent
e67fbfd986
commit
073381e5a2
1 changed files with 114 additions and 58 deletions
|
@ -21,30 +21,22 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
# where each row is a separate mutation
|
# where each row is a separate mutation
|
||||||
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
|
# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
|
||||||
|
|
||||||
# NOTE
|
|
||||||
#drtype is renamed to 'resistance' in the 35k dataset
|
|
||||||
|
|
||||||
# output files: all lower case
|
# output files: all lower case
|
||||||
# 0) <gene>_common_ids.csv
|
# 1) <gene>_gwas.csv
|
||||||
# 1) <gene>_ambiguous_muts.csv
|
# 2) <gene>_common_ids.csv
|
||||||
# 2) <gene>_mcsm_snps.csv
|
# 3) <gene>_ambiguous_muts.csv
|
||||||
# 3) <gene>_metadata.csv
|
# 4) <gene>_mcsm_snps.csv
|
||||||
# 4) <gene>_all_muts_msa.csv
|
# 5) <gene>_metadata_poscounts.csv
|
||||||
# 5) <gene>_mutational_positons.csv
|
# 6) <gene>_metadata.csv
|
||||||
|
# 7) <gene>_all_muts_msa.csv
|
||||||
|
# 8) <gene>_mutational_positons.csv
|
||||||
|
|
||||||
# FIXME
|
#------------
|
||||||
## Make all cols lowercase
|
# NOTE
|
||||||
## change WildPos: wild_pos
|
#-----------
|
||||||
## Add an extra col: wild_chain_pos
|
# drtype is renamed to 'resistance' in the 35k dataset
|
||||||
## output df: <gene>_linking_df.csv
|
# all colnames in the ouput files lowercase
|
||||||
#containing the following cols
|
|
||||||
#1. Mutationinformation
|
|
||||||
#2. wild_type
|
|
||||||
#3. position
|
|
||||||
#4. mutant_type
|
|
||||||
#5. chain
|
|
||||||
#6. wild_pos
|
|
||||||
#7. wild_chain_pos
|
|
||||||
|
|
||||||
#-------------
|
#-------------
|
||||||
# requires
|
# requires
|
||||||
|
@ -66,9 +58,6 @@ os.getcwd()
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||||
os.getcwd()
|
os.getcwd()
|
||||||
|
|
||||||
# Requires
|
|
||||||
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
|
||||||
from tidy_split import tidy_split
|
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% command line args
|
#%% command line args
|
||||||
arg_parser = argparse.ArgumentParser()
|
arg_parser = argparse.ArgumentParser()
|
||||||
|
@ -77,6 +66,7 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
|
||||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||||
|
arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true')
|
||||||
|
|
||||||
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode')
|
||||||
|
|
||||||
|
@ -89,9 +79,10 @@ gene = args.gene
|
||||||
datadir = args.datadir
|
datadir = args.datadir
|
||||||
indir = args.input_dir
|
indir = args.input_dir
|
||||||
outdir = args.output_dir
|
outdir = args.output_dir
|
||||||
|
make_dirs = args.make_dirs
|
||||||
|
|
||||||
#drug = 'pyrazinamide'
|
#drug = 'ethambutol'
|
||||||
#gene = 'pncA'
|
#gene = 'embB'
|
||||||
|
|
||||||
#%% input and output dirs and files
|
#%% input and output dirs and files
|
||||||
#=======
|
#=======
|
||||||
|
@ -106,19 +97,46 @@ if not indir:
|
||||||
if not outdir:
|
if not outdir:
|
||||||
outdir = datadir + '/' + drug + '/output'
|
outdir = datadir + '/' + drug + '/output'
|
||||||
|
|
||||||
|
if make_dirs:
|
||||||
|
print('make_dirs is turned on, creating data dir:', datadir)
|
||||||
|
try:
|
||||||
|
os.makedirs(datadir, exist_ok = True)
|
||||||
|
print("Directory '%s' created successfully" %datadir)
|
||||||
|
except OSError as error:
|
||||||
|
print("Directory '%s' can not be created")
|
||||||
|
|
||||||
|
print('make_dirs is turned on, creating indir:', indir)
|
||||||
|
try:
|
||||||
|
os.makedirs(indir, exist_ok = True)
|
||||||
|
print("Directory '%s' created successfully" %indir)
|
||||||
|
except OSError as error:
|
||||||
|
print("Directory '%s' can not be created")
|
||||||
|
|
||||||
|
print('make_dirs is turned on, creating outdir:', outdir)
|
||||||
|
try:
|
||||||
|
os.makedirs(outdir, exist_ok = True)
|
||||||
|
print("Directory '%s' created successfully" %outdir)
|
||||||
|
except OSError as error:
|
||||||
|
print("Directory '%s' can not be created")
|
||||||
|
|
||||||
# handle missing dirs here
|
# handle missing dirs here
|
||||||
if not os.path.isdir(datadir):
|
if not os.path.isdir(datadir):
|
||||||
print('ERROR: Data directory does not exist:', datadir
|
print('ERROR: Data directory does not exist:', datadir
|
||||||
, '\nPlease create and ensure gwas data is present and then rerun')
|
, '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
if not os.path.isdir(indir):
|
if not os.path.isdir(indir):
|
||||||
print('ERROR: Input directory does not exist:', indir
|
print('ERROR: Input directory does not exist:', indir
|
||||||
, '\nPlease either create or specify indir and rerun')
|
, '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
if not os.path.isdir(outdir):
|
if not os.path.isdir(outdir):
|
||||||
print('ERROR: Output directory does not exist:', outdir
|
print('ERROR: Output directory does not exist:', outdir
|
||||||
, '\nPlease create or specify outdir and rerun')
|
, '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
|
# Requires
|
||||||
|
from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
|
||||||
|
from tidy_split import tidy_split
|
||||||
|
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
gene_match = gene + '_p.'
|
gene_match = gene + '_p.'
|
||||||
print('mut pattern for gene', gene, ':', gene_match)
|
print('mut pattern for gene', gene, ':', gene_match)
|
||||||
|
@ -230,21 +248,17 @@ print('RESULT: Total samples:', total_samples
|
||||||
# counts NA per column
|
# counts NA per column
|
||||||
meta_data.isna().sum()
|
meta_data.isna().sum()
|
||||||
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
||||||
, '\n===========================================================')
|
, '\n===========================================================\n')
|
||||||
|
|
||||||
#%% Write check file
|
#%% Write check file
|
||||||
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
|
check_file = outdir + '/' + gene.lower() + '_gwas.csv'
|
||||||
meta_data.to_csv(check_file, index = False)
|
meta_data.to_csv(check_file, index = False)
|
||||||
print('Writing subsetted gwas data'
|
print('\n----------------------------------'
|
||||||
|
, '\nWriting subsetted gwas data:', gene
|
||||||
|
, '\n----------------------------------'
|
||||||
, '\nFile', check_file
|
, '\nFile', check_file
|
||||||
, '\nDim:', meta_data.shape)
|
, '\nDim:', meta_data.shape)
|
||||||
|
|
||||||
# glance
|
|
||||||
#meta_data.head()
|
|
||||||
#total_samples - NA pyrazinamide = ?
|
|
||||||
# 19K: 19265-6754 = 12511
|
|
||||||
# 33K: 33681 - 23823 = 9858
|
|
||||||
|
|
||||||
# equivalent of table in R
|
# equivalent of table in R
|
||||||
# drug counts: complete samples for OR calcs
|
# drug counts: complete samples for OR calcs
|
||||||
meta_data[drug].value_counts()
|
meta_data[drug].value_counts()
|
||||||
|
@ -309,7 +323,7 @@ print('=================================================================')
|
||||||
del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
|
del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
|
||||||
|
|
||||||
#========
|
#========
|
||||||
# Second: counting <gene_match> mutations in dr_muts_col column
|
# Second: counting <gene_match> mutations in other_muts_col column
|
||||||
#========
|
#========
|
||||||
print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)
|
print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col)
|
||||||
# drop na and extract a clean df
|
# drop na and extract a clean df
|
||||||
|
@ -398,7 +412,7 @@ meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_mat
|
||||||
print('gene_snp_match in dr:', len(meta_gene_dr))
|
print('gene_snp_match in dr:', len(meta_gene_dr))
|
||||||
|
|
||||||
dr_id = meta_gene_dr['id'].unique()
|
dr_id = meta_gene_dr['id'].unique()
|
||||||
print('RESULT: No. of samples with dr muts in pncA:', len(dr_id))
|
print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id))
|
||||||
|
|
||||||
if len(id_dr) == len(meta_gene_dr):
|
if len(id_dr) == len(meta_gene_dr):
|
||||||
print('PASS: lengths match'
|
print('PASS: lengths match'
|
||||||
|
@ -448,7 +462,7 @@ meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contai
|
||||||
print('gene_snp_match in other:', len(meta_gene_other))
|
print('gene_snp_match in other:', len(meta_gene_other))
|
||||||
|
|
||||||
other_id = meta_gene_other['id'].unique()
|
other_id = meta_gene_other['id'].unique()
|
||||||
print('RESULT: No. of samples with other muts:', len(other_id))
|
print('RESULT: No. of samples with other muts in', gene, ':', len(other_id))
|
||||||
|
|
||||||
if len(id_other) == len(meta_gene_other):
|
if len(id_other) == len(meta_gene_other):
|
||||||
print('PASS: lengths match'
|
print('PASS: lengths match'
|
||||||
|
@ -485,6 +499,7 @@ common_ids2.columns = ['index', 'id2']
|
||||||
# should be True
|
# should be True
|
||||||
if common_ids['id'].equals(common_ids2['id2']):
|
if common_ids['id'].equals(common_ids2['id2']):
|
||||||
print('PASS: Further cross checks on common ids')
|
print('PASS: Further cross checks on common ids')
|
||||||
|
nu_common_ids = common_ids.nunique()
|
||||||
else:
|
else:
|
||||||
sys.exit('FAIL: Further cross checks on common ids')
|
sys.exit('FAIL: Further cross checks on common ids')
|
||||||
|
|
||||||
|
@ -496,8 +511,9 @@ print('Expected no. of gene samples:', expected_gene_samples
|
||||||
#print(outdir)
|
#print(outdir)
|
||||||
out_filename_cid = gene.lower() + '_common_ids.csv'
|
out_filename_cid = gene.lower() + '_common_ids.csv'
|
||||||
outfile_cid = outdir + '/' + out_filename_cid
|
outfile_cid = outdir + '/' + out_filename_cid
|
||||||
|
print('\n----------------------------------'
|
||||||
print('Writing file:'
|
'\nWriting file: common_ids'
|
||||||
|
'\n----------------------------------'
|
||||||
, '\nFile:', outfile_cid
|
, '\nFile:', outfile_cid
|
||||||
, '\nNo. of rows:', len(common_ids)
|
, '\nNo. of rows:', len(common_ids)
|
||||||
, '\n=============================================================')
|
, '\n=============================================================')
|
||||||
|
@ -517,7 +533,7 @@ print('Extracting nsSNP match:', gene, 'mutations from cols:\n'
|
||||||
meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
|
meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ]
|
||||||
|
|
||||||
extracted_gene_samples = meta_gene_all['id'].nunique()
|
extracted_gene_samples = meta_gene_all['id'].nunique()
|
||||||
print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples
|
print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
|
|
||||||
# sanity check: length of gene samples
|
# sanity check: length of gene samples
|
||||||
|
@ -774,7 +790,6 @@ print('Length of dfs pre and post processing...'
|
||||||
, '\ngene LF data (unique mutation in each row):', len(gene_LF1)
|
, '\ngene LF data (unique mutation in each row):', len(gene_LF1)
|
||||||
, '\n=============================================================')
|
, '\n=============================================================')
|
||||||
|
|
||||||
|
|
||||||
#%% sanity check for extraction
|
#%% sanity check for extraction
|
||||||
# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
|
# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows'
|
||||||
print('Verifying whether extraction process worked correctly...')
|
print('Verifying whether extraction process worked correctly...')
|
||||||
|
@ -865,7 +880,11 @@ else:
|
||||||
, '\n=========================================================')
|
, '\n=========================================================')
|
||||||
|
|
||||||
#%% clear variables
|
#%% clear variables
|
||||||
del(id_dr, id_other, meta_data, meta_gene_dr, meta_gene_other, mut_grouped, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)
|
del(id_dr, id_other
|
||||||
|
#, meta_data
|
||||||
|
, meta_gene_dr, meta_gene_other
|
||||||
|
#, mut_grouped
|
||||||
|
, muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na)
|
||||||
|
|
||||||
del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)
|
del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1)
|
||||||
|
|
||||||
|
@ -877,7 +896,9 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
|
||||||
|
|
||||||
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
||||||
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
||||||
print('Writing file: ambiguous muts'
|
print('\n----------------------------------'
|
||||||
|
, '\nWriting file: ambiguous muts'
|
||||||
|
, '\n----------------------------------'
|
||||||
, '\nFilename:', outfile_ambig_muts)
|
, '\nFilename:', outfile_ambig_muts)
|
||||||
|
|
||||||
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
|
||||||
|
@ -1163,7 +1184,9 @@ else:
|
||||||
out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv'
|
out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv'
|
||||||
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
|
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
|
||||||
|
|
||||||
print('Writing file: mCSM style muts'
|
print('\n----------------------------------'
|
||||||
|
, '\nWriting file: mCSM style muts'
|
||||||
|
, '\n----------------------------------'
|
||||||
, '\nFile:', outfile_mcsmsnps
|
, '\nFile:', outfile_mcsmsnps
|
||||||
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
||||||
, '\nNo. of distinct muts:', len(snps_only)
|
, '\nNo. of distinct muts:', len(snps_only)
|
||||||
|
@ -1191,7 +1214,9 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T
|
||||||
# where each row has UNIQUE mutations NOT unique sample ids
|
# where each row has UNIQUE mutations NOT unique sample ids
|
||||||
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
|
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
|
||||||
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
|
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
|
||||||
print('Writing file: Metadata poscounts'
|
print('\n----------------------------------'
|
||||||
|
, 'Writing file: Metadata poscounts'
|
||||||
|
, '\n----------------------------------'
|
||||||
, '\nFile:', outfile_metadata_poscounts
|
, '\nFile:', outfile_metadata_poscounts
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
|
||||||
|
@ -1205,9 +1230,12 @@ del(out_filename_metadata_poscounts)
|
||||||
|
|
||||||
#%% Write file: gene_metadata (i.e gene_LF1)
|
#%% Write file: gene_metadata (i.e gene_LF1)
|
||||||
# where each row has UNIQUE mutations NOT unique sample ids
|
# where each row has UNIQUE mutations NOT unique sample ids
|
||||||
out_filename_metadata = gene.lower() + '_metadata_raw.csv'
|
#out_filename_metadata = gene.lower() + '_metadata_raw.csv' #! rationale?
|
||||||
|
out_filename_metadata = gene.lower() + '_metadata.csv'
|
||||||
outfile_metadata = outdir + '/' + out_filename_metadata
|
outfile_metadata = outdir + '/' + out_filename_metadata
|
||||||
print('Writing file: LF formatted data'
|
print('\n----------------------------------'
|
||||||
|
, '\nWriting file: LF formatted data'
|
||||||
|
, '\n----------------------------------'
|
||||||
, '\nFile:', outfile_metadata
|
, '\nFile:', outfile_metadata
|
||||||
, '\n============================================================')
|
, '\n============================================================')
|
||||||
|
|
||||||
|
@ -1248,10 +1276,12 @@ else:
|
||||||
out_filename_msa = gene.lower() +'_all_muts_msa.csv'
|
out_filename_msa = gene.lower() +'_all_muts_msa.csv'
|
||||||
outfile_msa = outdir + '/' + out_filename_msa
|
outfile_msa = outdir + '/' + out_filename_msa
|
||||||
|
|
||||||
print('Writing file: mCSM style muts for msa',
|
print('\n----------------------------------'
|
||||||
'\nFile:', outfile_msa,
|
, '\nWriting file: mCSM style muts for msa'
|
||||||
'\nmutation format (SNP): {WT}<POS>{MUT}',
|
, '\n----------------------------------'
|
||||||
'\nNo.of lines of msa:', len(all_muts_msa))
|
, '\nFile:', outfile_msa
|
||||||
|
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
||||||
|
, '\nNo.of lines of msa:', len(all_muts_msa))
|
||||||
|
|
||||||
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
|
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
|
||||||
|
|
||||||
|
@ -1278,7 +1308,9 @@ pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
|
||||||
out_filename_pos = gene.lower() + '_mutational_positons.csv'
|
out_filename_pos = gene.lower() + '_mutational_positons.csv'
|
||||||
outfile_pos = outdir + '/' + out_filename_pos
|
outfile_pos = outdir + '/' + out_filename_pos
|
||||||
|
|
||||||
print('Writing file: mutational positions'
|
print('\n----------------------------------'
|
||||||
|
, 'Writing file: mutational positions'
|
||||||
|
, '\n----------------------------------'
|
||||||
, '\nFile:', outfile_pos
|
, '\nFile:', outfile_pos
|
||||||
, '\nNo. of distinct positions:', len(pos_only_sorted)
|
, '\nNo. of distinct positions:', len(pos_only_sorted)
|
||||||
, '\n=============================================================')
|
, '\n=============================================================')
|
||||||
|
@ -1293,15 +1325,39 @@ print('Finished writing:', outfile_pos
|
||||||
|
|
||||||
del(out_filename_pos)
|
del(out_filename_pos)
|
||||||
#%% quick summary output
|
#%% quick summary output
|
||||||
|
#print('============================================'
|
||||||
|
# , '\nQuick summary output for', drug, 'and' , gene.lower()
|
||||||
|
# , '\n============================================'
|
||||||
|
# , '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
|
||||||
|
# , '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
|
||||||
|
# , '\nTotal no. of samples with missense muts:', len(gene_LF1)
|
||||||
|
# , '\n============================================================='
|
||||||
|
# , '\n\n\n')
|
||||||
|
|
||||||
print('============================================'
|
print('============================================'
|
||||||
, '\nQuick summary output for', gene.lower()
|
, '\nQuick summary output for', drug, 'and' , gene.lower()
|
||||||
, '\n============================================'
|
, '\n============================================'
|
||||||
|
, '\nTotal samples:', total_samples
|
||||||
|
, '\nNo. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts()
|
||||||
|
, '\n'
|
||||||
|
, '\nPercentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
|
||||||
|
, '\n'
|
||||||
|
, '\nTotal no.of unique dr muts:', mut_grouped[dr_muts_col]
|
||||||
|
, '\nTotal no.of unique other muts:', mut_grouped[other_muts_col]
|
||||||
, '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
|
, '\nTotal no.of unique missense muts:', gene_LF1['mutationinformation'].nunique()
|
||||||
, '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
|
, '\nTotal no.of unique positions associated with missense muts:',gene_LF1['position'].nunique()
|
||||||
, '\nTotal no. of samples with missense muts:', len(gene_LF1)
|
, '\nTotal no. of samples with missense muts:', len(gene_LF1)
|
||||||
|
, '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()
|
||||||
|
, '\n'
|
||||||
|
, '\nTotal no.of samples with common_ids:', nu_common_ids['id']
|
||||||
|
, '\nTotal no.of samples with ambiguous muts:', len(inspect)
|
||||||
|
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
|
||||||
|
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
|
||||||
, '\n============================================================='
|
, '\n============================================================='
|
||||||
, '\n\n\n')
|
, '\n\n\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
print(u'\u2698' * 50,
|
print(u'\u2698' * 50,
|
||||||
'\nEnd of script: Data extraction and writing files'
|
'\nEnd of script: Data extraction and writing files'
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue