updated pnca_extraction and AF_OR calcs
This commit is contained in:
parent
eb021349fe
commit
b331227023
4 changed files with 195 additions and 699 deletions
|
@ -36,9 +36,10 @@ import numpy as np
|
|||
# 1) pnca_ambiguous_muts.csv
|
||||
# 2) pnca_mcsm_snps.csv
|
||||
# 3) pnca_metadata.csv
|
||||
# 4) pnca_comp_snps.csv
|
||||
# 5) pnca_all_muts_msa.csv
|
||||
# 6) pnca_mutational_positons.csv
|
||||
# 4) pnca_comp_snps.csv <---deleted>
|
||||
|
||||
# 4) pnca_all_muts_msa.csv
|
||||
# 5) pnca_mutational_positons.csv
|
||||
#========================================================
|
||||
#%% specify homedir as python doesn't recognise tilde
|
||||
homedir = os.path.expanduser('~')
|
||||
|
@ -52,23 +53,25 @@ os.getcwd()
|
|||
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
||||
#========================================================
|
||||
|
||||
#drug = 'pyrazinamide'
|
||||
#%% variable assignment: input and output paths & filenames
|
||||
|
||||
drug = 'pyrazinamide'
|
||||
gene = 'pncA'
|
||||
gene_match = gene + '_p.'
|
||||
|
||||
#%% specify variables for input and output paths and filenames
|
||||
|
||||
#=======
|
||||
# input dir
|
||||
#=======
|
||||
indir = 'git/Data/pyrazinamide/input/original'
|
||||
|
||||
#indir = 'git/Data/pyrazinamide/input/original'
|
||||
indir = 'git/Data' + '/' + drug + '/' + 'input/original'
|
||||
#=========
|
||||
# output dir
|
||||
#=========
|
||||
# several output files
|
||||
# output filenames are defined in their respective sections, at the point where each file is written
|
||||
outdir = 'git/Data/pyrazinamide/output'
|
||||
#outdir = 'git/Data/pyrazinamide/output'
|
||||
outdir = 'git/Data' + '/' + drug + '/' + 'output'
|
||||
|
||||
#%%end of variable assignment for input and output files
|
||||
#==============================================================================
|
||||
#%% Read files
|
||||
|
@ -77,7 +80,7 @@ in_filename = 'original_tanushree_data_v2.csv'
|
|||
infile = homedir + '/' + indir + '/' + in_filename
|
||||
print('Reading input master file:', infile)
|
||||
|
||||
master_data = pd.read_csv(infile, sep = ',')
|
||||
master_data = pd.read_csv(infile, sep = ',')
|
||||
|
||||
# column names
|
||||
#list(master_data.columns)
|
||||
|
@ -334,6 +337,8 @@ print('Writing file: common ids:\n',
|
|||
|
||||
common_ids.to_csv(outfile0)
|
||||
print('======================================================================')
|
||||
del(out_filename0)
|
||||
|
||||
|
||||
# clear variables
|
||||
del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
|
||||
|
@ -701,21 +706,6 @@ del(c1, c2, col_to_split1, col_to_split2, comp_pnca_samples, dr_WF0, dr_df, dr_m
|
|||
|
||||
#%% end of data extraction and of writing some files. The remaining files are written below.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#%%: write file: ambiguous muts
|
||||
# uncomment as necessary
|
||||
#print(outdir)
|
||||
|
@ -735,6 +725,8 @@ inspect.to_csv(outfile1)
|
|||
print('Finished writing:', out_filename1, '\nExpected no. of rows (no. of samples with the ambiguous muts present):', dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum())
|
||||
print('======================================================================')
|
||||
del(out_filename1)
|
||||
|
||||
|
||||
#%%
|
||||
#===========
|
||||
# Split 'mutation' column into three: wild_type, position and
|
||||
|
@ -891,6 +883,8 @@ print('Finished writing:', out_filename2,
|
|||
'\nNo. of rows:', len(snps_only) )
|
||||
print('======================================================================')
|
||||
del(out_filename2)
|
||||
|
||||
|
||||
#%% Write file: pnca_metadata (i.e. pnca_LF1)
|
||||
# where each row has UNIQUE mutations NOT unique sample ids
|
||||
out_filename3 = gene.lower() + '_' + 'metadata.csv'
|
||||
|
@ -903,45 +897,10 @@ pnca_LF1.to_csv(outfile3, header = True, index = False)
|
|||
print('Finished writing:', out_filename3,
|
||||
'\nNo. of rows:', len(pnca_LF1),
|
||||
'\nNo. of cols:', len(pnca_LF1.columns) )
|
||||
|
||||
print('======================================================================')
|
||||
del(out_filename3)
|
||||
|
||||
#%% Write file: comp SNPs (i.e snps without any corresponding 'NA' in the <drug>
|
||||
# column to allow OR calcs)
|
||||
|
||||
# remove NA from pyrazinamide cols
|
||||
pnca_LF2 = pnca_LF1.dropna(subset=['pyrazinamide'])
|
||||
|
||||
print('extracting OR muts by removing NAs from pyrazinamide cols')
|
||||
if pnca_LF2.pyrazinamide.isna().sum() > 0:
|
||||
print('FAIL: NAs NOT removed successfully')
|
||||
else:
|
||||
print('PASS: NAs removed successfully')
|
||||
|
||||
# extracting comp snps only
|
||||
comp_snps_only = pd.DataFrame(pnca_LF2['mutation'].unique())
|
||||
#print('Total no. of comp snps:', len(comp_snps_only))
|
||||
comp_snps_only.head()
|
||||
|
||||
# assign column name
|
||||
comp_snps_only.columns = ['mutation']
|
||||
|
||||
# count how many positions this corresponds to
|
||||
comp_pos_only = pd.DataFrame(pnca_LF2['position'].unique())
|
||||
#print('Total no. of pos corresponding to comp_snps:', len(comp_pos_only))
|
||||
|
||||
out_filename4 = gene.lower() + '_' + 'comp_snps.csv'
|
||||
outfile4 = homedir + '/' + outdir + '/' + out_filename4
|
||||
print('Writing file: comp snps to allow OR calcs',
|
||||
'\nFilename:', out_filename4,
|
||||
'\nPath:', homedir + '/' + outdir,
|
||||
'\nNo. of comp muts:', len(comp_snps_only),
|
||||
'\nNo. of distinct positions for comp muts:', len(comp_pos_only) )
|
||||
|
||||
comp_snps_only.to_csv(outfile4, header = True, index = False)
|
||||
|
||||
print('Finished writing:', out_filename4,
|
||||
'\nNo. of rows:', len(comp_snps_only) )
|
||||
#%% write file: mCSM style but with repetitions for MSA and logo plots
|
||||
all_muts_msa = pd.DataFrame(pnca_LF1['Mutationinformation'])
|
||||
all_muts_msa.head()
|
||||
|
@ -970,21 +929,22 @@ else:
|
|||
'\nDebug please!')
|
||||
print('======================================================================')
|
||||
|
||||
out_filename5 = gene.lower() + '_' + 'all_muts_msa.csv'
|
||||
outfile5 = homedir + '/' + outdir + '/' + out_filename5
|
||||
out_filename4 = gene.lower() + '_' + 'all_muts_msa.csv'
|
||||
outfile4 = homedir + '/' + outdir + '/' + out_filename4
|
||||
|
||||
print('Writing file: mCSM style muts for msa',
|
||||
'\nmutation format (SNP): {Wt}<POS>{Mut}',
|
||||
'\nNo.of lines of msa:', len(all_muts_msa),
|
||||
'\nFilename:', out_filename5,
|
||||
'\nFilename:', out_filename4,
|
||||
'\nPath:', homedir +'/'+ outdir)
|
||||
|
||||
all_muts_msa_sorted.to_csv(outfile5, header = False, index = False)
|
||||
all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)
|
||||
|
||||
print('Finished writing:', out_filename5,
|
||||
print('Finished writing:', out_filename4,
|
||||
'\nNo. of rows:', len(all_muts_msa) )
|
||||
print('======================================================================')
|
||||
del(out_filename5)
|
||||
del(out_filename4)
|
||||
|
||||
|
||||
#%% write file for mutational positions
|
||||
# count how many positions this corresponds to
|
||||
|
@ -999,20 +959,22 @@ pos_only.position.dtype
|
|||
# sort by position value
|
||||
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
|
||||
|
||||
out_filename6 = gene.lower() + '_' + 'mutational_positons.csv'
|
||||
outfile6 = homedir + '/' + outdir + '/' + out_filename6
|
||||
out_filename5 = gene.lower() + '_' + 'mutational_positons.csv'
|
||||
outfile5 = homedir + '/' + outdir + '/' + out_filename5
|
||||
|
||||
print('Writing file: mutational positions',
|
||||
'\nNo. of distinct positions:', len(pos_only_sorted),
|
||||
'\nFilename:', out_filename6,
|
||||
'\nFilename:', out_filename5,
|
||||
'\nPath:', homedir +'/'+ outdir)
|
||||
|
||||
pos_only_sorted.to_csv(outfile6, header = True, index = False)
|
||||
pos_only_sorted.to_csv(outfile5, header = True, index = False)
|
||||
|
||||
print('Finished writing:', out_filename6,
|
||||
print('Finished writing:', out_filename5,
|
||||
'\nNo. of rows:', len(pos_only_sorted) )
|
||||
print('======================================================================')
|
||||
del(out_filename6)
|
||||
del(out_filename5)
|
||||
|
||||
|
||||
#%% end of script
|
||||
print('======================================================================')
|
||||
print(u'\u2698' * 50,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue