done but getting an error when running from cmd
This commit is contained in:
parent
29e9d10e39
commit
ac0d14e116
1 changed files with 521 additions and 195 deletions
716
scripts/data_extraction.py
Normal file → Executable file
716
scripts/data_extraction.py
Normal file → Executable file
|
@ -314,7 +314,7 @@ print('===========================================================\n'
|
|||
#meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
|
||||
# so downstream code doesn't change
|
||||
meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
|
||||
#%% DF: with dr_muts_col
|
||||
#%% DF: with dr_muts_col, meta_gene_dr
|
||||
meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
|
||||
meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
|
||||
|
||||
|
@ -327,7 +327,7 @@ else:
|
|||
, '\nshape of df2:', meta_gene_dr1.shape
|
||||
, '\nCheck again!')
|
||||
##############################################################################
|
||||
#%% DF: with other_muts_col
|
||||
#%% DF: with other_muts_col, other_gene_dr
|
||||
meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
|
||||
meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
|
||||
|
||||
|
@ -365,11 +365,11 @@ print('===========================================================\n'
|
|||
, '\n===========================================================')
|
||||
###############################################################################
|
||||
#%% Create a copy of indices for downstream mergeing
|
||||
meta_gene_all['index_orig'] = meta_gene_all.index
|
||||
meta_gene_all['index_orig_copy'] = meta_gene_all.index
|
||||
#meta_gene_all['index_orig'] = meta_gene_all.index
|
||||
#meta_gene_all['index_orig_copy'] = meta_gene_all.index
|
||||
|
||||
all(meta_gene_all.index.values == meta_gene_all['index_orig'].values)
|
||||
all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values)
|
||||
#all(meta_gene_all.index.values == meta_gene_all['index_orig'].values)
|
||||
#all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values)
|
||||
##############################################################################
|
||||
#%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc.
|
||||
# Split based on semi colon dr_muts_col
|
||||
|
@ -456,13 +456,13 @@ del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
|
|||
# DF1: dr_muts_col
|
||||
# and remove leading white spaces
|
||||
#=========
|
||||
col_to_split1 = dr_muts_col
|
||||
#col_to_split1 = dr_muts_col
|
||||
print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape
|
||||
, '\ncolumn name to apply tidy_split():' , col_to_split1
|
||||
, '\ncolumn name to apply tidy_split():' , dr_muts_col
|
||||
, '\n============================================================')
|
||||
|
||||
# apply tidy_split()
|
||||
dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';')
|
||||
dr_WF0 = tidy_split(meta_gene_dr, dr_muts_col, sep = ';')
|
||||
# remove leading white space else these are counted as distinct mutations as well
|
||||
dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip()
|
||||
|
||||
|
@ -579,13 +579,13 @@ del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
|
|||
# DF2: other_muts_col
|
||||
# and remove leading white spaces
|
||||
#=========
|
||||
col_to_split2 = other_muts_col
|
||||
#col_to_split2 = other_muts_col
|
||||
print ('applying second tidy split() separately on other muts df', meta_gene_other.shape
|
||||
, '\ncolumn name to apply tidy_split():', col_to_split2
|
||||
, '\ncolumn name to apply tidy_split():', other_muts_col
|
||||
, '\n============================================================')
|
||||
|
||||
# apply tidy_split()
|
||||
other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';')
|
||||
other_WF1 = tidy_split(meta_gene_other, other_muts_col, sep = ';')
|
||||
# remove the leading white spaces in the column
|
||||
other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip()
|
||||
|
||||
|
@ -644,7 +644,7 @@ print('\n==================================================================='
|
|||
, '\nPredicting total no. of rows in the curated df:', expected_rows
|
||||
, '\n===================================================================')
|
||||
|
||||
#%%another way: Add value checks for dict so you can know if its correct for LF data count below
|
||||
#%% another way: Add value checks for dict so you can know if its correct for LF data count below
|
||||
dr_snps_vc_dict = pd.Series(dr_gene_mutsL).value_counts().to_dict()
|
||||
other_snps_vc_dict = pd.Series(other_gene_mutsL).value_counts().to_dict()
|
||||
|
||||
|
@ -683,7 +683,7 @@ SnpFDict = lower_dict(SnpFDict_orig)
|
|||
###############################################################################
|
||||
# USE Vcounts to get expected curated df
|
||||
# resolve dm om muts funda
|
||||
#%%#%% Concatenating two dfs: gene_LF0
|
||||
#%% Concatenating two dfs: gene_LF0
|
||||
# equivalent of rbind in R
|
||||
#==========
|
||||
# Concatentating the two dfs: equivalent of rbind in R
|
||||
|
@ -922,13 +922,20 @@ ambig_muts_rev_df = pd.DataFrame()
|
|||
changes_val = []
|
||||
changes_dict = {}
|
||||
|
||||
for i in common_muts:
|
||||
#print(i)
|
||||
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
|
||||
##BROKENNNN!!!!
|
||||
|
||||
common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
|
||||
common_muts_lower
|
||||
##BROKENNNN!!!!
|
||||
#for i in common_muts:
|
||||
for i in common_muts_lower:
|
||||
|
||||
print(i)
|
||||
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
|
||||
temp_df
|
||||
# DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
|
||||
max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
|
||||
|
||||
max_info_table
|
||||
revised_label = max_info_table[[0]].index[0][1] # max value_count
|
||||
old_label = max_info_table[[1]].index[0][1] # min value_count
|
||||
print('\nAmbiguous mutation labels...'
|
||||
|
@ -948,12 +955,12 @@ ambig_muts_rev_df['mutation_info_orig'].value_counts()
|
|||
changes_val
|
||||
changes_total = sum(changes_val)
|
||||
changes_dict
|
||||
#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts
|
||||
#===================
|
||||
#%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
|
||||
#==================
|
||||
# ambiguous muts
|
||||
#=======================
|
||||
#dr_muts.to_csv('dr_muts.csv', header = True)
|
||||
#other_muts.to_csv('other_muts.csv', header = True)
|
||||
#==================
|
||||
#dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
|
||||
#other_muts.XXXX_csvXXX('other_muts.csv', header = True)
|
||||
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
||||
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
|
||||
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
|
||||
|
@ -972,19 +979,20 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
|
|||
, '\n=============================================================')
|
||||
del(out_filename_ambig_muts)
|
||||
|
||||
#%% FIXME: ambig mut counts
|
||||
#=======================
|
||||
#%% OUTFILE 2, write file: ambiguous mut counts
|
||||
#======================
|
||||
# ambiguous mut counts
|
||||
#=======================
|
||||
#======================
|
||||
out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
|
||||
outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: ambiguous muts'
|
||||
, '\nWriting file: ambiguous mut counts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFilename:', outfile_ambig_mut_counts)
|
||||
|
||||
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
|
||||
#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks
|
||||
#%% FIXME: Add sanity check to make sure you can add value_count checks
|
||||
print('\nREACHED here...................>>>>>>>')
|
||||
#%% Resolving ambiguous muts
|
||||
# Merging ambiguous muts
|
||||
#=================
|
||||
|
@ -995,6 +1003,12 @@ ambig_muts_rev_df.index
|
|||
gene_LF1.index
|
||||
all(ambig_muts_rev_df.index.isin(gene_LF1.index))
|
||||
|
||||
any(gene_LF1.index.isin(ambig_muts_rev_df.index))
|
||||
if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
|
||||
print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
|
||||
else:
|
||||
sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
|
||||
|
||||
#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
|
||||
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
|
||||
|
||||
|
@ -1024,9 +1038,9 @@ gene_LF1['mutation_info'].value_counts()
|
|||
#%% PHEW! Good to go for downstream stuff
|
||||
#%% Add column: Mutationinformation
|
||||
# splitting mutation column to get mCSM style muts
|
||||
#=======================
|
||||
#=====================================================
|
||||
# Formatting df: read aa dict and pull relevant info
|
||||
#=======================
|
||||
#=====================================================
|
||||
print('Now some more formatting:'
|
||||
, '\nRead aa dict and pull relevant info'
|
||||
, '\nFormat mutations:'
|
||||
|
@ -1040,12 +1054,13 @@ print('Now some more formatting:'
|
|||
ncol_mutf_add = 3 # mut split into 3 cols
|
||||
ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
|
||||
|
||||
#===========
|
||||
#======================================================================
|
||||
# Split 'mutation' column into three: wild_type, position and
|
||||
# mutant_type separately. Then map three letter code to one using
|
||||
# reference_dict imported at the beginning.
|
||||
# After importing, convert to mutation to lowercase for compatibility with dict
|
||||
#===========
|
||||
# After importing, convert to mutation to lowercase for
|
||||
# compatibility with dict
|
||||
#=======================================================================
|
||||
gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
|
||||
|
||||
print('wt regex being used:', wt_regex
|
||||
|
@ -1054,14 +1069,15 @@ print('wt regex being used:', wt_regex
|
|||
|
||||
mylen0 = len(gene_LF1.columns)
|
||||
|
||||
#=======
|
||||
#=========================================================
|
||||
# Iterate through the dict, create a lookup dict i.e
|
||||
# lookup_dict = {three_letter_code: one_letter_code}.
|
||||
# lookup dict should be the key and the value (you want to create a column for)
|
||||
# Then use this to perform the mapping separetly for wild type and mutant cols.
|
||||
# The three letter code is extracted using a string match match from the dataframe and then converted
|
||||
# The three letter code is extracted using a string match match from the
|
||||
# dataframe and then converted
|
||||
# to 'pandas series'since map only works in pandas series
|
||||
#=======
|
||||
#===========================================================
|
||||
print('Adding', ncol_mutf_add, 'more cols:\n')
|
||||
|
||||
# initialise a sub dict that is lookup dict for three letter code to 1-letter code
|
||||
|
@ -1076,7 +1092,11 @@ for k, v in my_aa_dict.items():
|
|||
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
|
||||
gene_LF1['mutant_type'] = mut.map(lookup_dict)
|
||||
|
||||
# extract position info from mutation column separetly using string match
|
||||
#-------------------
|
||||
# extract position
|
||||
# info from mutation
|
||||
# column separetly using string match
|
||||
#-------------------
|
||||
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
|
||||
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
|
||||
|
||||
|
@ -1109,11 +1129,11 @@ else:
|
|||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
########
|
||||
##########################################################################
|
||||
# combine the wild_type+poistion+mutant_type columns to generate
|
||||
# mutationinformation (matches mCSM output field)
|
||||
# Remember to use .map(str) for int col types to allow string concatenation
|
||||
#########
|
||||
###########################################################################
|
||||
gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
|
||||
print('Created column: mutationinformation'
|
||||
, '\n=====================================================================\n'
|
||||
|
@ -1138,7 +1158,6 @@ bar = gene_LF1['position'].value_counts()
|
|||
#else:
|
||||
# print('FAIL: df could not be ordered. Check source')
|
||||
# sys.exit()
|
||||
|
||||
#%% Create a copy of mutationinformation column for downstream mergeing
|
||||
gene_LF1['Mut'] = gene_LF1['mutationinformation']
|
||||
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
|
||||
|
@ -1150,7 +1169,7 @@ gene_LF1['index_orig_copy'] = gene_LF1.index
|
|||
all(gene_LF1.index.values == gene_LF1['index_orig'].values)
|
||||
all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
|
||||
|
||||
#%% quick sanity check for position freq count
|
||||
#%% Quick sanity check for position freq count
|
||||
# count the freq of 'other muts' samples
|
||||
test_df = gene_LF1.copy()
|
||||
test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
|
||||
|
@ -1158,6 +1177,7 @@ test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'positi
|
|||
#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
|
||||
#print('Revised dim of other_muts_df:',test_df.shape)
|
||||
test_df['scount'] = test_df['mutation'].map(SnpFDict)
|
||||
test_df = test_df.sort_values(['position', 'mutationinformation'])
|
||||
#%% Map mutation frequency count as a column
|
||||
gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
|
||||
#%% Map position frequency count as a column
|
||||
|
@ -1166,75 +1186,151 @@ z1 = z.to_dict()
|
|||
gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
|
||||
|
||||
#test_df2 = test_df.loc[test_df['position'] == 10]
|
||||
|
||||
#%% OUTFILE 4, write file mCSM style muts
|
||||
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
|
||||
snps_only.head()
|
||||
# assign column name
|
||||
snps_only.columns = ['mutationinformation']
|
||||
|
||||
# count how many positions this corresponds to
|
||||
pos_only = pd.DataFrame(gene_LF1['position'].unique())
|
||||
pos_only
|
||||
|
||||
print('Checking NA in snps...')# should be 0
|
||||
if snps_only.mutationinformation.isna().sum() == 0:
|
||||
print ('PASS: NO NAs/missing entries for SNPs'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
|
||||
|
||||
# write file: mCSM muts
|
||||
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
|
||||
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: mCSM style muts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_mcsmsnps
|
||||
, '\nmutation format (SNP): {WT}<POS>{MUT}'
|
||||
, '\nNo. of distinct muts:', len(snps_only)
|
||||
, '\nNo. of distinct positions:', len(pos_only)
|
||||
, '\n=============================================================')
|
||||
|
||||
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
|
||||
|
||||
print('Finished writing:', outfile_mcsmsnps
|
||||
, '\nNo. of rows:', len(snps_only)
|
||||
, '\nNo. of cols:', len(snps_only.columns)
|
||||
, '\n=============================================================')
|
||||
del(out_filename_mcsmsnps)
|
||||
|
||||
#%% OUTFILE 5, write file frequency of position counts: MOVE TO THE END
|
||||
metadata_pos = pd.DataFrame(gene_LF1['position'])
|
||||
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
|
||||
metadata_pos['meta_pos_count'].value_counts()
|
||||
|
||||
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
|
||||
|
||||
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
|
||||
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
|
||||
print('\n----------------------------------'
|
||||
, '\nWriting file: Metadata poscounts'
|
||||
, '\n----------------------------------'
|
||||
, '\nFile:', outfile_metadata_poscounts
|
||||
, '\n============================================================')
|
||||
|
||||
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
|
||||
print('Finished writing:', outfile_metadata_poscounts
|
||||
, '\nNo. of rows:', len(metadata_pos)
|
||||
, '\nNo. of cols:', len(metadata_pos.columns)
|
||||
, '\n=============================================================')
|
||||
del(out_filename_metadata_poscounts)
|
||||
#%% OUTFILE 6, write file <gene>_metadata: MOVE TO THE END
|
||||
# ---THINK ---
|
||||
#%% OUTFILE 7, write file MSA plots: MOVE TO THE END
|
||||
# -- THINK ---
|
||||
#%% OUTFILE 8, write file mutational position with count: MOVE TO THE END
|
||||
# -- THINK ---
|
||||
###############################################################################
|
||||
###############################################################################
|
||||
#%% Add column: aa property_water
|
||||
#=========
|
||||
# iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_prop_water}
|
||||
# Do this for both wild_type and mutant as above.
|
||||
#=========
|
||||
print('Adding', ncol_aa_add, 'more cols:\n')
|
||||
|
||||
# initialise a sub dict that is lookup dict for three letter code to aa prop
|
||||
# adding two more cols
|
||||
lookup_dict = dict()
|
||||
for k, v in my_aa_dict.items():
|
||||
lookup_dict[k] = v['aa_prop_water']
|
||||
#print(lookup_dict)
|
||||
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
|
||||
gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
|
||||
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
|
||||
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
|
||||
gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
|
||||
|
||||
mylen2 = len(gene_LF1.columns)
|
||||
|
||||
# sanity checks
|
||||
print('checking if 3-letter wt&mut residue extraction worked correctly')
|
||||
if wt.isna().sum() & mut.isna().sum() == 0:
|
||||
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
else:
|
||||
print('FAIL: 3-letter wt&mut residue extraction failed'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
|
||||
if mylen2 == mylen1 + ncol_aa_add:
|
||||
print('PASS: successfully added', ncol_aa_add, 'cols'
|
||||
, '\nold length:', mylen1
|
||||
, '\nnew len:', mylen2)
|
||||
else:
|
||||
print('FAIL: failed to add cols:'
|
||||
, '\nold length:', mylen1
|
||||
, '\nnew len:', mylen2)
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
#%% Add column: aa_prop_polarity
|
||||
#========
|
||||
# iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_prop_polarity}
|
||||
# Do this for both wild_type and mutant as above.
|
||||
#=========
|
||||
print('Adding', ncol_aa_add, 'more cols:\n')
|
||||
|
||||
# initialise a sub dict that is lookup dict for three letter code to aa prop
|
||||
# adding two more cols
|
||||
lookup_dict = dict()
|
||||
for k, v in my_aa_dict.items():
|
||||
lookup_dict[k] = v['aa_prop_polarity']
|
||||
#print(lookup_dict)
|
||||
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
|
||||
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
|
||||
#mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
|
||||
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
|
||||
gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
|
||||
|
||||
mylen3 = len(gene_LF1.columns)
|
||||
|
||||
# sanity checks
|
||||
print('checking if 3-letter wt&mut residue extraction worked correctly')
|
||||
if wt.isna().sum() & mut.isna().sum() == 0:
|
||||
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
else:
|
||||
print('FAIL: 3-letter wt&mut residue extraction failed'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
|
||||
if mylen3 == mylen2 + ncol_aa_add:
|
||||
print('PASS: successfully added', ncol_aa_add, 'cols'
|
||||
, '\nold length:', mylen1
|
||||
, '\nnew len:', mylen2)
|
||||
else:
|
||||
print('FAIL: failed to add cols:'
|
||||
, '\nold length:', mylen1
|
||||
, '\nnew len:', mylen2)
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
#%% Add column: aa_calcprop
|
||||
#========
|
||||
# iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_calcprop}
|
||||
# Do this for both wild_type and mutant as above.
|
||||
#=========
|
||||
print('Adding', ncol_aa_add, 'more cols:\n')
|
||||
|
||||
lookup_dict = dict()
|
||||
for k, v in my_aa_dict.items():
|
||||
lookup_dict[k] = v['aa_calcprop']
|
||||
#print(lookup_dict)
|
||||
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
|
||||
gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
|
||||
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
|
||||
gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
|
||||
|
||||
mylen4 = len(gene_LF1.columns)
|
||||
|
||||
# sanity checks
|
||||
print('checking if 3-letter wt&mut residue extraction worked correctly')
|
||||
if wt.isna().sum() & mut.isna().sum() == 0:
|
||||
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
else:
|
||||
print('FAIL: 3-letter wt&mut residue extraction failed'
|
||||
, '\nNo NAs detected:'
|
||||
, '\nwild-type\n', wt
|
||||
, '\nmutant-type\n', mut
|
||||
, '\ndim of df:', gene_LF1.shape)
|
||||
|
||||
if mylen4 == mylen3 + ncol_aa_add:
|
||||
print('PASS: successfully added', ncol_aa_add, 'cols'
|
||||
, '\nold length:', mylen3
|
||||
, '\nnew len:', mylen4)
|
||||
else:
|
||||
print('FAIL: failed to add cols:'
|
||||
, '\nold length:', mylen3
|
||||
, '\nnew len:', mylen4)
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
#%% NEW mappings: gene_LF2
|
||||
# gene_LF2: copy gene_LF1
|
||||
gene_LF2 = gene_LF1.copy()
|
||||
|
@ -1267,6 +1363,8 @@ gene_LF2['total_id_ucount'] = total_id_ucount
|
|||
# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
|
||||
# gene_LF2['mut_id_ucount']
|
||||
|
||||
gene_LF2['snp_frequency'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
|
||||
|
||||
# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
|
||||
|
||||
#%% AF for gene
|
||||
|
@ -1275,7 +1373,7 @@ gene_LF2['total_id_ucount'] = total_id_ucount
|
|||
#=================
|
||||
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
|
||||
gene_LF2['maf'].head()
|
||||
|
||||
###############################################################################
|
||||
#%% Further mappings: gene_LF3, with mutationinformation as INDEX
|
||||
gene_LF3 = gene_LF2.copy()
|
||||
|
||||
|
@ -1324,6 +1422,7 @@ gene_LF3['drtype_all_vals'].head()
|
|||
|
||||
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
|
||||
gene_LF3['drtype_all_names'].head()
|
||||
|
||||
#---------------------------------
|
||||
# Revised drtype: max(Multimode)
|
||||
#--------------------------------
|
||||
|
@ -1406,6 +1505,7 @@ gene_LF3['dst_multimode_all']
|
|||
# Fill using multimode ONLY where NA in dst_multimode column
|
||||
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
|
||||
gene_LF3['dst_multimode']
|
||||
gene_LF3['dst_multimode'].value_counts()
|
||||
|
||||
#----------------------------------
|
||||
# Revised dst column: max of mode
|
||||
|
@ -1422,6 +1522,11 @@ gene_LF3[drug].value_counts()
|
|||
#gene_LF3['dst_noNA'].value_counts()
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
|
||||
if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['dst_mode'].value_counts()[1] > gene_LF3[drug].value_counts()[1]) and gene_LF3['dst_mode'].value_counts()[0] > gene_LF3[drug].value_counts()[0]:
|
||||
print('\nPASS: revised dst mode created successfully and the counts are more than <drug> col count')
|
||||
else:
|
||||
print('\nFAIL: revised dst mode numbers MISmatch')
|
||||
|
||||
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
|
||||
foo2 = foo.sort_values(['position', 'Mut'])
|
||||
|
||||
|
@ -1460,6 +1565,7 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
|
|||
# Now overwrite
|
||||
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
|
||||
|
||||
if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
|
||||
print('\nPASS: Revised mutation_info column created successfully')
|
||||
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
|
||||
|
@ -1480,6 +1586,7 @@ gene_LF3['mutation_info_labels_orig'].value_counts()
|
|||
# %% sanity check for the revised dst
|
||||
gene_LF3[drug].value_counts()
|
||||
gene_LF3[drug].value_counts().sum()
|
||||
|
||||
gene_LF3['mutation_info_labels_orig'].value_counts()
|
||||
|
||||
gene_LF3['dst_mode'].value_counts()
|
||||
|
@ -1488,6 +1595,8 @@ gene_LF3['dst_mode'].value_counts().sum()
|
|||
# direct comparision
|
||||
gene_LF3['dst'].value_counts()
|
||||
gene_LF3['mutation_info_labels'].value_counts()
|
||||
gene_LF3['mutation_info_labels_v1'].value_counts()
|
||||
|
||||
#%% Lineage
|
||||
gene_LF3['lineage'].value_counts()
|
||||
# lineage_label_numeric = {'lineage1' : 1
|
||||
|
@ -1509,24 +1618,52 @@ lineage_label_numeric = {'L1' : 1
|
|||
, 'LBOV' : 8}
|
||||
|
||||
lineage_label_numeric
|
||||
# copy column to allow cross checks after stripping white space
|
||||
gene_LF3['lineage'] = gene_LF3['lineage'].str.strip()
|
||||
gene_LF3['lineage_corrupt'] = gene_LF3['lineage']
|
||||
#all(gene_LF3['lineage_corrupt'].value_counts() ==gene_LF3['lineage'].value_counts())
|
||||
gene_LF3['lineage_corrupt'].value_counts()
|
||||
#%% Lineage cols selected: gene_LF3_ColsSel
|
||||
gene_LF3_ColsSel = gene_LF3.copy()
|
||||
gene_LF3_ColsSel.index
|
||||
gene_LF3_ColsSel.columns
|
||||
#gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig'] # delete
|
||||
|
||||
#%%tidy_split(): Lineage
|
||||
# copy column to allow cross checks after stripping white space
|
||||
gene_LF3_ColsSel['lineage'] = gene_LF3_ColsSel['lineage'].str.strip()
|
||||
gene_LF3_ColsSel['lineage_corrupt'] = gene_LF3_ColsSel['lineage']
|
||||
all(gene_LF3_ColsSel['lineage_corrupt'].value_counts() == gene_LF3_ColsSel['lineage'].value_counts())
|
||||
gene_LF3_ColsSel['lineage_corrupt'].value_counts()
|
||||
|
||||
gene_LF3_ColsSel = gene_LF3_ColsSel[['index_orig_copy', 'Mut', 'position', 'snp_frequency', 'lineage', 'lineage_corrupt']]
|
||||
gene_LF3_ColsSel.columns
|
||||
|
||||
#%% tidy_split(): Lineage
|
||||
# Create df with tidy_split: lineage
|
||||
lf_lin_split = tidy_split(gene_LF3, 'lineage_corrupt', sep = ';')
|
||||
lf_lin_split = tidy_split(gene_LF3_ColsSel, 'lineage_corrupt', sep = ';')
|
||||
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
|
||||
lf_lin_split['lineage_corrupt'].value_counts()
|
||||
#%% Important sanity check for tidy_split(): lineage
|
||||
# Split based on semi colon dr_muts_col
|
||||
search = ";"
|
||||
|
||||
# Map lineage labels to numbers to allow metrics
|
||||
# count of occurrence of ";" in dr_muts_col: No.of semicolons + 1 is no. of rows created * occurence
|
||||
count_df_lin = gene_LF3_ColsSel[['lineage']]
|
||||
count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)
|
||||
lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
|
||||
lin_sc_C
|
||||
|
||||
lin_sc_C['index_semicolon'] = (lin_sc_C['index'] + 1) * lin_sc_C['lineage_semicolon_count']
|
||||
lin_sc_C
|
||||
expected_linC = lin_sc_C['index_semicolon'].sum() + gene_LF3_ColsSel['lineage'].isnull().sum()
|
||||
expected_linC
|
||||
|
||||
if (len(lf_lin_split) == expected_linC):
|
||||
print('\nPASS: tidy_split() length match for lineage')
|
||||
else:
|
||||
sys.exit('\nFAIL: tidy_split() length MISMATCH. Check lineage semicolon count')
|
||||
###############################################################################
|
||||
#%% Map lineage labels to numbers to allow metrics
|
||||
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
|
||||
lf_lin_split['lineage_numeric'].value_counts()
|
||||
|
||||
#--------------------------------
|
||||
# Lineage_corrupt ALL values:
|
||||
# Add lineage_list: ALL values:
|
||||
#--------------------------------
|
||||
# Add all lineages for each mutation
|
||||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
|
||||
|
@ -1535,12 +1672,21 @@ lf_lin_split['lineage_corrupt_list'].value_counts()
|
|||
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
|
||||
lf_lin_split['lineage_corrupt_list'].value_counts()
|
||||
|
||||
#--------------------------------
|
||||
# Add lineage count: ALL
|
||||
#--------------------------------
|
||||
lf_lin_split['lineage_corrupt_count'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : len(list(x)))
|
||||
|
||||
#--------------------------------
|
||||
# Add lineage unique count
|
||||
#--------------------------------
|
||||
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
|
||||
#lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt']
|
||||
lf_lin_split['lineage_corrupt_ucount'].value_counts()
|
||||
|
||||
#--------------------------------
|
||||
# Add lineage_set
|
||||
#--------------------------------
|
||||
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
|
||||
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
|
||||
|
||||
|
@ -1551,95 +1697,275 @@ lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')[
|
|||
lf_lin_split['lineage_multimode'].value_counts()

# cant take max as it doesn't mean anything for lineage labels!
foo = lf_lin_split[['Mut', 'lineage_ulist']]

###############################################################################
#%% Final merge: gene_LF4 with lineage_split_df
gene_LF4 = gene_LF3.copy()
gene_LF4.index
# Preserve the original index so it survives the index shuffling below.
gene_LF4['index_orig_copy'] = gene_LF4['index_orig']
#%% Select only the columns you want to merge from lf_lin_split
lf_lin_split.columns
# NOTE(review): this subset is taken BEFORE lf_lin_split is re-indexed below,
# and is then renamed inplace — pandas may emit a SettingWithCopy warning here;
# confirm the rename sticks on a real run.
lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list','lineage_corrupt_count'
                                    , 'lineage_corrupt_ucount' ,'lineage_ulist', 'lineage_multimode']]
lf_lin_split_ColSel.columns

lf_lin_split['index_orig_copy'] = lf_lin_split['index_orig']
#================================
# Merge gene_LF3 with
# lf_lin_split based on index
#================================
# Re-index lf_lin_split on the preserved original index.
lf_lin_split.index
lf_lin_split.reset_index(inplace=True)
lf_lin_split['mutationinformation']
lf_lin_split = lf_lin_split.set_index(['index_orig_copy'])
lf_lin_split.index
# Friendlier output names for the merged lineage summary columns.
lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list' : 'lineage_list_all'
                                      , 'lineage_corrupt_count' : 'lineage_count_all'
                                      , 'lineage_corrupt_ucount' : 'lineage_count_unique'
                                      , 'lineage_ulist' : 'lineage_list_unique'
                                      , 'lineage_multimode' : 'lineage_multimode'}, inplace = True)

# Re-index gene_LF4 on the same preserved original index.
gene_LF4.index
gene_LF4.reset_index(inplace=True)
gene_LF4.index
gene_LF4['mutationinformation']
gene_LF4['index_orig_copy']
gene_LF4 = gene_LF4.set_index(['index_orig_copy'])
gene_LF4.index
lf_lin_split_ColSel.columns
# Expected number of columns the final join should add.
ncols_to_merge = len(lf_lin_split_ColSel.columns)

#-------------------------
# column lineage_ucount:
# contribution of each distinct lineage
#-------------------------
# Placeholder; overwritten with the real unique counts after the merge.
gene_LF4['lineage_ucount'] = gene_LF4['lineage']
#%% Final merge: gene_LF3 with lf_lin_split_ColSel: gene_LF4
#===============================
# merge: inner join by default
# join: left join default
# concat: outer join by default
#df1.join(df2)
#===============================
len(lf_lin_split)
len(gene_LF3)

#-------------------------
# column lineage list:
# placeholders, overwritten with real lists after the merge
#-------------------------
#gene_LF4['lineage_set'] = gene_LF4['lineage']
gene_LF4['lineage_ulist'] = gene_LF4['lineage']

gene_LF4['lineage_list'] = gene_LF4['lineage']

#-------------------------
# column lineage_list mode:
#-------------------------
gene_LF4['lineage_mode'] = gene_LF4['lineage']

# Diagnostics before merging on indices: both dfs should cover the same labels.
gene_LF4.index.nunique()
lf_lin_split.index.nunique()
all(gene_LF4.index.isin(lf_lin_split.index))
all(lf_lin_split.index.isin(gene_LF4.index))
gene_LF4.index
lf_lin_split.index
# Pre-merge gate: both dfs must index the same set of mutations before the
# lineage summary columns can be merged into gene_LF4.
if (gene_LF4.index.nunique() == lf_lin_split.index.nunique()) and ( all(gene_LF4.index.isin(lf_lin_split.index)) == all(lf_lin_split.index.isin(gene_LF4.index)) ):
    print('\nPass: merging lineage_ucount with gene_LF4')
    # Dropping duplicates from lineage df: keep one summary row per index label.
    lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()]
    lf_lin_split_U = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()]
    # NOTE(review): snps_only is assumed to be defined in an earlier section
    # of the script (it is rebuilt later from gene_LF4) — TODO confirm.
    if len(lf_lin_split_U) == len(snps_only):
        print('\nPASS: Duplicate entries removed from lf_lin'
              , '\nReady to start the final merge to generate gene_LF4')
    else:
        # BUG FIX: the original exited with the index-mismatch message here and
        # left this FAIL message on an unreachable line after sys.exit().
        sys.exit('\nFAIL: DF after duplicate removal numbers MISmatch ')
else:
    sys.exit('\nFail: Indices mismatch, cannot merge! Quitting!')
###########################
# magic merge happens here
###########################
# FIXME: add check here to see if the duplicated indices rows are actual duplicates as the cols I need should be summary cols
# BUG FIX: removed two no-op statements from the original:
#   lf_lin_split.index.drop_duplicates(keep='first')  -- return value discarded
#   lf_lin_split = lf_lin_split                       -- self-assignment
# The effective de-duplication is the boolean-mask selection below.
# NOTE(review): this rebinds lf_lin_split_U to the FULL-column frame,
# replacing the column-subset version built just above — confirm intended.
lf_lin_split_U = lf_lin_split[~lf_lin_split.index.duplicated(keep='first')]
lf_lin_split_U.shape
gene_LF3.index
lf_lin_split_U.index

# Overwrite the placeholder with the real per-mutation unique lineage count.
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ucount'] = lf_lin_split_U['lineage_corrupt_ucount']
gene_LF4['lineage_ucount'].value_counts()
#--------------------
# Creating gene_LF4
#--------------------
if all(gene_LF3.index.isin(lf_lin_split_U.index)) :
    print('\nIndices match, staring final merge...')
    # Left join on the shared index brings the lineage summary columns in.
    gene_LF4 = gene_LF3.join(lf_lin_split_U)
    # NOTE(review): ncols_to_merge was computed from the 5-column ColSel subset;
    # if lf_lin_split_U carries more columns than that, this check fails —
    # confirm which frame lf_lin_split_U holds at this point.
    if len(gene_LF4.columns) == len(gene_LF3.columns) + ncols_to_merge:
        print('\nPASS: gene_LF4 created after successfully merging with lineage info'
              , '\nShape of gene_LF4:', gene_LF4.shape)
    else:
        sys.exit('\nFAIL: gene_LF4 could not be created. Dfs could not be merged. Check indices or dfs length again!')

#gene_LF4.loc[lf_lin_split_U.index, 'lineage_set'] = lf_lin_split_U['lineage_set']
#gene_LF4['lineage_set'].value_counts()
# Overwrite the placeholder with the real unique-lineage lists.
gene_LF4.loc[lf_lin_split_U.index, 'lineage_ulist'] = lf_lin_split_U['lineage_ulist']
gene_LF4['lineage_ulist'].value_counts()
#----------------------------------------
# Dropping redundant cols from gene_LF4
#----------------------------------------
if all(gene_LF4.index == gene_LF4['Mut']) and gene_LF4['index_orig'].equals(gene_LF4['index_orig_copy']):
    gene_LF4.reset_index(inplace=True)
    #gene_LF4 = gene_LF4.set_index(['index_orig'])
    print('\nPass Mutationinformationa and index columns checked, the duplicates can now be dropped')
    gene_LF4 = gene_LF4.drop(['Mut_copy', 'index_orig_copy', 'mutation_info_v1' ], axis = 1)

# NOTE(review): if the reset_index above ran, gene_LF4 now has a RangeIndex,
# so these label-based .loc assignments may no longer align with
# lf_lin_split_U.index — verify on a real run.
gene_LF4.loc[lf_lin_split_U.index, 'lineage_list'] = lf_lin_split_U['lineage_corrupt_list']
gene_LF4['lineage_list'].value_counts()
#%% Final output with relevant columns
print('\nFinal output file: Dim of gene_LF4:', gene_LF4.shape)

gene_LF4.loc[lf_lin_split_U.index, 'lineage_mode'] = lf_lin_split_U['lineage_multimode']
gene_LF4['lineage_mode'].value_counts()
# cols_to_output = ['mutationinformation', 'id', 'sample', 'lineage', 'sublineage',
#                   'country_code', 'drtype', 'pyrazinamide', 'mutation', 'drug_name',
#                   'mutation_info', 'mutation_info_orig', 'wild_type', 'mutant_type',
#                   'position', 'Mut', 'index_orig', 'pos_count', 'total_id_ucount',
#                   'snp_frequency', 'maf', 'drtype_numeric', 'drtype_all_vals',
#                   'drtype_all_names', 'drtype_multimode', 'drtype_mode', 'drtype_max',
#                   'mutation_info_labels', 'dm_om_numeric', 'dm_om_numeric_orig', 'dst',
#                   'dst_multimode', 'dst_multimode_all', 'dst_mode', 'lineage_list_all',
#                   'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique',
#                   'lineage_multimode']
#%% OUTFILE 3, write file mCSM style muts
# One row per distinct missense mutation, mCSM format {WT}<POS>{MUT}.
snps_only = pd.DataFrame(gene_LF4['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']

# Quick inspection frame of the lineage summary columns.
foo = gene_LF4[['mutationinformation', 'lineage', 'lineage_ucount'
                #, 'lineage_set'
                , 'lineage_ulist'
                , 'lineage_mode'
                , 'lineage_list']]
#%%
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF4['position'].unique())
pos_only

#Subset relevant columns for output and put the rest of the output here
print('Checking NA in snps...')# should be 0
if snps_only.mutationinformation.isna().sum() == 0:
    print ('PASS: NO NAs/missing entries for SNPs'
           , '\n===============================================================')
else:
    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps

print('\n----------------------------------'
      , '\nWriting file: mCSM style muts'
      , '\n----------------------------------'
      , '\nFile:', outfile_mcsmsnps
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of distinct muts:', len(snps_only)
      , '\nNo. of distinct positions:', len(pos_only)
      , '\n=============================================================')

# No header/index: downstream mCSM tooling expects bare mutation strings.
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)

print('Finished writing:', outfile_mcsmsnps
      , '\nNo. of rows:', len(snps_only)
      , '\nNo. of cols:', len(snps_only.columns)
      , '\n=============================================================')
del out_filename_mcsmsnps
###############################################################################
#%% OUTFILE 4, write file frequency of position count
# Per-row position with its metadata count (z1: position -> count mapping
# built earlier in the script), sorted most-frequent first.
metadata_pos = pd.DataFrame(gene_LF4['position'])
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()

metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)

out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
      , '\nWriting file: Metadata poscounts'
      , '\n----------------------------------'
      , '\nFile:', outfile_metadata_poscounts
      , '\n============================================================')

metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
      , '\nNo. of rows:', len(metadata_pos)
      , '\nNo. of cols:', len(metadata_pos.columns)
      , '\n=============================================================')
del out_filename_metadata_poscounts
###############################################################################
#%% OUTFILE 5, write file <gene>_metadata
# where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata.csv'
outfile_metadata = outdir + '/' + out_filename_metadata
print('\n----------------------------------'
      , '\nWriting file: LF formatted data'
      , '\n----------------------------------'
      , '\nFile:', outfile_metadata
      , '\n============================================================')

gene_LF4.to_csv(outfile_metadata, header = True, index = False)
print('Finished writing:', outfile_metadata
      , '\nNo. of rows:', len(gene_LF4)
      , '\nNo. of cols:', len(gene_LF4.columns)
      , '\n=============================================================')
del out_filename_metadata
###############################################################################
#%% OUTFILE 6, write file MSA plots
# mCSM-style muts but WITH repetitions, for MSA and logo plots.
all_muts_msa = pd.DataFrame(gene_LF4['mutationinformation'])
all_muts_msa.head()

# assign column name
all_muts_msa.columns = ['mutationinformation']

# make sure it is string
all_muts_msa.columns.dtype

# sort the column
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')

# create an extra column with protein name
#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
#all_muts_msa_sorted.head()

# rearrange columns so the fasta name is the first column (required for mutate.script)
#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()

print('Checking NA in snps...')# should be 0
if all_muts_msa.mutationinformation.isna().sum() == 0:
    print ('PASS: NO NAs/missing entries for SNPs'
           , '\n===============================================================')
else:
    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')

out_filename_msa = gene.lower() +'_all_muts_msa.csv'
outfile_msa = outdir + '/' + out_filename_msa

print('\n----------------------------------'
      , '\nWriting file: mCSM style muts for msa'
      , '\n----------------------------------'
      , '\nFile:', outfile_msa
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo.of lines of msa:', len(all_muts_msa))

all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)

print('Finished writing:', outfile_msa
      , '\nNo. of rows:', len(all_muts_msa)
      , '\nNo. of cols:', len(all_muts_msa.columns)
      , '\n=============================================================')

del out_filename_msa
###############################################################################
#%% OUTFILE 7, write file mutational position with count
# write file for mutational positions
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF4['position'].unique())
# assign column name
pos_only.columns = ['position']
# make sure dtype of column position is int or numeric and not string
pos_only.position.dtype
pos_only['position'] = pos_only['position'].astype(int)
pos_only.position.dtype

# sort by position value
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)

# NOTE(review): 'positons' is misspelt in the filename; kept as-is because
# downstream scripts may expect this exact name — confirm before fixing.
out_filename_pos = gene.lower() + '_mutational_positons.csv'
outfile_pos = outdir + '/' + out_filename_pos

print('\n----------------------------------'
      , '\nWriting file: mutational positions'
      , '\n----------------------------------'
      , '\nFile:', outfile_pos
      , '\nNo. of distinct positions:', len(pos_only_sorted)
      , '\n=============================================================')

pos_only_sorted.to_csv(outfile_pos, header = True, index = False)

print('Finished writing:', outfile_pos
      , '\nNo. of rows:', len(pos_only_sorted)
      , '\nNo. of cols:', len(pos_only_sorted.columns)
      , '\n============================================================='
      , '\n\n\n')

del out_filename_pos
###############################################################################
#%% Quick summary output
print('============================================'
      , '\nQuick summary output for', drug, 'and' , gene.lower()
      , '\n============================================'
      , '\nTotal samples:', total_samples
      , '\nNo. of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts()
      , '\n'
      , '\nPercentage of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
      , '\n'

      , '\nNo. of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts()
      , '\n'
      , '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
      , '\n'

      , '\nTotal no. of unique snps:', len(snps_only)
      , '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()

      , '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD
      , '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD

      , '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
      , '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
      , '\nTotal no. of samples with missense muts:', len(gene_LF4)
      , '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()
      , '\n')

# BUG FIX: the original condition was
#   dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0
# which parses as (sum1 & sum2) > 0 — a BITWISE AND of the two counts that can
# be 0 even when both counts are positive (e.g. 1 & 2 == 0). The intent is
# "both directions have at least one shared mutation".
if (dr_muts.isin(other_muts).sum() > 0) and (other_muts.isin(dr_muts).sum() > 0):
    print('\nTotal no.of samples with ambiguous muts:', len(inspect)
          #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
          , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
          , '\n============================================================='
          , '\nPost resolving ambiguity\n'
          , ambig_muts_rev_df['mutation_info_REV'].value_counts())
#=======================================================================
print(u'\u2698' * 50,
      '\nEnd of script: Data extraction and writing files'
      '\n' + u'\u2698' * 50 )
#%% end of script
Loading…
Add table
Add a link
Reference in a new issue