done but getting an error when running from cmd

This commit is contained in:
Tanushree Tunstall 2022-04-26 17:03:40 +01:00
parent 29e9d10e39
commit ac0d14e116

710
scripts/data_extraction.py Normal file → Executable file
View file

@ -314,7 +314,7 @@ print('===========================================================\n'
#meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] #meta_data_gene = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
# so downstream code doesn't change # so downstream code doesn't change
meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ] meta_gene_all = meta_data.loc[ (meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) | (meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ) ]
#%% DF: with dr_muts_col #%% DF: with dr_muts_col, meta_gene_dr
meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_dr = meta_gene_all.loc[meta_gene_all[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_dr1 = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
@ -327,7 +327,7 @@ else:
, '\nshape of df2:', meta_gene_dr1.shape , '\nshape of df2:', meta_gene_dr1.shape
, '\nCheck again!') , '\nCheck again!')
############################################################################## ##############################################################################
#%% DF: with other_muts_col #%% DF: with other_muts_col, other_gene_dr
meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_other = meta_gene_all.loc[meta_gene_all[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] meta_gene_other1 = meta_data.loc[meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)]
@ -365,11 +365,11 @@ print('===========================================================\n'
, '\n===========================================================') , '\n===========================================================')
############################################################################### ###############################################################################
#%% Create a copy of indices for downstream mergeing #%% Create a copy of indices for downstream mergeing
meta_gene_all['index_orig'] = meta_gene_all.index #meta_gene_all['index_orig'] = meta_gene_all.index
meta_gene_all['index_orig_copy'] = meta_gene_all.index #meta_gene_all['index_orig_copy'] = meta_gene_all.index
all(meta_gene_all.index.values == meta_gene_all['index_orig'].values) #all(meta_gene_all.index.values == meta_gene_all['index_orig'].values)
all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values) #all(meta_gene_all.index.values == meta_gene_all['index_orig_copy'].values)
############################################################################## ##############################################################################
#%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc. #%% Important sanity checks: Dr muts column for tidy split(), nsSNPs, etc.
# Split based on semi colon dr_muts_col # Split based on semi colon dr_muts_col
@ -456,13 +456,13 @@ del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
# DF1: dr_muts_col # DF1: dr_muts_col
# and remove leading white spaces # and remove leading white spaces
#========= #=========
col_to_split1 = dr_muts_col #col_to_split1 = dr_muts_col
print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape
, '\ncolumn name to apply tidy_split():' , col_to_split1 , '\ncolumn name to apply tidy_split():' , dr_muts_col
, '\n============================================================') , '\n============================================================')
# apply tidy_split() # apply tidy_split()
dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') dr_WF0 = tidy_split(meta_gene_dr, dr_muts_col, sep = ';')
# remove leading white space else these are counted as distinct mutations as well # remove leading white space else these are counted as distinct mutations as well
dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip()
@ -579,13 +579,13 @@ del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt )
# DF2: other_muts_col # DF2: other_muts_col
# and remove leading white spaces # and remove leading white spaces
#========= #=========
col_to_split2 = other_muts_col #col_to_split2 = other_muts_col
print ('applying second tidy split() separately on other muts df', meta_gene_other.shape print ('applying second tidy split() separately on other muts df', meta_gene_other.shape
, '\ncolumn name to apply tidy_split():', col_to_split2 , '\ncolumn name to apply tidy_split():', other_muts_col
, '\n============================================================') , '\n============================================================')
# apply tidy_split() # apply tidy_split()
other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') other_WF1 = tidy_split(meta_gene_other, other_muts_col, sep = ';')
# remove the leading white spaces in the column # remove the leading white spaces in the column
other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip()
@ -683,7 +683,7 @@ SnpFDict = lower_dict(SnpFDict_orig)
############################################################################### ###############################################################################
# USE Vcounts to get expected curated df # USE Vcounts to get expected curated df
# resolve dm om muts funda # resolve dm om muts funda
#%%#%% Concatenating two dfs: gene_LF0 #%% Concatenating two dfs: gene_LF0
# equivalent of rbind in R # equivalent of rbind in R
#========== #==========
# Concatentating the two dfs: equivalent of rbind in R # Concatentating the two dfs: equivalent of rbind in R
@ -922,13 +922,20 @@ ambig_muts_rev_df = pd.DataFrame()
changes_val = [] changes_val = []
changes_dict = {} changes_dict = {}
for i in common_muts: ##BROKENNNN!!!!
#print(i)
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
common_muts_lower = list((map(lambda x: x.lower(), common_muts)))
common_muts_lower
##BROKENNNN!!!!
#for i in common_muts:
for i in common_muts_lower:
print(i)
temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']]
temp_df
# DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending
max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts() max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info_orig']].value_counts()
max_info_table
revised_label = max_info_table[[0]].index[0][1] # max value_count revised_label = max_info_table[[0]].index[0][1] # max value_count
old_label = max_info_table[[1]].index[0][1] # min value_count old_label = max_info_table[[1]].index[0][1] # min value_count
print('\nAmbiguous mutation labels...' print('\nAmbiguous mutation labels...'
@ -948,12 +955,12 @@ ambig_muts_rev_df['mutation_info_orig'].value_counts()
changes_val changes_val
changes_total = sum(changes_val) changes_total = sum(changes_val)
changes_dict changes_dict
#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts #%% OUTFILE 1, write file: ambiguous muts and ambiguous mut counts
#=================== #==================
# ambiguous muts # ambiguous muts
#======================= #==================
#dr_muts.to_csv('dr_muts.csv', header = True) #dr_muts.XXX_csvXXXX('dr_muts.csv', header = True)
#other_muts.to_csv('other_muts.csv', header = True) #other_muts.XXXX_csvXXX('other_muts.csv', header = True)
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
@ -972,19 +979,20 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
, '\n=============================================================') , '\n=============================================================')
del(out_filename_ambig_muts) del(out_filename_ambig_muts)
#%% FIXME: ambig mut counts #%% OUTFILE 2, write file: ambiguous mut counts
#======================= #======================
# ambiguous mut counts # ambiguous mut counts
#======================= #======================
out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv'
outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts
print('\n----------------------------------' print('\n----------------------------------'
, '\nWriting file: ambiguous muts' , '\nWriting file: ambiguous mut counts'
, '\n----------------------------------' , '\n----------------------------------'
, '\nFilename:', outfile_ambig_mut_counts) , '\nFilename:', outfile_ambig_mut_counts)
ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True)
#%%FIXME: TODO: Add sanity check to make sure you can add value_count checks #%% FIXME: Add sanity check to make sure you can add value_count checks
print('\nREACHED here...................>>>>>>>')
#%% Resolving ambiguous muts #%% Resolving ambiguous muts
# Merging ambiguous muts # Merging ambiguous muts
#================= #=================
@ -995,6 +1003,12 @@ ambig_muts_rev_df.index
gene_LF1.index gene_LF1.index
all(ambig_muts_rev_df.index.isin(gene_LF1.index)) all(ambig_muts_rev_df.index.isin(gene_LF1.index))
any(gene_LF1.index.isin(ambig_muts_rev_df.index))
if(gene_LF1.index.unique().isin(ambig_muts_rev_df.index).sum() == len(ambig_muts_rev_df)):
print('\nPASS: ambiguous mut indices present in gene_LF1. Prepare to merge...')
else:
sys.exit('\nFAIL:ambiguous mut indices MISmatch. Check section Resolving ambiguous muts')
#gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV'] #gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info_v1'] = ambig_muts_rev_df['mutation_info_REV']
gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] gene_LF1.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV']
@ -1024,9 +1038,9 @@ gene_LF1['mutation_info'].value_counts()
#%% PHEW! Good to go for downstream stuff #%% PHEW! Good to go for downstream stuff
#%% Add column: Mutationinformation #%% Add column: Mutationinformation
# splitting mutation column to get mCSM style muts # splitting mutation column to get mCSM style muts
#======================= #=====================================================
# Formatting df: read aa dict and pull relevant info # Formatting df: read aa dict and pull relevant info
#======================= #=====================================================
print('Now some more formatting:' print('Now some more formatting:'
, '\nRead aa dict and pull relevant info' , '\nRead aa dict and pull relevant info'
, '\nFormat mutations:' , '\nFormat mutations:'
@ -1040,12 +1054,13 @@ print('Now some more formatting:'
ncol_mutf_add = 3 # mut split into 3 cols ncol_mutf_add = 3 # mut split into 3 cols
ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping
#=========== #======================================================================
# Split 'mutation' column into three: wild_type, position and # Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map three letter code to one using # mutant_type separately. Then map three letter code to one using
# reference_dict imported at the beginning. # reference_dict imported at the beginning.
# After importing, convert to mutation to lowercase for compatibility with dict # After importing, convert to mutation to lowercase for
#=========== # compatibility with dict
#=======================================================================
gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower()
print('wt regex being used:', wt_regex print('wt regex being used:', wt_regex
@ -1054,14 +1069,15 @@ print('wt regex being used:', wt_regex
mylen0 = len(gene_LF1.columns) mylen0 = len(gene_LF1.columns)
#======= #=========================================================
# Iterate through the dict, create a lookup dict i.e # Iterate through the dict, create a lookup dict i.e
# lookup_dict = {three_letter_code: one_letter_code}. # lookup_dict = {three_letter_code: one_letter_code}.
# lookup dict should be the key and the value (you want to create a column for) # lookup dict should be the key and the value (you want to create a column for)
# Then use this to perform the mapping separetly for wild type and mutant cols. # Then use this to perform the mapping separetly for wild type and mutant cols.
# The three letter code is extracted using a string match match from the dataframe and then converted # The three letter code is extracted using a string match match from the
# dataframe and then converted
# to 'pandas series'since map only works in pandas series # to 'pandas series'since map only works in pandas series
#======= #===========================================================
print('Adding', ncol_mutf_add, 'more cols:\n') print('Adding', ncol_mutf_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to 1-letter code # initialise a sub dict that is lookup dict for three letter code to 1-letter code
@ -1076,7 +1092,11 @@ for k, v in my_aa_dict.items():
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mutant_type'] = mut.map(lookup_dict) gene_LF1['mutant_type'] = mut.map(lookup_dict)
# extract position info from mutation column separetly using string match #-------------------
# extract position
# info from mutation
# column separetly using string match
#-------------------
#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') #gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)')
gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex)
@ -1109,11 +1129,11 @@ else:
# clear variables # clear variables
del(k, v, wt, mut, lookup_dict) del(k, v, wt, mut, lookup_dict)
######## ##########################################################################
# combine the wild_type+poistion+mutant_type columns to generate # combine the wild_type+poistion+mutant_type columns to generate
# mutationinformation (matches mCSM output field) # mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation # Remember to use .map(str) for int col types to allow string concatenation
######### ###########################################################################
gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type']
print('Created column: mutationinformation' print('Created column: mutationinformation'
, '\n=====================================================================\n' , '\n=====================================================================\n'
@ -1138,7 +1158,6 @@ bar = gene_LF1['position'].value_counts()
#else: #else:
# print('FAIL: df could not be ordered. Check source') # print('FAIL: df could not be ordered. Check source')
# sys.exit() # sys.exit()
#%% Create a copy of mutationinformation column for downstream mergeing #%% Create a copy of mutationinformation column for downstream mergeing
gene_LF1['Mut'] = gene_LF1['mutationinformation'] gene_LF1['Mut'] = gene_LF1['mutationinformation']
gene_LF1['Mut_copy'] = gene_LF1['mutationinformation'] gene_LF1['Mut_copy'] = gene_LF1['mutationinformation']
@ -1150,7 +1169,7 @@ gene_LF1['index_orig_copy'] = gene_LF1.index
all(gene_LF1.index.values == gene_LF1['index_orig'].values) all(gene_LF1.index.values == gene_LF1['index_orig'].values)
all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values) all(gene_LF1.index.values == gene_LF1['index_orig_copy'].values)
#%% quick sanity check for position freq count #%% Quick sanity check for position freq count
# count the freq of 'other muts' samples # count the freq of 'other muts' samples
test_df = gene_LF1.copy() test_df = gene_LF1.copy()
test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']] test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'position']]
@ -1158,6 +1177,7 @@ test_df = test_df[['id','index_orig', 'mutationinformation', 'mutation', 'positi
#test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count') #test_df['sample_freq'] = test_df.groupby('id')['id'].transform('count')
#print('Revised dim of other_muts_df:',test_df.shape) #print('Revised dim of other_muts_df:',test_df.shape)
test_df['scount'] = test_df['mutation'].map(SnpFDict) test_df['scount'] = test_df['mutation'].map(SnpFDict)
test_df = test_df.sort_values(['position', 'mutationinformation'])
#%% Map mutation frequency count as a column #%% Map mutation frequency count as a column
gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict) gene_LF1['snp_frequency'] = gene_LF1['mutation'].map(SnpFDict)
#%% Map position frequency count as a column #%% Map position frequency count as a column
@ -1166,75 +1186,151 @@ z1 = z.to_dict()
gene_LF1['pos_count'] = gene_LF1['position'].map(z1) gene_LF1['pos_count'] = gene_LF1['position'].map(z1)
#test_df2 = test_df.loc[test_df['position'] == 10] #test_df2 = test_df.loc[test_df['position'] == 10]
###############################################################################
#%% OUTFILE 4, write file mCSM style muts ###############################################################################
snps_only = pd.DataFrame(gene_LF1['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF1['position'].unique())
pos_only
print('Checking NA in snps...')# should be 0
if snps_only.mutationinformation.isna().sum() == 0:
print ('PASS: NO NAs/missing entries for SNPs'
, '\n===============================================================')
else:
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
print('\n----------------------------------'
, '\nWriting file: mCSM style muts'
, '\n----------------------------------'
, '\nFile:', outfile_mcsmsnps
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo. of distinct muts:', len(snps_only)
, '\nNo. of distinct positions:', len(pos_only)
, '\n=============================================================')
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
print('Finished writing:', outfile_mcsmsnps
, '\nNo. of rows:', len(snps_only)
, '\nNo. of cols:', len(snps_only.columns)
, '\n=============================================================')
del(out_filename_mcsmsnps)
#%% OUTFILE 5, write file frequency of position counts: MOVE TO THE END
metadata_pos = pd.DataFrame(gene_LF1['position'])
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
, '\nWriting file: Metadata poscounts'
, '\n----------------------------------'
, '\nFile:', outfile_metadata_poscounts
, '\n============================================================')
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
, '\nNo. of rows:', len(metadata_pos)
, '\nNo. of cols:', len(metadata_pos.columns)
, '\n=============================================================')
del(out_filename_metadata_poscounts)
#%% OUTFILE 6, write file <gene>_metadata: MOVE TO THE END
# ---THINK ---
#%% OUTFILE 7, write file MSA plots: MOVE TO THE END
# -- THINK ---
#%% OUTFILE 8, write file mutational position with count: MOVE TO THE END
# -- THINK ---
#%% Add column: aa property_water #%% Add column: aa property_water
#=========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_water}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_water']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_water'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_water'] = mut.map(lookup_dict)
mylen2 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen2 == mylen1 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_prop_polarity #%% Add column: aa_prop_polarity
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_prop_polarity}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
# initialise a sub dict that is lookup dict for three letter code to aa prop
# adding two more cols
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_prop_polarity']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
#mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
mylen3 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen3 == mylen2 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen1
, '\nnew len:', mylen2)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% Add column: aa_calcprop #%% Add column: aa_calcprop
#========
# iterate through the dict, create a lookup dict that i.e
# lookup_dict = {three_letter_code: aa_calcprop}
# Do this for both wild_type and mutant as above.
#=========
print('Adding', ncol_aa_add, 'more cols:\n')
lookup_dict = dict()
for k, v in my_aa_dict.items():
lookup_dict[k] = v['aa_calcprop']
#print(lookup_dict)
wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze()
gene_LF1['wt_calcprop'] = wt.map(lookup_dict)
mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze()
gene_LF1['mut_calcprop'] = mut.map(lookup_dict)
mylen4 = len(gene_LF1.columns)
# sanity checks
print('checking if 3-letter wt&mut residue extraction worked correctly')
if wt.isna().sum() & mut.isna().sum() == 0:
print('PASS: 3-letter wt&mut residue extraction worked correctly:'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
else:
print('FAIL: 3-letter wt&mut residue extraction failed'
, '\nNo NAs detected:'
, '\nwild-type\n', wt
, '\nmutant-type\n', mut
, '\ndim of df:', gene_LF1.shape)
if mylen4 == mylen3 + ncol_aa_add:
print('PASS: successfully added', ncol_aa_add, 'cols'
, '\nold length:', mylen3
, '\nnew len:', mylen4)
else:
print('FAIL: failed to add cols:'
, '\nold length:', mylen3
, '\nnew len:', mylen4)
# clear variables
del(k, v, wt, mut, lookup_dict)
#%% NEW mappings: gene_LF2 #%% NEW mappings: gene_LF2
# gene_LF2: copy gene_LF1 # gene_LF2: copy gene_LF1
gene_LF2 = gene_LF1.copy() gene_LF2 = gene_LF1.copy()
@ -1267,6 +1363,8 @@ gene_LF2['total_id_ucount'] = total_id_ucount
# gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique()) # gene_LF2['mut_id_ucount'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
# gene_LF2['mut_id_ucount'] # gene_LF2['mut_id_ucount']
gene_LF2['snp_frequency'] = gene_LF2['mutationinformation'].map(gene_LF2.groupby('mutationinformation')['id'].nunique())
# gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount']) # gene_LF1['snp_frequency'].equals(gene_LF2['mut_id_ucount'])
#%% AF for gene #%% AF for gene
@ -1275,7 +1373,7 @@ gene_LF2['total_id_ucount'] = total_id_ucount
#================= #=================
gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount'] gene_LF2['maf'] = gene_LF2['snp_frequency']/gene_LF2['total_id_ucount']
gene_LF2['maf'].head() gene_LF2['maf'].head()
###############################################################################
#%% Further mappings: gene_LF3, with mutationinformation as INDEX #%% Further mappings: gene_LF3, with mutationinformation as INDEX
gene_LF3 = gene_LF2.copy() gene_LF3 = gene_LF2.copy()
@ -1324,6 +1422,7 @@ gene_LF3['drtype_all_vals'].head()
gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list) gene_LF3['drtype_all_names'] = gene_LF3.groupby('mutationinformation').drtype_all_names.apply(list)
gene_LF3['drtype_all_names'].head() gene_LF3['drtype_all_names'].head()
#--------------------------------- #---------------------------------
# Revised drtype: max(Multimode) # Revised drtype: max(Multimode)
#-------------------------------- #--------------------------------
@ -1406,6 +1505,7 @@ gene_LF3['dst_multimode_all']
# Fill using multimode ONLY where NA in dst_multimode column # Fill using multimode ONLY where NA in dst_multimode column
gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3) gene_LF3['dst_multimode'] = gene_LF3['dst_multimode'].fillna(dm_om_multimode_LF3)
gene_LF3['dst_multimode'] gene_LF3['dst_multimode']
gene_LF3['dst_multimode'].value_counts()
#---------------------------------- #----------------------------------
# Revised dst column: max of mode # Revised dst column: max of mode
@ -1422,6 +1522,11 @@ gene_LF3[drug].value_counts()
#gene_LF3['dst_noNA'].value_counts() #gene_LF3['dst_noNA'].value_counts()
gene_LF3['dst_mode'].value_counts() gene_LF3['dst_mode'].value_counts()
if (gene_LF3['dst_mode'].value_counts().sum() == len(gene_LF3)) and (gene_LF3['dst_mode'].value_counts()[1] > gene_LF3[drug].value_counts()[1]) and gene_LF3['dst_mode'].value_counts()[0] > gene_LF3[drug].value_counts()[0]:
print('\nPASS: revised dst mode created successfully and the counts are more than <drug> col count')
else:
print('\nFAIL: revised dst mode numbers MISmatch')
foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']] foo = gene_LF3[['Mut', 'position', 'dst', 'dst_multimode', 'dst_noNA', 'dst_mode']]
foo2 = foo.sort_values(['position', 'Mut']) foo2 = foo.sort_values(['position', 'Mut'])
@ -1460,6 +1565,7 @@ gene_LF3['mutation_info_labels_v1'].value_counts() == gene_LF3['mutation_info_la
# Now overwrite # Now overwrite
gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) gene_LF3['mutation_info_labels_dst'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'}) gene_LF3['mutation_info_labels'] = gene_LF3['dst_mode'].map({1: 'DM', 0: 'OM'})
if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()): if all(gene_LF3['mutation_info_labels_dst'].value_counts() == gene_LF3['mutation_info_labels'].value_counts()):
print('\nPASS: Revised mutation_info column created successfully') print('\nPASS: Revised mutation_info column created successfully')
gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1) gene_LF3 = gene_LF3.drop(['mutation_info_labels_dst'], axis = 1)
@ -1480,6 +1586,7 @@ gene_LF3['mutation_info_labels_orig'].value_counts()
# %% sanity check for the revised dst # %% sanity check for the revised dst
gene_LF3[drug].value_counts() gene_LF3[drug].value_counts()
gene_LF3[drug].value_counts().sum() gene_LF3[drug].value_counts().sum()
gene_LF3['mutation_info_labels_orig'].value_counts() gene_LF3['mutation_info_labels_orig'].value_counts()
gene_LF3['dst_mode'].value_counts() gene_LF3['dst_mode'].value_counts()
@ -1488,6 +1595,8 @@ gene_LF3['dst_mode'].value_counts().sum()
# direct comparision # direct comparision
gene_LF3['dst'].value_counts() gene_LF3['dst'].value_counts()
gene_LF3['mutation_info_labels'].value_counts() gene_LF3['mutation_info_labels'].value_counts()
gene_LF3['mutation_info_labels_v1'].value_counts()
#%% Lineage #%% Lineage
gene_LF3['lineage'].value_counts() gene_LF3['lineage'].value_counts()
# lineage_label_numeric = {'lineage1' : 1 # lineage_label_numeric = {'lineage1' : 1
@ -1509,24 +1618,52 @@ lineage_label_numeric = {'L1' : 1
, 'LBOV' : 8} , 'LBOV' : 8}
lineage_label_numeric lineage_label_numeric
#%% Lineage cols selected: gene_LF3_ColsSel
# Work on a copy of gene_LF3 restricted to the columns needed for the
# lineage tidy-split, so the split cannot disturb the main df.
gene_LF3_ColsSel = gene_LF3.copy()
gene_LF3_ColsSel.index
gene_LF3_ColsSel.columns

# FIXME(review): the creation of 'index_orig_copy' was commented out
# (#gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig'])
# but the column is still selected below — recreate it if absent so the
# selection cannot raise KeyError when run from the command line.
if 'index_orig_copy' not in gene_LF3_ColsSel.columns:
    gene_LF3_ColsSel['index_orig_copy'] = gene_LF3_ColsSel['index_orig']

# copy column to allow cross checks after stripping white space
gene_LF3_ColsSel['lineage'] = gene_LF3_ColsSel['lineage'].str.strip()
gene_LF3_ColsSel['lineage_corrupt'] = gene_LF3_ColsSel['lineage']
all(gene_LF3_ColsSel['lineage_corrupt'].value_counts() == gene_LF3_ColsSel['lineage'].value_counts())
gene_LF3_ColsSel['lineage_corrupt'].value_counts()

# NOTE(review): 'mutationinformation' added to the selection — the later
# groupby('mutationinformation') on the tidy-split df needs it; without it
# that groupby raises KeyError.
gene_LF3_ColsSel = gene_LF3_ColsSel[['index_orig_copy', 'Mut', 'mutationinformation'
                                     , 'position', 'snp_frequency', 'lineage', 'lineage_corrupt']]
gene_LF3_ColsSel.columns
#%% tidy_split(): Lineage
# Create df with tidy_split: lineage — one row per semicolon-separated
# lineage value (tidy_split is a helper defined earlier in this file)
lf_lin_split = tidy_split(gene_LF3_ColsSel, 'lineage_corrupt', sep = ';')
# strip whitespace left around the separator after the split
lf_lin_split['lineage_corrupt'] = lf_lin_split['lineage_corrupt'].str.strip()
lf_lin_split['lineage_corrupt'].value_counts()
#%% Important sanity check for tidy_split(): lineage
# Expected row count after splitting on ';': for every row,
# (no. of semicolons + 1) rows are produced; NaN lineages produce one row each.
search = ";"

# count of occurrence of ";" in lineage
# FIX(review): select with .copy() — the original plain slice is a view, so the
# column assignment below triggers SettingWithCopyWarning and may silently
# fail to write, depending on the pandas version.
count_df_lin = gene_LF3_ColsSel[['lineage']].copy()
count_df_lin['lineage_semicolon_count'] = gene_LF3_ColsSel.loc[:, 'lineage'].str.count(search, re.I)

# value_counts().reset_index() yields: 'index' = semicolon count,
# 'lineage_semicolon_count' = how many rows have that count
lin_sc_C = count_df_lin['lineage_semicolon_count'].value_counts().reset_index()
lin_sc_C
# rows produced per group = (semicolons + 1) * occurrences
lin_sc_C['index_semicolon'] = (lin_sc_C['index'] + 1) * lin_sc_C['lineage_semicolon_count']
lin_sc_C
expected_linC = lin_sc_C['index_semicolon'].sum() + gene_LF3_ColsSel['lineage'].isnull().sum()
expected_linC

if (len(lf_lin_split) == expected_linC):
    print('\nPASS: tidy_split() length match for lineage')
else:
    sys.exit('\nFAIL: tidy_split() length MISMATCH. Check lineage semicolon count')
#%% Map lineage labels to numbers to allow metrics
lf_lin_split['lineage_numeric'] = lf_lin_split['lineage_corrupt'].map(lineage_label_numeric)
lf_lin_split['lineage_numeric'].value_counts()

#--------------------------------
# Add lineage_list: ALL values:
#--------------------------------
# Add all lineages for each mutation
lf_lin_split['lineage_corrupt_list'] = lf_lin_split['lineage_corrupt']
# NOTE(review): groupby key is 'mutationinformation' — this assumes the column
# was carried through gene_LF3_ColsSel into the tidy-split df; if only 'Mut'
# was selected there, this line raises KeyError. TODO confirm.
lf_lin_split['lineage_corrupt_list'] = lf_lin_split.groupby('mutationinformation').lineage_corrupt_list.apply(list)
lf_lin_split['lineage_corrupt_list'].value_counts()

#--------------------------------
# Add lineage count: ALL
#--------------------------------
lf_lin_split['lineage_corrupt_count'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : len(list(x)))

#--------------------------------
# Add lineage unique count
#--------------------------------
# per-mutation count of distinct lineages, broadcast back via map on 'Mut'
lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['Mut'].map(lf_lin_split.groupby('Mut')['lineage_corrupt'].nunique())
#lf_lin_split['lineage_corrupt_ucount'] = lf_lin_split['lineage_corrupt']
lf_lin_split['lineage_corrupt_ucount'].value_counts()

#--------------------------------
# Add lineage_set
#--------------------------------
# set of lineages per row-list, and its list form (unique lineages)
lf_lin_split['lineage_set'] = lf_lin_split['lineage_corrupt_list'].apply(lambda x : set(list(x)))
lf_lin_split['lineage_ulist'] = lf_lin_split['lineage_set'].apply(lambda x : list(x))
@ -1551,95 +1697,275 @@ lf_lin_split['lineage_multimode'] = lf_lin_split.groupby('mutationinformation')[
# inspect the per-mutation multimode of the numeric lineage labels
# (built on the preceding line, truncated in this view)
lf_lin_split['lineage_multimode'].value_counts()
# cant take max as it doesn't mean anyting!
foo = lf_lin_split[['Mut', 'lineage_ulist']]
###############################################################################
#%% Select only the columns you want to merge from lf_lin_split
lf_lin_split.columns
# FIX(review): .copy() so the inplace rename below acts on a real frame,
# not a view (avoids SettingWithCopyWarning / silent no-op).
lf_lin_split_ColSel = lf_lin_split[['lineage_corrupt_list', 'lineage_corrupt_count'
                                    , 'lineage_corrupt_ucount', 'lineage_ulist', 'lineage_multimode']].copy()
lf_lin_split_ColSel.columns

lf_lin_split_ColSel.rename(columns = {'lineage_corrupt_list'   : 'lineage_list_all'
                                      , 'lineage_corrupt_count'  : 'lineage_count_all'
                                      , 'lineage_corrupt_ucount' : 'lineage_count_unique'
                                      , 'lineage_ulist'          : 'lineage_list_unique'
                                      , 'lineage_multimode'      : 'lineage_multimode'}, inplace = True)
lf_lin_split_ColSel.columns
# no. of lineage summary cols that the final merge should add
ncols_to_merge = len(lf_lin_split_ColSel.columns)

#%% Final merge: gene_LF3 with lf_lin_split_ColSel: gene_LF4
#===============================
# merge: inner join by default
# join: left join default
# concat: outer join by default
#df1.join(df2)
#===============================
len(lf_lin_split)
len(gene_LF3)

# Dropping duplicates from lineage df: the tidy-split df has one row per
# lineage occurrence, but the summary columns are identical within a
# mutation, so keep the first row per index entry.
lf_lin_split_dups = lf_lin_split_ColSel[lf_lin_split_ColSel.index.duplicated()]
lf_lin_split_U = lf_lin_split_ColSel[~lf_lin_split_ColSel.index.duplicated()]

# FIX(review): the original compared against len(snps_only), but snps_only is
# only created further down (from gene_LF4), so running the script
# top-to-bottom from the command line raised NameError. Compare against the
# number of unique mutations in gene_LF3 instead, which is what snps_only holds.
if len(lf_lin_split_U) == gene_LF3['mutationinformation'].nunique():
    print('\nPASS: Duplicate entries removed from lf_lin'
          , '\nReady to start the final merge to generate gene_LF4')
else:
    print('\nFAIL: DF after duplicate removal numbers MISmatch ')
    gene_LF3.index
    lf_lin_split_U.index

#--------------------
# Creating gene_LF4
#--------------------
if all(gene_LF3.index.isin(lf_lin_split_U.index)):
    print('\nIndices match, staring final merge...')
    # index-on-index left join adds the lineage summary cols to gene_LF3
    gene_LF4 = gene_LF3.join(lf_lin_split_U)
    if len(gene_LF4.columns) == len(gene_LF3.columns) + ncols_to_merge:
        print('\nPASS: gene_LF4 created after successfully merging with lineage info'
              , '\nShape of gene_LF4:', gene_LF4.shape)
    else:
        sys.exit('\nFAIL: gene_LF4 could not be created. Dfs could not be merged. Check indices or dfs length again!')

#----------------------------------------
# Dropping redundant cols from gene_LF4
#----------------------------------------
# only drop once the index really is the mutation id and the copied index
# column matches the original
if all(gene_LF4.index == gene_LF4['Mut']) and gene_LF4['index_orig'].equals(gene_LF4['index_orig_copy']):
    gene_LF4.reset_index(inplace=True)
    #gene_LF4 = gene_LF4.set_index(['index_orig'])
    print('\nPass Mutationinformationa and index columns checked, the duplicates can now be dropped')
    gene_LF4 = gene_LF4.drop(['Mut_copy', 'index_orig_copy', 'mutation_info_v1' ], axis = 1)

#%% Final output with relevant columns
print('\nFinal output file: Dim of gene_LF4:', gene_LF4.shape)
# cols_to_output = ['mutationinformation', 'id', 'sample', 'lineage', 'sublineage',
#                   'country_code', 'drtype', 'pyrazinamide', 'mutation', 'drug_name',
#                   'mutation_info', 'mutation_info_orig', 'wild_type', 'mutant_type',
#                   'position', 'Mut', 'index_orig', 'pos_count', 'total_id_ucount',
#                   'snp_frequency', 'maf', 'drtype_numeric', 'drtype_all_vals',
#                   'drtype_all_names', 'drtype_multimode', 'drtype_mode', 'drtype_max',
#                   'mutation_info_labels', 'dm_om_numeric', 'dm_om_numeric_orig', 'dst',
#                   'dst_multimode', 'dst_multimode_all', 'dst_mode', 'lineage_list_all',
#                   'lineage_count_all', 'lineage_count_unique', 'lineage_list_unique',
#                   'lineage_multimode']
#%% OUTFILE 3, write file mCSM style muts
# one row per UNIQUE mutation, format {WT}<POS>{MUT}, no header/index
snps_only = pd.DataFrame(gene_LF4['mutationinformation'].unique())
snps_only.head()
# assign column name
snps_only.columns = ['mutationinformation']

# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF4['position'].unique())
pos_only

print('Checking NA in snps...')# should be 0
if snps_only.mutationinformation.isna().sum() == 0:
    print ('PASS: NO NAs/missing entries for SNPs'
           , '\n===============================================================')
else:
    sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')

# write file: mCSM muts
out_filename_mcsmsnps = gene.lower() + '_mcsm_formatted_snps.csv'
outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
print('\n----------------------------------'
      , '\nWriting file: mCSM style muts'
      , '\n----------------------------------'
      , '\nFile:', outfile_mcsmsnps
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of distinct muts:', len(snps_only)
      , '\nNo. of distinct positions:', len(pos_only)
      , '\n=============================================================')
snps_only.to_csv(outfile_mcsmsnps, header = False, index = False)
print('Finished writing:', outfile_mcsmsnps
      , '\nNo. of rows:', len(snps_only)
      , '\nNo. of cols:', len(snps_only.columns)
      , '\n=============================================================')
del(out_filename_mcsmsnps)
###############################################################################
###############################################################################
#%% OUTFILE 4, write file frequency of position count
metadata_pos = pd.DataFrame(gene_LF4['position'])
metadata_pos['meta_pos_count'] = metadata_pos['position'].map(z1)
metadata_pos['meta_pos_count'].value_counts()
metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = True)
out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
print('\n----------------------------------'
, '\nWriting file: Metadata poscounts'
, '\n----------------------------------'
, '\nFile:', outfile_metadata_poscounts
, '\n============================================================')
metadata_pos.to_csv(outfile_metadata_poscounts, header = True, index = False)
print('Finished writing:', outfile_metadata_poscounts
, '\nNo. of rows:', len(metadata_pos)
, '\nNo. of cols:', len(metadata_pos.columns)
, '\n=============================================================')
del(out_filename_metadata_poscounts)
###############################################################################
#%% OUTFILE 5, write file <gene>_metadata
# where each row has UNIQUE mutations NOT unique sample ids
out_filename_metadata = gene.lower() + '_metadata.csv'
outfile_metadata = outdir + '/' + out_filename_metadata
print('\n----------------------------------'
, '\nWriting file: LF formatted data'
, '\n----------------------------------'
, '\nFile:', outfile_metadata
, '\n============================================================')
gene_LF4.to_csv(outfile_metadata, header = True, index = False)
print('Finished writing:', outfile_metadata
, '\nNo. of rows:', len(gene_LF4)
, '\nNo. of cols:', len(gene_LF4.columns)
, '\n=============================================================')
del(out_filename_metadata)
###############################################################################
#%% OUTFILE 6, write file MSA plots
#write file: mCSM style but with repitions for MSA and logo plots
all_muts_msa = pd.DataFrame(gene_LF4['mutationinformation'])
all_muts_msa.head()
# assign column name
all_muts_msa.columns = ['mutationinformation']
# make sure it is string
all_muts_msa.columns.dtype
# sort the column
all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')
# create an extra column with protein name
#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
#all_muts_msa_sorted.head()
# rearrange columns so the fasta name is the first column (required for mutate.script)
#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
all_muts_msa_sorted.head()
print('Checking NA in snps...')# should be 0
if all_muts_msa.mutationinformation.isna().sum() == 0:
print ('PASS: NO NAs/missing entries for SNPs'
, '\n===============================================================')
else:
sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
out_filename_msa = gene.lower() +'_all_muts_msa.csv'
outfile_msa = outdir + '/' + out_filename_msa
print('\n----------------------------------'
, '\nWriting file: mCSM style muts for msa'
, '\n----------------------------------'
, '\nFile:', outfile_msa
, '\nmutation format (SNP): {WT}<POS>{MUT}'
, '\nNo.of lines of msa:', len(all_muts_msa))
all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False)
print('Finished writing:', outfile_msa
, '\nNo. of rows:', len(all_muts_msa)
, '\nNo. of cols:', len(all_muts_msa.columns)
, '\n=============================================================')
del(out_filename_msa)
###############################################################################
#%% OUTFILE 7, write file mutational position with count
# write file for mutational positions
# count how many positions this corresponds to
pos_only = pd.DataFrame(gene_LF4['position'].unique())
# assign column name
pos_only.columns = ['position']
# make sure dtype of column position is int or numeric and not string
pos_only.position.dtype
pos_only['position'] = pos_only['position'].astype(int)
pos_only.position.dtype
# sort by position value
pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True)
out_filename_pos = gene.lower() + '_mutational_positons.csv'
outfile_pos = outdir + '/' + out_filename_pos
print('\n----------------------------------'
, '\nWriting file: mutational positions'
, '\n----------------------------------'
, '\nFile:', outfile_pos
, '\nNo. of distinct positions:', len(pos_only_sorted)
, '\n=============================================================')
pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
print('Finished writing:', outfile_pos
, '\nNo. of rows:', len(pos_only_sorted)
, '\nNo. of cols:', len(pos_only_sorted.columns)
, '\n============================================================='
, '\n\n\n')
del(out_filename_pos)
###############################################################################
#%% Quick summary output
print('============================================'
, '\nQuick summary output for', drug, 'and' , gene.lower()
, '\n============================================'
, '\nTotal samples:', total_samples
, '\nNo. of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts()
, '\n'
, '\nPercentage of Sus and Res [original]', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100
, '\n'
, '\nNo. of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts()
, '\n'
, '\nPercentage of Sus and Res [Revised]', drug, 'samples:\n', gene_LF4['dst_mode'].value_counts(normalize = True)*100
, '\n'
, '\nTotal no. of unique snps:', len(snps_only)
, '\nTotal no. of unique snps:', dr_muts_rev.nunique()+other_muts_rev.nunique()
, '\nTotal no.of unique dr muts:', dr_muts_rev.nunique() # ADD
, '\nTotal no.of unique other muts:', other_muts_rev.nunique()#ADD
, '\nTotal no.of unique missense muts:', gene_LF4['mutationinformation'].nunique()
, '\nTotal no.of unique positions associated with missense muts:', gene_LF4['position'].nunique()
, '\nTotal no. of samples with missense muts:', len(gene_LF4)
, '\nTotal no. of unique samples with missense muts:', gene_LF4['id'].nunique()
, '\n')
if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
print('\nTotal no.of samples with ambiguous muts:', len(inspect)
#, '\nTotal no.of unique ambiguous muts:', len(common_muts)
, '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
, '\n============================================================='
, '\nPost resolving ambiguity\n'
, ambig_muts_rev_df['mutation_info_REV'].value_counts())
#=======================================================================
print(u'\u2698' * 50,
'\nEnd of script: Data extraction and writing files'
'\n' + u'\u2698' * 50 )
#%% end of script