added mmcsm_lig and provean dfs merges in combining_df.py
This commit is contained in:
parent
d8041fb494
commit
a2bcc3a732
2 changed files with 220 additions and 11 deletions
|
@ -23,6 +23,9 @@ Created on Tue Aug 6 12:56:03 2019
|
||||||
#12) <gene>.lower()'_mcsm_ppi2.csv'
|
#12) <gene>.lower()'_mcsm_ppi2.csv'
|
||||||
#13) <gene>.lower()'_consurf.csv'
|
#13) <gene>.lower()'_consurf.csv'
|
||||||
#14) <gene>.lower()'_snap2.csv'
|
#14) <gene>.lower()'_snap2.csv'
|
||||||
|
#15) <gene>.lower()'_provean.csv'
|
||||||
|
#16) <gene>.lower()'_mmcsm_lig_results.csv'
|
||||||
|
#17) <gene>.lower()'_edXXX'!!!! TODO
|
||||||
|
|
||||||
# combining order
|
# combining order
|
||||||
|
|
||||||
|
@ -43,6 +46,7 @@ from pandas import DataFrame
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import argparse
|
import argparse
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
#=======================================================================
|
#=======================================================================
|
||||||
#%% specify input and curr dir
|
#%% specify input and curr dir
|
||||||
homedir = os.path.expanduser('~')
|
homedir = os.path.expanduser('~')
|
||||||
|
@ -168,8 +172,7 @@ infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
|
||||||
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
|
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
|
||||||
|
|
||||||
# more output added
|
# more output added
|
||||||
## consurf [change colnames]
|
## consurf [change colnames]? [add outcome category]
|
||||||
|
|
||||||
infilename_consurf = gene.lower() + '_consurf_grades_f.csv'
|
infilename_consurf = gene.lower() + '_consurf_grades_f.csv'
|
||||||
infile_consurf = outdir + 'consurf/'+ infilename_consurf
|
infile_consurf = outdir + 'consurf/'+ infilename_consurf
|
||||||
consurf_df = pd.read_csv(infile_consurf, sep = ',')
|
consurf_df = pd.read_csv(infile_consurf, sep = ',')
|
||||||
|
@ -179,6 +182,16 @@ infilename_snap2 = gene.lower() + '_snap2_output.csv'
|
||||||
infile_snap2 = outdir + 'snap2/'+ infilename_snap2
|
infile_snap2 = outdir + 'snap2/'+ infilename_snap2
|
||||||
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
||||||
|
|
||||||
|
## PROVEAN
|
||||||
|
infilename_provean = gene.lower() + '_provean.csv'
|
||||||
|
infile_provean = outdir + 'provean/'+ infilename_provean
|
||||||
|
provean_df = pd.read_csv(infile_provean, sep = ',',header = None )
|
||||||
|
|
||||||
|
# mmCSM-lig
|
||||||
|
infilename_mmcsm = gene.lower() + '_mmcsm_results.csv'
|
||||||
|
infile_mmcsm = outdir + 'mmcsm_lig/single_muts/'+ infilename_mmcsm
|
||||||
|
mmcsm_lig_raw = pd.read_csv(infile_mmcsm, sep = ',')
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
|
# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
|
||||||
geneL_na = ['gid', 'rpob']
|
geneL_na = ['gid', 'rpob']
|
||||||
|
@ -738,6 +751,116 @@ else:
|
||||||
, 'snap2_accuracy_pc'
|
, 'snap2_accuracy_pc'
|
||||||
, 'snap2_outcome']]
|
, 'snap2_outcome']]
|
||||||
|
|
||||||
|
#=======================
|
||||||
|
# Provean
|
||||||
|
#=======================
|
||||||
|
provean_df.head()
|
||||||
|
provean_df.columns = ['mutationinformation', 'provean_score', 'provean_outcome']
|
||||||
|
provean_df.head()
|
||||||
|
provean_df['provean_outcome'].value_counts()
|
||||||
|
|
||||||
|
#----------------------------------------
|
||||||
|
# Rescale values in provean_score
|
||||||
|
# col b/w -1 and 1 so negative numbers
|
||||||
|
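# stay neg and pos numbers stay positive
# NOTE: the MinMaxScaler() call used further down is created with its
# default settings, which map the scores onto [0, 1]; the sign of the raw
# score is therefore NOT preserved in provean_scaled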
|
||||||
|
# cut off =-2.5
|
||||||
|
# so provean scores > (-2.5) are neutral
|
||||||
|
# and provean scores <= (-2.5) are deleterious
|
||||||
|
#-----------------------------------------
|
||||||
|
provean_min = provean_df['provean_score'].min()
|
||||||
|
provean_max = provean_df['provean_score'].max()
|
||||||
|
print('\nprovean_score (MIN):', provean_min
|
||||||
|
, '\nprovean_score (MAX):', provean_max)
|
||||||
|
|
||||||
|
# quick check
|
||||||
|
# Sanity check of the cut off against the outcome labels supplied in the
# provean file: every score strictly above the cut off must be labelled
# 'Neutral'. Everything else (score <= cut off) is reported as Deleterious,
# so a score sitting exactly on the cut off is no longer left uncounted.
provean_cut_off = -2.5

provean_outcome_counts = provean_df['provean_outcome'].value_counts()
n_provean_neutral = (provean_df['provean_score'] > provean_cut_off).sum()
n_provean_deleterious = (provean_df['provean_score'] <= provean_cut_off).sum()

# .get() instead of ['Neutral']: a file without a single 'Neutral' call
# should reach the FAIL branch below rather than die with a KeyError
if n_provean_neutral == provean_outcome_counts.get('Neutral', 0):
    print('\nPASS: Provean cut off is indeed:', provean_cut_off
          , '\nNo. of values above', provean_cut_off, 'i.e classed as Neutral:'
          , n_provean_neutral
          , '\nProvean outcome:'
          , '\nNeutral:', n_provean_neutral
          , '\nDeleterious:', n_provean_deleterious
          )
else:
    sys.exit('\nFAIL: Numbers mismatch. Please check provean cut off and condition used')
|
||||||
|
|
||||||
|
# RECHECK logic!: CANNOT use this as it changes the data distribution, as seen from the hist plot
|
||||||
|
# provean_scale = lambda x : x/abs(provean_min) if x < 0 else (x/provean_min if x >= 0 else 'failed')
|
||||||
|
# provean_df['provean_scaled1'] = provean_df.loc[:,'provean_score'].apply(provean_scale)
|
||||||
|
# print('\nRaw provean scores:\n', provean_df['provean_score']
|
||||||
|
# , '\n---------------------------------------------------------------'
|
||||||
|
# , '\nScaled provean scores:\n', provean_df['provean_scaled'])
|
||||||
|
|
||||||
|
# print('\nprovean raw (Max):' , provean_df['provean_score'].max()
|
||||||
|
# , '\nprovean scaled (Max):' , provean_df['provean_scaled1'].max())
|
||||||
|
# print('\nprovean raw (Min):' , provean_df['provean_score'].min()
|
||||||
|
# , '\nprovean scaled (Min):' , provean_df['provean_scaled1'].min())
|
||||||
|
|
||||||
|
scaler = MinMaxScaler()
|
||||||
|
provean_df['provean_scaled'] = scaler.fit_transform(provean_df['provean_score'].values.reshape(-1,1))
|
||||||
|
|
||||||
|
provean_df['provean_score'].hist(bins = 30)
|
||||||
|
#provean_df['provean_scaled1'].hist(bins = 10)
|
||||||
|
provean_df['provean_scaled'].hist(bins = 30)
|
||||||
|
|
||||||
|
#=======================
|
||||||
|
# mmCSM-lig
|
||||||
|
#=======================
|
||||||
|
mmcsm_lig_raw.columns
|
||||||
|
|
||||||
|
# extract specific columns: might be simpler
|
||||||
|
# extract specific columns: might be simpler
# .copy() makes mmcsm_lig_df an independent frame, so the column
# assignments made on it later do not operate on a slice of mmcsm_lig_raw
# (chained assignment / SettingWithCopyWarning)
mmcsm_lig_df = mmcsm_lig_raw[['MUTATION', 'CHAIN', 'DDG']].copy()
mmcsm_lig_df['CHAIN'].value_counts()

# Drop the chain column
mmcsm_lig_df = mmcsm_lig_df.drop(columns = ['CHAIN'])

# Rename columns using lower case and consistently to allow merge later on
mmcsm_lig_df = mmcsm_lig_df.rename(columns = {'MUTATION': 'mutationinformation'
                                              , 'DDG': 'mmcsm_lig'})
|
||||||
|
|
||||||
|
#----------------------------------------
|
||||||
|
# Rescale values in mmcsm_lig_affinity
|
||||||
|
# col b/w -1 and 1 so negative numbers
|
||||||
|
# stay neg and pos numbers stay positive
|
||||||
|
#-----------------------------------------
|
||||||
|
mmcsm_lig_min = mmcsm_lig_df['mmcsm_lig'].min()
|
||||||
|
mmcsm_lig_max = mmcsm_lig_df['mmcsm_lig'].max()
|
||||||
|
print('\nmmcsm_lig (MIN):', mmcsm_lig_min
|
||||||
|
, '\nmmcsm_lig (MAX):', mmcsm_lig_max)
|
||||||
|
|
||||||
|
# quick check
|
||||||
|
print('\nNo. of Stabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] >= 0]))
|
||||||
|
print('\nNo. of Destabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] < 0]))
|
||||||
|
|
||||||
|
mmcsm_ligscale = lambda x : x/abs(mmcsm_lig_min) if x < 0 else (x/mmcsm_lig_max if x >= 0 else 'failed')
|
||||||
|
|
||||||
|
mmcsm_lig_df['mmcsm_lig_scaled'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(mmcsm_ligscale)
|
||||||
|
print('\nRaw mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig']
|
||||||
|
, '\n---------------------------------------------------------------'
|
||||||
|
, '\nScaled mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig_scaled'])
|
||||||
|
|
||||||
|
print('\nmmCSM lig raw (Max):', mmcsm_lig_df['mmcsm_lig'].max()
|
||||||
|
, '\nmmCSM lig scaled (Max):', mmcsm_lig_df['mmcsm_lig_scaled'].max())
|
||||||
|
|
||||||
|
print('\nmmCSM lig raw (Min):', mmcsm_lig_df['mmcsm_lig'].min()
|
||||||
|
, '\nmmCSM lig scaled (Min):', mmcsm_lig_df['mmcsm_lig_scaled'].min())
|
||||||
|
|
||||||
|
mmcsm_lig_df['mmcsm_lig_scaled'].hist(bins = 30)
|
||||||
|
mmcsm_lig_df['mmcsm_lig'].hist(bins = 30)
|
||||||
|
|
||||||
|
#-----------------------------
|
||||||
|
# mmCSM lig outcome category:
|
||||||
|
# -ve: Destabilising
|
||||||
|
# +ve: Stabilising
|
||||||
|
#----------------------------
|
||||||
|
mmcsm_lig_df['mmcsm_lig_outcome'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(lambda x: 'Destabilising' if x < 0 else 'Stabilising')
|
||||||
|
mmcsm_lig_df[mmcsm_lig_df['mmcsm_lig']<0].count()
|
||||||
|
|
||||||
|
del(mmcsm_lig_raw)
|
||||||
|
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
# Now merges begin
|
# Now merges begin
|
||||||
print('==================================='
|
print('==================================='
|
||||||
|
@ -1201,9 +1324,47 @@ redundant_colsL = ['mutationinformation_snap2'
|
||||||
combined_all_params_f = combined_all_params.drop(redundant_colsL
|
combined_all_params_f = combined_all_params.drop(redundant_colsL
|
||||||
, axis = 1
|
, axis = 1
|
||||||
, errors = 'ignore')
|
, errors = 'ignore')
|
||||||
|
#---------------------------------------
|
||||||
|
# MERGE 7 [UQ]: provean and mmCSM-lig dfs
|
||||||
|
#---------------------------------------
|
||||||
|
# The final merges are only attempted when all three frames have the same
# number of rows; otherwise stop and report the lengths that were compared
if len(combined_all_params_f) == len(provean_df) == len(mmcsm_lig_df):
    print('\nPASS: length of Provean and mmCSM-lig df length match with combined_df.'
          , '\nProceeding with FINAL merging before writing file...')
else:
    sys.exit('\nFAIL: Cannot do final merge! Check lengths of provean, mmcsm_lig and combined_all_params_f dfs'
             + '\nLength of combined_all_params_f: ' + str(len(combined_all_params_f))
             + '\nLength of provean_df: ' + str(len(provean_df))
             + '\nLength of mmcsm_lig_df: ' + str(len(mmcsm_lig_df)))
|
||||||
|
|
||||||
|
merging_cols_m7 = detect_common_cols(provean_df, mmcsm_lig_df)
|
||||||
|
print('\nMering provean and mmcsm-lig dfs on:', merging_cols_m7)
|
||||||
|
|
||||||
|
pr_mm_df = pd.merge(provean_df
|
||||||
|
, mmcsm_lig_df
|
||||||
|
, on = merging_cols_m7)
|
||||||
|
|
||||||
|
#-----------------------------------------------
|
||||||
|
# MERGE 8 [UQ]: combined_all_params_f + pr_mm_df
|
||||||
|
#-----------------------------------------------
|
||||||
|
merging_cols_m8 = detect_common_cols(combined_all_params_f, pr_mm_df)
|
||||||
|
print('\nMering all combined_dfs + (pr_mm_df) on:', merging_cols_m8)
|
||||||
|
|
||||||
|
combined_all_params_f2 = pd.merge(combined_all_params_f
|
||||||
|
, pr_mm_df
|
||||||
|
, on = merging_cols_m8)
|
||||||
|
|
||||||
|
# Shape check for MERGE 8.
# ncols: columns of both inputs, counting the merge keys only once.
# nrows: taken from the INPUT frame; comparing the merged frame with its
# own length can never fail, so it would hide rows lost or duplicated by
# the merge.
expected_ncols = len(combined_all_params_f.columns) + len(pr_mm_df.columns) - len(merging_cols_m8)
expected_nrows = len(combined_all_params_f)

if len(combined_all_params_f2.columns) == expected_ncols and len(combined_all_params_f2) == expected_nrows:
    print('\nPASS: All dfs combined including PROVEAN and mmCSM-lig')
else:
    print('\nFAIL:lengths mismatch'
          , '\nExpected ncols:', expected_ncols
          , '\nGot:', len(combined_all_params_f2.columns)
          , '\nExpected nrows:', expected_nrows
          , '\nGot:', len(combined_all_params_f2) )
|
||||||
|
#---------------------------------------
|
||||||
# Add pdb_file name at the end
|
# Add pdb_file name at the end
|
||||||
combined_all_params_f['pdb_file'] = gene_pdb_f
|
#---------------------------------------
|
||||||
|
combined_all_params_f2['pdb_file'] = gene_pdb_f
|
||||||
|
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
#---------------------
|
#---------------------
|
||||||
|
@ -1213,9 +1374,9 @@ print('\nWriting file: all params')
|
||||||
print('\nOutput 3 filename:', outfile_comb
|
print('\nOutput 3 filename:', outfile_comb
|
||||||
, '\n===================================================================\n')
|
, '\n===================================================================\n')
|
||||||
|
|
||||||
combined_all_params_f.to_csv(outfile_comb, index = False)
|
combined_all_params_f2.to_csv(outfile_comb, index = False)
|
||||||
|
|
||||||
print('\nFinished writing file:'
|
print('\nFinished writing file:'
|
||||||
, '\nNo. of rows:', combined_all_params_f.shape[0]
|
, '\nNo. of rows:', combined_all_params_f2.shape[0]
|
||||||
, '\nNo. of cols:', combined_all_params_f.shape[1])
|
, '\nNo. of cols:', combined_all_params_f2.shape[1])
|
||||||
#%% end of script
|
#%% end of script
|
||||||
|
|
|
@ -1,13 +1,17 @@
|
||||||
# count numbers for ML
|
# count numbers for ML
|
||||||
|
|
||||||
#source("~/git/LSHTM_analysis/config/alr.R")
|
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
source("~/git/LSHTM_analysis/config/embb.R")
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
|
||||||
#source("~/git/LSHTM_analysis/config/gid.R")
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
#source("~/git/LSHTM_analysis/config/pnca.R")
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
|
||||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
|
||||||
|
gene
|
||||||
|
gene_match
|
||||||
|
|
||||||
nrow(merged_df3)
|
nrow(merged_df3)
|
||||||
##############################################
|
##############################################
|
||||||
#=============
|
#=============
|
||||||
|
@ -15,7 +19,7 @@ nrow(merged_df3)
|
||||||
#==============
|
#==============
|
||||||
table(merged_df3$mutation_info)
|
table(merged_df3$mutation_info)
|
||||||
sum(table(merged_df3$mutation_info))
|
sum(table(merged_df3$mutation_info))
|
||||||
sum(table(merged_df3$mutation_info))
|
table(merged_df3$mutation_info_orig)
|
||||||
##############################################
|
##############################################
|
||||||
|
|
||||||
#=============
|
#=============
|
||||||
|
@ -64,3 +68,47 @@ sum(table(merged_df3$drtype_mode_labels))
|
||||||
table(merged_df3$lineage)
|
table(merged_df3$lineage)
|
||||||
sum(table(merged_df3$lineage_labels))
|
sum(table(merged_df3$lineage_labels))
|
||||||
|
|
||||||
|
# write file
|
||||||
|
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||||
|
outfile_merged_df3
|
||||||
|
write.csv(merged_df3, outfile_merged_df3)
|
||||||
|
|
||||||
|
outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
||||||
|
outfile_merged_df2
|
||||||
|
write.csv(merged_df2, outfile_merged_df2)
|
||||||
|
|
||||||
|
###################################################
|
||||||
|
###################################################
|
||||||
|
###################################################
|
||||||
|
|
||||||
|
source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
|
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
|
||||||
|
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
|
||||||
|
df3 = read.csv(df3_filename)
|
||||||
|
|
||||||
|
# mutationinformation
|
||||||
|
length(unique((df3$mutationinformation)))
|
||||||
|
|
||||||
|
#dm _om
|
||||||
|
table(df3$mutation_info)
|
||||||
|
table(df3$mutation_info_labels)
|
||||||
|
table(df3$mutation_info_orig)
|
||||||
|
table(df3$mutation_info_labels_orig)
|
||||||
|
|
||||||
|
# test_set
|
||||||
|
na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
|
||||||
|
na_count[drug]
|
||||||
|
|
||||||
|
# training set
|
||||||
|
table(df3[drug])
|
||||||
|
|
||||||
|
# drtype: MDR and XDR
|
||||||
|
#table(df3$drtype) orig i.e. incorrect ones!
|
||||||
|
table(df3$drtype_mode_labels)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue