added mmcsm_lig and provean dfs merges in comnining_df.py

This commit is contained in:
Tanushree Tunstall 2022-05-25 08:50:33 +01:00
parent d8041fb494
commit a2bcc3a732
2 changed files with 220 additions and 11 deletions

View file

@ -23,6 +23,9 @@ Created on Tue Aug 6 12:56:03 2019
#12) <gene>.lower()'_mcsm_ppi2.csv'
#13) <gene>.lower()'_consurf.csv'
#14) <gene>.lower()'_snap2.csv'
#15) <gene>.lower()'_provean.csv
#16) <gene>.lower()'_mmcsm_lig_results.csv'
#17) <gene>.lower()'_edXXX'!!!! TODO
# combining order
@ -43,6 +46,7 @@ from pandas import DataFrame
import numpy as np
import argparse
from functools import reduce
from sklearn.preprocessing import MinMaxScaler
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
@ -168,8 +172,7 @@ infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
# more output added
## consurf [change colnames]
## consurf [change colnames]? [add outcome category]
infilename_consurf = gene.lower() + '_consurf_grades_f.csv'
infile_consurf = outdir + 'consurf/'+ infilename_consurf
consurf_df = pd.read_csv(infile_consurf, sep = ',')
@ -179,6 +182,16 @@ infilename_snap2 = gene.lower() + '_snap2_output.csv'
infile_snap2 = outdir + 'snap2/'+ infilename_snap2
snap2_df = pd.read_csv(infile_snap2, sep = ',')
## PROVEAN
infilename_provean = gene.lower() + '_provean.csv'
infile_provean = outdir + 'provean/'+ infilename_provean
provean_df = pd.read_csv(infile_provean, sep = ',',header = None )
# mmCSM-lig
infilename_mmcsm = gene.lower() + '_mmcsm_results.csv'
infile_mmcsm = outdir + 'mmcsm_lig/single_muts/'+ infilename_mmcsm
mmcsm_lig_raw = pd.read_csv(infile_mmcsm, sep = ',')
#------------------------------------------------------------------------------
# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
geneL_na = ['gid', 'rpob']
@ -737,7 +750,117 @@ else:
, 'snap2_scaled'
, 'snap2_accuracy_pc'
, 'snap2_outcome']]
#=======================
# Provean
#=======================
provean_df.head()
provean_df.columns = ['mutationinformation', 'provean_score', 'provean_outcome']
provean_df.head()
provean_df['provean_outcome'].value_counts()
#----------------------------------------
# Rescale values in provean_score
# col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
# cut off =-2.5
# so provean scores >= (-2.5) are neutral
# and provean scores < (2.5) are deleterious
#-----------------------------------------
provean_min = provean_df['provean_score'].min()
provean_max = provean_df['provean_score'].max()
print('\nprovean_score (MIN):', provean_min
, '\nprovean_score (MAX):', provean_max)
# quick check
provean_cut_off = -2.5
if (provean_df['provean_score'] > provean_cut_off).sum() == provean_df['provean_outcome'].value_counts()['Neutral']:
print('\nPASS: Provean cut off is indeed:', provean_cut_off
, '\nNo. of values above', provean_cut_off, 'i.e classed as Neutral:'
, (provean_df['provean_score']>provean_cut_off).sum()
, '\nProvean outcome:'
, '\nNeutral:', len(provean_df.loc[provean_df['provean_score'] > provean_cut_off])
, '\nDeleterious:', len(provean_df.loc[provean_df['provean_score'] < provean_cut_off])
)
else:
sys.exit('\nFAIL: Numbers mismatch. Please check provean cut off and condition used')
# RECHECK logic!: CANNOT use this as it changes the data distribution, as seen from the his plot
# provean_scale = lambda x : x/abs(provean_min) if x < 0 else (x/provean_min if x >= 0 else 'failed')
# provean_df['provean_scaled1'] = provean_df.loc[:,'provean_score'].apply(provean_scale)
# print('\nRaw provean scores:\n', provean_df['provean_score']
# , '\n---------------------------------------------------------------'
# , '\nScaled provean scores:\n', provean_df['provean_scaled'])
# print('\nprovean raw (Max):' , provean_df['provean_score'].max()
# , '\nprovean scaled (Max):' , provean_df['provean_scaled1'].max())
# print('\nprovean raw (Min):' , provean_df['provean_score'].min()
# , '\nprovean scaled (Min):' , provean_df['provean_scaled1'].min())
scaler = MinMaxScaler()
provean_df['provean_scaled'] = scaler.fit_transform(provean_df['provean_score'].values.reshape(-1,1))
provean_df['provean_score'].hist(bins = 30)
#provean_df['provean_scaled1'].hist(bins = 10)
provean_df['provean_scaled'].hist(bins = 30)
#=======================
# mmCSM-lig
#=======================
mmcsm_lig_raw.columns
# extract specific columns: might be simpler
mmcsm_lig_df = mmcsm_lig_raw[['MUTATION', 'CHAIN', 'DDG']]
mmcsm_lig_df['CHAIN'].value_counts()
# Drop the chain column
mmcsm_lig_df.drop(['CHAIN'], axis = 1, inplace = True)
# Rename columns using lower case and consistently to allow merge later on
mmcsm_lig_df.rename({'MUTATION': 'mutationinformation'
, 'DDG': 'mmcsm_lig'}, axis = 1, inplace = True)
#----------------------------------------
# Rescale values in mmcsm_lig_affinity
# col b/w -1 and 1 so negative numbers
# stay neg and pos numbers stay positive
#-----------------------------------------
mmcsm_lig_min = mmcsm_lig_df['mmcsm_lig'].min()
mmcsm_lig_max = mmcsm_lig_df['mmcsm_lig'].max()
print('\nmmcsm_lig (MIN):', mmcsm_lig_min
, '\nmmcsm_lig (MAX):', mmcsm_lig_max)
# quick check
print('\nNo. of Stabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] >= 0]))
print('\nNo. of Destabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] < 0]))
mmcsm_ligscale = lambda x : x/abs(mmcsm_lig_min) if x < 0 else (x/mmcsm_lig_max if x >= 0 else 'failed')
mmcsm_lig_df['mmcsm_lig_scaled'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(mmcsm_ligscale)
print('\nRaw mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig']
, '\n---------------------------------------------------------------'
, '\nScaled mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig_scaled'])
print('\nmmCSM lig raw (Max):', mmcsm_lig_df['mmcsm_lig'].max()
, '\nmmCSM lig scaled (Max):', mmcsm_lig_df['mmcsm_lig_scaled'].max())
print('\nmmCSM lig raw (Min):', mmcsm_lig_df['mmcsm_lig'].min()
, '\nmmCSM lig scaled (Min):', mmcsm_lig_df['mmcsm_lig_scaled'].min())
mmcsm_lig_df['mmcsm_lig_scaled'].hist(bins = 30)
mmcsm_lig_df['mmcsm_lig'].hist(bins = 30)
#-----------------------------
# mmCSM lig outcome category:
# -ve: Destabilising
# +ve: Stabilising
#----------------------------
mmcsm_lig_df['mmcsm_lig_outcome'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(lambda x: 'Destabilising' if x < 0 else 'Stabilising')
mmcsm_lig_df[mmcsm_lig_df['mmcsm_lig']<0].count()
del(mmcsm_lig_raw)
#%%============================================================================
# Now merges begin
print('==================================='
@ -1201,9 +1324,47 @@ redundant_colsL = ['mutationinformation_snap2'
combined_all_params_f = combined_all_params.drop(redundant_colsL
, axis = 1
, errors = 'ignore')
#---------------------------------------
# MERGE 7 [UQ]: provean and mmCSM-lig dfs
#---------------------------------------
if len(combined_all_params_f) == len(provean_df) == len(mmcsm_lig_df):
print('\nPASS: length of Provean and mmCSM-lig df length match with combined_df.'
, '\nProceeding with FINAL merging before writing file...')
else:
sys.exit('\nFAIL: Cannot do final merge! Check lengths of consurf, provean and combined_all_params_f dfs')
merging_cols_m7 = detect_common_cols(provean_df, mmcsm_lig_df)
print('\nMering provean and mmcsm-lig dfs on:', merging_cols_m7)
pr_mm_df = pd.merge(provean_df
, mmcsm_lig_df
, on = merging_cols_m7)
#-----------------------------------------------
# MERGE 8 [UQ]: combined_all_params_f + pr_mm_df
#-----------------------------------------------
merging_cols_m8 = detect_common_cols(combined_all_params_f, pr_mm_df)
print('\nMering all combined_dfs + (pr_mm_df) on:', merging_cols_m8)
combined_all_params_f2 = pd.merge(combined_all_params_f
, pr_mm_df
, on = merging_cols_m8)
expected_ncols = len(combined_all_params_f.columns) + len(pr_mm_df.columns) - len(merging_cols_m8)
expected_nrows = len(combined_all_params_f2)
if len(combined_all_params_f2.columns) == expected_ncols and len(combined_all_params_f2) == expected_nrows:
print('\nPASS: All dfs combined including PROVEAN and mmCSM-lig')
else:
print('\nFAIL:lengths mismatch'
, '\nExpected ncols:', expected_ncols
, '\nGot:', len(combined_all_params_f2.columns)
, '\nExpected nrows:', expected_nrows
, '\nGot:', len(combined_all_params_f2) )
#---------------------------------------
# Add pdb_file name at the end
combined_all_params_f['pdb_file'] = gene_pdb_f
#---------------------------------------
combined_all_params_f2['pdb_file'] = gene_pdb_f
#%%============================================================================
#---------------------
@ -1213,9 +1374,9 @@ print('\nWriting file: all params')
print('\nOutput 3 filename:', outfile_comb
, '\n===================================================================\n')
combined_all_params_f.to_csv(outfile_comb, index = False)
combined_all_params_f2.to_csv(outfile_comb, index = False)
print('\nFinished writing file:'
, '\nNo. of rows:', combined_all_params_f.shape[0]
, '\nNo. of cols:', combined_all_params_f.shape[1])
, '\nNo. of rows:', combined_all_params_f2.shape[0]
, '\nNo. of cols:', combined_all_params_f2.shape[1])
#%% end of script

View file

@ -1,13 +1,17 @@
# count numbers for ML
#source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/embb.R")
#source("~/git/LSHTM_analysis/config/gid.R")
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/katg.R")
source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/config/rpob.R")
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
gene
gene_match
nrow(merged_df3)
##############################################
#=============
@ -15,7 +19,7 @@ nrow(merged_df3)
#==============
table(merged_df3$mutation_info)
sum(table(merged_df3$mutation_info))
sum(table(merged_df3$mutation_info))
table(merged_df3$mutation_info_orig)
##############################################
#=============
@ -64,3 +68,47 @@ sum(table(merged_df3$drtype_mode_labels))
table(merged_df3$lineage)
sum(table(merged_df3$lineage_labels))
# write file
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
outfile_merged_df3
write.csv(merged_df3, outfile_merged_df3)
outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
outfile_merged_df2
write.csv(merged_df2, outfile_merged_df2)
###################################################
###################################################
###################################################
source("~/git/LSHTM_analysis/config/alr.R")
source("~/git/LSHTM_analysis/config/embb.R")
source("~/git/LSHTM_analysis/config/gid.R")
source("~/git/LSHTM_analysis/config/katg.R")
source("~/git/LSHTM_analysis/config/pnca.R")
source("~/git/LSHTM_analysis/config/rpob.R")
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
df3 = read.csv(df3_filename)
# mutationinformation
length(unique((df3$mutationinformation)))
#dm _om
table(df3$mutation_info)
table(df3$mutation_info_labels)
table(df3$mutation_info_orig)
table(df3$mutation_info_labels_orig)
# test_set
na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
na_count[drug]
# training set
table(df3[drug])
# drtype: MDR and XDR
#table(df3$drtype) orig i.e. incorrect ones!
table(df3$drtype_mode_labels)