added mmcsm_lig and provean dfs merges in comnining_df.py
This commit is contained in:
parent
d8041fb494
commit
a2bcc3a732
2 changed files with 220 additions and 11 deletions
|
@ -23,6 +23,9 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
#12) <gene>.lower()'_mcsm_ppi2.csv'
|
||||
#13) <gene>.lower()'_consurf.csv'
|
||||
#14) <gene>.lower()'_snap2.csv'
|
||||
#15) <gene>.lower()'_provean.csv
|
||||
#16) <gene>.lower()'_mmcsm_lig_results.csv'
|
||||
#17) <gene>.lower()'_edXXX'!!!! TODO
|
||||
|
||||
# combining order
|
||||
|
||||
|
@ -43,6 +46,7 @@ from pandas import DataFrame
|
|||
import numpy as np
|
||||
import argparse
|
||||
from functools import reduce
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
|
@ -168,8 +172,7 @@ infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
|
|||
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)
|
||||
|
||||
# more output added
|
||||
## consurf [change colnames]
|
||||
|
||||
## consurf [change colnames]? [add outcome category]
|
||||
infilename_consurf = gene.lower() + '_consurf_grades_f.csv'
|
||||
infile_consurf = outdir + 'consurf/'+ infilename_consurf
|
||||
consurf_df = pd.read_csv(infile_consurf, sep = ',')
|
||||
|
@ -179,6 +182,16 @@ infilename_snap2 = gene.lower() + '_snap2_output.csv'
|
|||
infile_snap2 = outdir + 'snap2/'+ infilename_snap2
|
||||
snap2_df = pd.read_csv(infile_snap2, sep = ',')
|
||||
|
||||
## PROVEAN
|
||||
infilename_provean = gene.lower() + '_provean.csv'
|
||||
infile_provean = outdir + 'provean/'+ infilename_provean
|
||||
provean_df = pd.read_csv(infile_provean, sep = ',',header = None )
|
||||
|
||||
# mmCSM-lig
|
||||
infilename_mmcsm = gene.lower() + '_mmcsm_results.csv'
|
||||
infile_mmcsm = outdir + 'mmcsm_lig/single_muts/'+ infilename_mmcsm
|
||||
mmcsm_lig_raw = pd.read_csv(infile_mmcsm, sep = ',')
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
|
||||
geneL_na = ['gid', 'rpob']
|
||||
|
@ -738,6 +751,116 @@ else:
|
|||
, 'snap2_accuracy_pc'
|
||||
, 'snap2_outcome']]
|
||||
|
||||
#=======================
|
||||
# Provean
|
||||
#=======================
|
||||
provean_df.head()
|
||||
provean_df.columns = ['mutationinformation', 'provean_score', 'provean_outcome']
|
||||
provean_df.head()
|
||||
provean_df['provean_outcome'].value_counts()
|
||||
|
||||
#----------------------------------------
|
||||
# Rescale values in provean_score
|
||||
# col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
# cut off =-2.5
|
||||
# so provean scores >= (-2.5) are neutral
|
||||
# and provean scores < (2.5) are deleterious
|
||||
#-----------------------------------------
|
||||
provean_min = provean_df['provean_score'].min()
|
||||
provean_max = provean_df['provean_score'].max()
|
||||
print('\nprovean_score (MIN):', provean_min
|
||||
, '\nprovean_score (MAX):', provean_max)
|
||||
|
||||
# quick check
|
||||
provean_cut_off = -2.5
|
||||
|
||||
if (provean_df['provean_score'] > provean_cut_off).sum() == provean_df['provean_outcome'].value_counts()['Neutral']:
|
||||
print('\nPASS: Provean cut off is indeed:', provean_cut_off
|
||||
, '\nNo. of values above', provean_cut_off, 'i.e classed as Neutral:'
|
||||
, (provean_df['provean_score']>provean_cut_off).sum()
|
||||
, '\nProvean outcome:'
|
||||
, '\nNeutral:', len(provean_df.loc[provean_df['provean_score'] > provean_cut_off])
|
||||
, '\nDeleterious:', len(provean_df.loc[provean_df['provean_score'] < provean_cut_off])
|
||||
)
|
||||
else:
|
||||
sys.exit('\nFAIL: Numbers mismatch. Please check provean cut off and condition used')
|
||||
|
||||
# RECHECK logic!: CANNOT use this as it changes the data distribution, as seen from the his plot
|
||||
# provean_scale = lambda x : x/abs(provean_min) if x < 0 else (x/provean_min if x >= 0 else 'failed')
|
||||
# provean_df['provean_scaled1'] = provean_df.loc[:,'provean_score'].apply(provean_scale)
|
||||
# print('\nRaw provean scores:\n', provean_df['provean_score']
|
||||
# , '\n---------------------------------------------------------------'
|
||||
# , '\nScaled provean scores:\n', provean_df['provean_scaled'])
|
||||
|
||||
# print('\nprovean raw (Max):' , provean_df['provean_score'].max()
|
||||
# , '\nprovean scaled (Max):' , provean_df['provean_scaled1'].max())
|
||||
# print('\nprovean raw (Min):' , provean_df['provean_score'].min()
|
||||
# , '\nprovean scaled (Min):' , provean_df['provean_scaled1'].min())
|
||||
|
||||
scaler = MinMaxScaler()
|
||||
provean_df['provean_scaled'] = scaler.fit_transform(provean_df['provean_score'].values.reshape(-1,1))
|
||||
|
||||
provean_df['provean_score'].hist(bins = 30)
|
||||
#provean_df['provean_scaled1'].hist(bins = 10)
|
||||
provean_df['provean_scaled'].hist(bins = 30)
|
||||
|
||||
#=======================
|
||||
# mmCSM-lig
|
||||
#=======================
|
||||
mmcsm_lig_raw.columns
|
||||
|
||||
# extract specific columns: might be simpler
|
||||
mmcsm_lig_df = mmcsm_lig_raw[['MUTATION', 'CHAIN', 'DDG']]
|
||||
mmcsm_lig_df['CHAIN'].value_counts()
|
||||
|
||||
# Drop the chain column
|
||||
mmcsm_lig_df.drop(['CHAIN'], axis = 1, inplace = True)
|
||||
|
||||
# Rename columns using lower case and consistently to allow merge later on
|
||||
mmcsm_lig_df.rename({'MUTATION': 'mutationinformation'
|
||||
, 'DDG': 'mmcsm_lig'}, axis = 1, inplace = True)
|
||||
|
||||
#----------------------------------------
|
||||
# Rescale values in mmcsm_lig_affinity
|
||||
# col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
#-----------------------------------------
|
||||
mmcsm_lig_min = mmcsm_lig_df['mmcsm_lig'].min()
|
||||
mmcsm_lig_max = mmcsm_lig_df['mmcsm_lig'].max()
|
||||
print('\nmmcsm_lig (MIN):', mmcsm_lig_min
|
||||
, '\nmmcsm_lig (MAX):', mmcsm_lig_max)
|
||||
|
||||
# quick check
|
||||
print('\nNo. of Stabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] >= 0]))
|
||||
print('\nNo. of Destabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] < 0]))
|
||||
|
||||
mmcsm_ligscale = lambda x : x/abs(mmcsm_lig_min) if x < 0 else (x/mmcsm_lig_max if x >= 0 else 'failed')
|
||||
|
||||
mmcsm_lig_df['mmcsm_lig_scaled'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(mmcsm_ligscale)
|
||||
print('\nRaw mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig_scaled'])
|
||||
|
||||
print('\nmmCSM lig raw (Max):', mmcsm_lig_df['mmcsm_lig'].max()
|
||||
, '\nmmCSM lig scaled (Max):', mmcsm_lig_df['mmcsm_lig_scaled'].max())
|
||||
|
||||
print('\nmmCSM lig raw (Min):', mmcsm_lig_df['mmcsm_lig'].min()
|
||||
, '\nmmCSM lig scaled (Min):', mmcsm_lig_df['mmcsm_lig_scaled'].min())
|
||||
|
||||
mmcsm_lig_df['mmcsm_lig_scaled'].hist(bins = 30)
|
||||
mmcsm_lig_df['mmcsm_lig'].hist(bins = 30)
|
||||
|
||||
#-----------------------------
|
||||
# mmCSM lig outcome category:
|
||||
# -ve: Destabilising
|
||||
# +ve: Stabilising
|
||||
#----------------------------
|
||||
mmcsm_lig_df['mmcsm_lig_outcome'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(lambda x: 'Destabilising' if x < 0 else 'Stabilising')
|
||||
mmcsm_lig_df[mmcsm_lig_df['mmcsm_lig']<0].count()
|
||||
|
||||
del(mmcsm_lig_raw)
|
||||
|
||||
#%%============================================================================
|
||||
# Now merges begin
|
||||
print('==================================='
|
||||
|
@ -1201,9 +1324,47 @@ redundant_colsL = ['mutationinformation_snap2'
|
|||
combined_all_params_f = combined_all_params.drop(redundant_colsL
|
||||
, axis = 1
|
||||
, errors = 'ignore')
|
||||
#---------------------------------------
|
||||
# MERGE 7 [UQ]: provean and mmCSM-lig dfs
|
||||
#---------------------------------------
|
||||
if len(combined_all_params_f) == len(provean_df) == len(mmcsm_lig_df):
|
||||
print('\nPASS: length of Provean and mmCSM-lig df length match with combined_df.'
|
||||
, '\nProceeding with FINAL merging before writing file...')
|
||||
else:
|
||||
sys.exit('\nFAIL: Cannot do final merge! Check lengths of consurf, provean and combined_all_params_f dfs')
|
||||
|
||||
merging_cols_m7 = detect_common_cols(provean_df, mmcsm_lig_df)
|
||||
print('\nMering provean and mmcsm-lig dfs on:', merging_cols_m7)
|
||||
|
||||
pr_mm_df = pd.merge(provean_df
|
||||
, mmcsm_lig_df
|
||||
, on = merging_cols_m7)
|
||||
|
||||
#-----------------------------------------------
|
||||
# MERGE 8 [UQ]: combined_all_params_f + pr_mm_df
|
||||
#-----------------------------------------------
|
||||
merging_cols_m8 = detect_common_cols(combined_all_params_f, pr_mm_df)
|
||||
print('\nMering all combined_dfs + (pr_mm_df) on:', merging_cols_m8)
|
||||
|
||||
combined_all_params_f2 = pd.merge(combined_all_params_f
|
||||
, pr_mm_df
|
||||
, on = merging_cols_m8)
|
||||
|
||||
expected_ncols = len(combined_all_params_f.columns) + len(pr_mm_df.columns) - len(merging_cols_m8)
|
||||
expected_nrows = len(combined_all_params_f2)
|
||||
|
||||
if len(combined_all_params_f2.columns) == expected_ncols and len(combined_all_params_f2) == expected_nrows:
|
||||
print('\nPASS: All dfs combined including PROVEAN and mmCSM-lig')
|
||||
else:
|
||||
print('\nFAIL:lengths mismatch'
|
||||
, '\nExpected ncols:', expected_ncols
|
||||
, '\nGot:', len(combined_all_params_f2.columns)
|
||||
, '\nExpected nrows:', expected_nrows
|
||||
, '\nGot:', len(combined_all_params_f2) )
|
||||
#---------------------------------------
|
||||
# Add pdb_file name at the end
|
||||
combined_all_params_f['pdb_file'] = gene_pdb_f
|
||||
#---------------------------------------
|
||||
combined_all_params_f2['pdb_file'] = gene_pdb_f
|
||||
|
||||
#%%============================================================================
|
||||
#---------------------
|
||||
|
@ -1213,9 +1374,9 @@ print('\nWriting file: all params')
|
|||
print('\nOutput 3 filename:', outfile_comb
|
||||
, '\n===================================================================\n')
|
||||
|
||||
combined_all_params_f.to_csv(outfile_comb, index = False)
|
||||
combined_all_params_f2.to_csv(outfile_comb, index = False)
|
||||
|
||||
print('\nFinished writing file:'
|
||||
, '\nNo. of rows:', combined_all_params_f.shape[0]
|
||||
, '\nNo. of cols:', combined_all_params_f.shape[1])
|
||||
, '\nNo. of rows:', combined_all_params_f2.shape[0]
|
||||
, '\nNo. of cols:', combined_all_params_f2.shape[1])
|
||||
#%% end of script
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
# count numbers for ML
|
||||
|
||||
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||
source("~/git/LSHTM_analysis/config/embb.R")
|
||||
|
||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||
source("~/git/LSHTM_analysis/config/pnca.R")
|
||||
#source("~/git/LSHTM_analysis/config/rpob.R")
|
||||
|
||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||
|
||||
gene
|
||||
gene_match
|
||||
|
||||
nrow(merged_df3)
|
||||
##############################################
|
||||
#=============
|
||||
|
@ -15,7 +19,7 @@ nrow(merged_df3)
|
|||
#==============
|
||||
table(merged_df3$mutation_info)
|
||||
sum(table(merged_df3$mutation_info))
|
||||
sum(table(merged_df3$mutation_info))
|
||||
table(merged_df3$mutation_info_orig)
|
||||
##############################################
|
||||
|
||||
#=============
|
||||
|
@ -64,3 +68,47 @@ sum(table(merged_df3$drtype_mode_labels))
|
|||
table(merged_df3$lineage)
|
||||
sum(table(merged_df3$lineage_labels))
|
||||
|
||||
# write file
|
||||
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||
outfile_merged_df3
|
||||
write.csv(merged_df3, outfile_merged_df3)
|
||||
|
||||
outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
||||
outfile_merged_df2
|
||||
write.csv(merged_df2, outfile_merged_df2)
|
||||
|
||||
###################################################
|
||||
###################################################
|
||||
###################################################
|
||||
|
||||
source("~/git/LSHTM_analysis/config/alr.R")
|
||||
source("~/git/LSHTM_analysis/config/embb.R")
|
||||
source("~/git/LSHTM_analysis/config/gid.R")
|
||||
source("~/git/LSHTM_analysis/config/katg.R")
|
||||
source("~/git/LSHTM_analysis/config/pnca.R")
|
||||
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||
|
||||
df3_filename = paste0("/home/tanu/git/Data/", drug, "/output/", tolower(gene), "_merged_df3.csv")
|
||||
df3 = read.csv(df3_filename)
|
||||
|
||||
# mutationinformation
|
||||
length(unique((df3$mutationinformation)))
|
||||
|
||||
#dm _om
|
||||
table(df3$mutation_info)
|
||||
table(df3$mutation_info_labels)
|
||||
table(df3$mutation_info_orig)
|
||||
table(df3$mutation_info_labels_orig)
|
||||
|
||||
# test_set
|
||||
na_count <-sapply(df3, function(y) sum(length(which(is.na(y)))))
|
||||
na_count[drug]
|
||||
|
||||
# training set
|
||||
table(df3[drug])
|
||||
|
||||
# drtype: MDR and XDR
|
||||
#table(df3$drtype) orig i.e. incorrect ones!
|
||||
table(df3$drtype_mode_labels)
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue