diff --git a/scripts/mmcsm_provean_ed_combine_CHECKS.py b/scripts/mmcsm_provean_ed_combine_CHECKS.py deleted file mode 100644 index 8ef1999..0000000 --- a/scripts/mmcsm_provean_ed_combine_CHECKS.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Wed May 25 02:35:41 2022 - -@author: tanu -""" -import sys, os -import pandas as pd -from pandas import DataFrame -import numpy as np -import argparse -from functools import reduce -from sklearn.preprocessing import MinMaxScaler -#%% -mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/pyrazinamide/output/mmcsm_lig/single_muts/pnca_mmcsm_results.csv' - , index_col = False) -mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/cycloserine/output/mmcsm_lig/single_muts/alr_mmcsm_results.csv' - , index_col = False) -mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/ethambutol/output/mmcsm_lig/single_muts/embb_mmcsm_results.csv' - , index_col = False) -mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/streptomycin/output/mmcsm_lig/single_muts/gid_mmcsm_results.csv' - , index_col = False) -mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/isoniazid/output/mmcsm_lig/single_muts/katg_mmcsm_results.csv' - , index_col = False) -mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/rifampicin/output/mmcsm_lig/single_muts/rpob_mmcsm_results.csv' - , index_col = False) -mmcsm_lig_raw.columns -# drop 1st column -#mmcsm_lig = mmcsm_lig_raw.iloc[:,1:] - -# extract specific columns: might be simplers -mmcsm_lig_df = mmcsm_lig_raw[['MUTATION', 'CHAIN', 'DDG']] -mmcsm_lig_df['CHAIN'].value_counts() - -# Drop the chain column -mmcsm_lig_df.drop(['CHAIN'], axis = 1, inplace = True) - -# Rename columns using lower case and consistently to allow merge later on -mmcsm_lig_df.rename({'MUTATION': 'mutationinformation' - , 'DDG': 'mmcsm_lig'}, axis = 1, inplace = True) - -#---------------------------------------- -# Rescale values in mmcsm_lig_affinity -# col b/w -1 and 1 so negative numbers -# stay neg and pos numbers stay positive -#----------------------------------------- -mmcsm_lig_min = mmcsm_lig_df['mmcsm_lig'].min() -mmcsm_lig_max = mmcsm_lig_df['mmcsm_lig'].max() -print('\nmmcsm_lig (MIN):', mmcsm_lig_min - , '\nmmcsm_lig (MAX):', mmcsm_lig_max) - -# quick check -print('\nNo. of Stabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] >= 0])) -print('\nNo. of Destabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] < 0])) - -mmcsm_ligscale = lambda x : x/abs(mmcsm_lig_min) if x < 0 else (x/mmcsm_lig_max if x >= 0 else 'failed') - -mmcsm_lig_df['mmcsm_lig_scaled'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(mmcsm_ligscale) -print('\nRaw mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig'] - , '\n---------------------------------------------------------------' - , '\nScaled mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig_scaled']) - -print('\nmmCSM lig raw (Max):', mmcsm_lig_df['mmcsm_lig'].max() - , '\nmmCSM lig scaled (Max):', mmcsm_lig_df['mmcsm_lig_scaled'].max()) - -print('\nmmCSM lig raw (Min):', mmcsm_lig_df['mmcsm_lig'].min() - , '\nmmCSM lig scaled (Min):', mmcsm_lig_df['mmcsm_lig_scaled'].min()) - -mmcsm_lig_df['mmcsm_lig_scaled'].hist(bins = 30) -mmcsm_lig_df['mmcsm_lig'].hist(bins = 30) - -#----------------------------- -# mmCSM lig outcome category: -# -ve: Destabilising -# +ve: Stabilising -#---------------------------- -mmcsm_lig_df['mmcsm_lig_outcome'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(lambda x: 'Destabilising' if x < 0 else 'Stabilising') -mmcsm_lig_df[mmcsm_lig_df['mmcsm_lig']<0].count() - -############################################################################### -#%% Provean data -provean_df = pd.read_csv('/home/tanu/git/Data/pyrazinamide/output/provean/pnca_provean.csv' - , header = None) -provean_df = pd.read_csv('/home/tanu/git/Data/cycloserine/output/provean/alr_provean.csv' - , header = None) -provean_df = pd.read_csv('/home/tanu/git/Data/ethambutol/output/provean/embb_provean.csv' - , header = None) -provean_df = pd.read_csv('/home/tanu/git/Data/streptomycin/output/provean/gid_provean.csv' - , header = None) -provean_df = pd.read_csv('/home/tanu/git/Data/isoniazid/output/provean/katg_provean.csv' - , header = None) -provean_df = pd.read_csv('/home/tanu/git/Data/rifampicin/output/provean/rpob_provean.csv' - , header = None) - -provean_df.head() -provean_df.columns = ['mutationinformation', 'provean_score', 'provean_outcome'] -provean_df.head() -provean_df['provean_outcome'].value_counts() - -#---------------------------------------- -# Rescale values in provean_score -# col b/w -1 and 1 so negative numbers -# stay neg and pos numbers stay positive -# cut off =-2.5 -# so provean scores >= (-2.5) are neutral -# and provean scores < (2.5) are deleterious -#----------------------------------------- -provean_min = provean_df['provean_score'].min() -provean_max = provean_df['provean_score'].max() -print('\nprovean_score (MIN):', provean_min - , '\nprovean_score (MAX):', provean_max) - -# quick check -provean_cut_off = -2.5 - -if (provean_df['provean_score'] > provean_cut_off).sum() == provean_df['provean_outcome'].value_counts()['Neutral']: - print('\nPASS: Provean cut off is indeed:', provean_cut_off - , '\nNo. of values above', provean_cut_off, 'i.e classed as Neutral:' - , (provean_df['provean_score']>provean_cut_off).sum() - , '\nProvean outcome:' - , '\nNeutral:', len(provean_df.loc[provean_df['provean_score'] > provean_cut_off]) - , '\nDeleterious:', len(provean_df.loc[provean_df['provean_score'] < provean_cut_off]) - ) -else: - sys.exit('\nFAIL: Numbers mismatch. Please check provean cut off and condition used') - - -# RECHECK logic!: CANNOT use this as it changes the data distribution, as seen from the his plot -# provean_scale = lambda x : x/abs(provean_min) if x < 0 else (x/provean_min if x >= 0 else 'failed') -# provean_df['provean_scaled1'] = provean_df.loc[:,'provean_score'].apply(provean_scale) -# print('\nRaw provean scores:\n', provean_df['provean_score'] -# , '\n---------------------------------------------------------------' -# , '\nScaled provean scores:\n', provean_df['provean_scaled']) - - -# print('\nprovean raw (Max):' , provean_df['provean_score'].max() -# , '\nprovean scaled (Max):' , provean_df['provean_scaled1'].max()) - -# print('\nprovean raw (Min):' , provean_df['provean_score'].min() -# , '\nprovean scaled (Min):' , provean_df['provean_scaled1'].min()) - - -scaler = MinMaxScaler() -provean_df['provean_scaled'] = scaler.fit_transform(provean_df['provean_score'].values.reshape(-1,1)) - -provean_df['provean_score'].hist(bins = 30) -#provean_df['provean_scaled1'].hist(bins = 10) -provean_df['provean_scaled'].hist(bins = 30)