moved mmcsm_provean_ed_combine_CHECKS.py to scratch for when I need ED data for merging
This commit is contained in:
parent
1baf7fa9f0
commit
f41cd0082e
1 changed files with 0 additions and 149 deletions
|
@ -1,149 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed May 25 02:35:41 2022
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
import sys, os
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import numpy as np
|
||||
import argparse
|
||||
from functools import reduce
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
#%%
|
||||
mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/pyrazinamide/output/mmcsm_lig/single_muts/pnca_mmcsm_results.csv'
|
||||
, index_col = False)
|
||||
mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/cycloserine/output/mmcsm_lig/single_muts/alr_mmcsm_results.csv'
|
||||
, index_col = False)
|
||||
mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/ethambutol/output/mmcsm_lig/single_muts/embb_mmcsm_results.csv'
|
||||
, index_col = False)
|
||||
mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/streptomycin/output/mmcsm_lig/single_muts/gid_mmcsm_results.csv'
|
||||
, index_col = False)
|
||||
mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/isoniazid/output/mmcsm_lig/single_muts/katg_mmcsm_results.csv'
|
||||
, index_col = False)
|
||||
mmcsm_lig_raw = pd.read_csv('/home/tanu/git/Data/rifampicin/output/mmcsm_lig/single_muts/rpob_mmcsm_results.csv'
|
||||
, index_col = False)
|
||||
mmcsm_lig_raw.columns
|
||||
# drop 1st column
|
||||
#mmcsm_lig = mmcsm_lig_raw.iloc[:,1:]
|
||||
|
||||
# extract specific columns: might be simplers
|
||||
mmcsm_lig_df = mmcsm_lig_raw[['MUTATION', 'CHAIN', 'DDG']]
|
||||
mmcsm_lig_df['CHAIN'].value_counts()
|
||||
|
||||
# Drop the chain column
|
||||
mmcsm_lig_df.drop(['CHAIN'], axis = 1, inplace = True)
|
||||
|
||||
# Rename columns using lower case and consistently to allow merge later on
|
||||
mmcsm_lig_df.rename({'MUTATION': 'mutationinformation'
|
||||
, 'DDG': 'mmcsm_lig'}, axis = 1, inplace = True)
|
||||
|
||||
#----------------------------------------
|
||||
# Rescale values in mmcsm_lig_affinity
|
||||
# col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
#-----------------------------------------
|
||||
mmcsm_lig_min = mmcsm_lig_df['mmcsm_lig'].min()
|
||||
mmcsm_lig_max = mmcsm_lig_df['mmcsm_lig'].max()
|
||||
print('\nmmcsm_lig (MIN):', mmcsm_lig_min
|
||||
, '\nmmcsm_lig (MAX):', mmcsm_lig_max)
|
||||
|
||||
# quick check
|
||||
print('\nNo. of Stabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] >= 0]))
|
||||
print('\nNo. of Destabilising mmCSM mutations:', len(mmcsm_lig_df.loc[mmcsm_lig_df['mmcsm_lig'] < 0]))
|
||||
|
||||
mmcsm_ligscale = lambda x : x/abs(mmcsm_lig_min) if x < 0 else (x/mmcsm_lig_max if x >= 0 else 'failed')
|
||||
|
||||
mmcsm_lig_df['mmcsm_lig_scaled'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(mmcsm_ligscale)
|
||||
print('\nRaw mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled mmcsm_lig scores:\n', mmcsm_lig_df['mmcsm_lig_scaled'])
|
||||
|
||||
print('\nmmCSM lig raw (Max):', mmcsm_lig_df['mmcsm_lig'].max()
|
||||
, '\nmmCSM lig scaled (Max):', mmcsm_lig_df['mmcsm_lig_scaled'].max())
|
||||
|
||||
print('\nmmCSM lig raw (Min):', mmcsm_lig_df['mmcsm_lig'].min()
|
||||
, '\nmmCSM lig scaled (Min):', mmcsm_lig_df['mmcsm_lig_scaled'].min())
|
||||
|
||||
mmcsm_lig_df['mmcsm_lig_scaled'].hist(bins = 30)
|
||||
mmcsm_lig_df['mmcsm_lig'].hist(bins = 30)
|
||||
|
||||
#-----------------------------
|
||||
# mmCSM lig outcome category:
|
||||
# -ve: Destabilising
|
||||
# +ve: Stabilising
|
||||
#----------------------------
|
||||
mmcsm_lig_df['mmcsm_lig_outcome'] = mmcsm_lig_df.loc[:,'mmcsm_lig'].apply(lambda x: 'Destabilising' if x < 0 else 'Stabilising')
|
||||
mmcsm_lig_df[mmcsm_lig_df['mmcsm_lig']<0].count()
|
||||
|
||||
###############################################################################
|
||||
#%% Provean data
|
||||
provean_df = pd.read_csv('/home/tanu/git/Data/pyrazinamide/output/provean/pnca_provean.csv'
|
||||
, header = None)
|
||||
provean_df = pd.read_csv('/home/tanu/git/Data/cycloserine/output/provean/alr_provean.csv'
|
||||
, header = None)
|
||||
provean_df = pd.read_csv('/home/tanu/git/Data/ethambutol/output/provean/embb_provean.csv'
|
||||
, header = None)
|
||||
provean_df = pd.read_csv('/home/tanu/git/Data/streptomycin/output/provean/gid_provean.csv'
|
||||
, header = None)
|
||||
provean_df = pd.read_csv('/home/tanu/git/Data/isoniazid/output/provean/katg_provean.csv'
|
||||
, header = None)
|
||||
provean_df = pd.read_csv('/home/tanu/git/Data/rifampicin/output/provean/rpob_provean.csv'
|
||||
, header = None)
|
||||
|
||||
provean_df.head()
|
||||
provean_df.columns = ['mutationinformation', 'provean_score', 'provean_outcome']
|
||||
provean_df.head()
|
||||
provean_df['provean_outcome'].value_counts()
|
||||
|
||||
#----------------------------------------
|
||||
# Rescale values in provean_score
|
||||
# col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
# cut off =-2.5
|
||||
# so provean scores >= (-2.5) are neutral
|
||||
# and provean scores < (2.5) are deleterious
|
||||
#-----------------------------------------
|
||||
provean_min = provean_df['provean_score'].min()
|
||||
provean_max = provean_df['provean_score'].max()
|
||||
print('\nprovean_score (MIN):', provean_min
|
||||
, '\nprovean_score (MAX):', provean_max)
|
||||
|
||||
# quick check
|
||||
provean_cut_off = -2.5
|
||||
|
||||
if (provean_df['provean_score'] > provean_cut_off).sum() == provean_df['provean_outcome'].value_counts()['Neutral']:
|
||||
print('\nPASS: Provean cut off is indeed:', provean_cut_off
|
||||
, '\nNo. of values above', provean_cut_off, 'i.e classed as Neutral:'
|
||||
, (provean_df['provean_score']>provean_cut_off).sum()
|
||||
, '\nProvean outcome:'
|
||||
, '\nNeutral:', len(provean_df.loc[provean_df['provean_score'] > provean_cut_off])
|
||||
, '\nDeleterious:', len(provean_df.loc[provean_df['provean_score'] < provean_cut_off])
|
||||
)
|
||||
else:
|
||||
sys.exit('\nFAIL: Numbers mismatch. Please check provean cut off and condition used')
|
||||
|
||||
|
||||
# RECHECK logic!: CANNOT use this as it changes the data distribution, as seen from the his plot
|
||||
# provean_scale = lambda x : x/abs(provean_min) if x < 0 else (x/provean_min if x >= 0 else 'failed')
|
||||
# provean_df['provean_scaled1'] = provean_df.loc[:,'provean_score'].apply(provean_scale)
|
||||
# print('\nRaw provean scores:\n', provean_df['provean_score']
|
||||
# , '\n---------------------------------------------------------------'
|
||||
# , '\nScaled provean scores:\n', provean_df['provean_scaled'])
|
||||
|
||||
|
||||
# print('\nprovean raw (Max):' , provean_df['provean_score'].max()
|
||||
# , '\nprovean scaled (Max):' , provean_df['provean_scaled1'].max())
|
||||
|
||||
# print('\nprovean raw (Min):' , provean_df['provean_score'].min()
|
||||
# , '\nprovean scaled (Min):' , provean_df['provean_scaled1'].min())
|
||||
|
||||
|
||||
scaler = MinMaxScaler()
|
||||
provean_df['provean_scaled'] = scaler.fit_transform(provean_df['provean_score'].values.reshape(-1,1))
|
||||
|
||||
provean_df['provean_score'].hist(bins = 30)
|
||||
#provean_df['provean_scaled1'].hist(bins = 10)
|
||||
provean_df['provean_scaled'].hist(bins = 30)
|
Loading…
Add table
Add a link
Reference in a new issue