210 lines
No EOL
8.4 KiB
Python
Executable file
210 lines
No EOL
8.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Wed Aug 19 14:33:51 2020
|
|
|
|
@author: tanu
|
|
"""
|
|
#%% load packages
|
|
import os,sys
|
|
homedir = os.path.expanduser('~')
|
|
import subprocess
|
|
import argparse
|
|
import requests
|
|
import re
|
|
import time
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
import numpy as np
|
|
from pandas.api.types import is_string_dtype
|
|
from pandas.api.types import is_numeric_dtype
|
|
|
|
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
|
|
from reference_dict import up_3letter_aa_dict
|
|
from reference_dict import oneletter_aa_dict
|
|
#%%============================================================================
|
|
|
|
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv, gene_name):
|
|
"""
|
|
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
|
|
which is the result of combining all mcsm_ppi2 batch results, and using
|
|
bash scripts to combine all the batch results into one file.
|
|
Formatting df to a pandas df and output as csv.
|
|
@type string
|
|
|
|
@return (not true) formatted csv for mcsm_ppi2 output
|
|
@type pandas df
|
|
|
|
"""
|
|
#############
|
|
# Read file
|
|
#############
|
|
mcsm_ppi2_data_raw = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')
|
|
|
|
# strip white space from both ends in all columns
|
|
mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
|
|
|
dforig_shape = mcsm_ppi2_data.shape
|
|
print('dimensions of input file:', dforig_shape)
|
|
|
|
#############
|
|
# Map 3 letter
|
|
# code to one
|
|
#############
|
|
# initialise a sub dict that is lookup dict for
|
|
# 3-LETTER aa code to 1-LETTER aa code
|
|
lookup_dict = dict()
|
|
for k, v in up_3letter_aa_dict.items():
|
|
lookup_dict[k] = v['one_letter_code']
|
|
wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
|
|
mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)
|
|
mut = mcsm_ppi2_data['mutant'].squeeze()
|
|
mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)
|
|
|
|
# #############
|
|
# # CHECK
|
|
# # Map 1 letter
|
|
# # code to 3Upper
|
|
# #############
|
|
# # initialise a sub dict that is lookup dict for
|
|
# # 3-LETTER aa code to 1-LETTER aa code
|
|
# lookup_dict = dict()
|
|
# for k, v in oneletter_aa_dict.items():
|
|
# lookup_dict[k] = v['three_letter_code_upper']
|
|
# wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
|
|
# mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)
|
|
# mut = mcsm_ppi2_data['m_type'].squeeze()
|
|
# mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)
|
|
|
|
# # check
|
|
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
|
|
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
|
|
#%%=====================================================================
|
|
# add offset specified position number for rpob since 5uhc with chain 'C' was
|
|
# used to run the analysis
|
|
|
|
geneL_sp = ['rpob']
|
|
if gene_name.lower() in geneL_sp:
|
|
offset = 6
|
|
chain_orig = 'A'
|
|
|
|
# Add offset corrected position number. matching with rpob nsSNPs used for mCSM-lig
|
|
# and also add corresponding chain id matching with rpob nsSNPs used for mCSM-lig
|
|
mcsm_ppi2_data['position'] = mcsm_ppi2_data['res-number'] - offset
|
|
mcsm_ppi2_data['chain'] = chain_orig
|
|
mcsm_ppi2_data['5uhc_offset'] = offset
|
|
|
|
#############
|
|
# rename cols
|
|
#############
|
|
# format colnames: all lowercase and consistent colnames
|
|
mcsm_ppi2_data.columns
|
|
print('Assigning meaningful colnames'
|
|
, '\n=======================================================')
|
|
|
|
my_colnames_dict = {'chain' : 'chain'
|
|
, 'position' : 'position'
|
|
, '5uhc_offset' : '5uhc_offset'
|
|
, 'wild-type' : 'wt_upper'
|
|
, 'res-number' : '5uhc_position'
|
|
, 'mutant' : 'mut_upper'
|
|
, 'distance-to-interface': 'interface_dist'
|
|
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
|
|
, 'affinity' : 'mcsm_ppi2_outcome'
|
|
, 'w_type' : 'wild_type' # one letter amino acid code
|
|
, 'm_type' : 'mutant_type' # one letter amino acid code
|
|
}
|
|
else:
|
|
my_colnames_dict = {'chain' : 'chain'
|
|
, 'wild-type' : 'wt_upper'
|
|
, 'res-number' : 'position'
|
|
, 'mutant' : 'mut_upper'
|
|
, 'distance-to-interface': 'interface_dist'
|
|
, 'mcsm-ppi2-prediction' : 'mcsm_ppi2_affinity'
|
|
, 'affinity' : 'mcsm_ppi2_outcome'
|
|
, 'w_type' : 'wild_type' # one letter amino acid code
|
|
, 'm_type' : 'mutant_type' # one letter amino acid code
|
|
}
|
|
#%%==============================================================================
|
|
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
|
|
mcsm_ppi2_data.columns
|
|
|
|
#############
|
|
# create mutationinformation column
|
|
#############
|
|
#mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
|
|
mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']
|
|
|
|
#%%=====================================================================
|
|
#########################
|
|
# scale mcsm_ppi2 values
|
|
#########################
|
|
# Rescale values in mcsm_ppi2_affinity col b/w -1 and 1 so negative numbers
|
|
# stay neg and pos numbers stay positive
|
|
mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min()
|
|
mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max()
|
|
|
|
mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')
|
|
|
|
mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
|
|
print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
|
|
, '\n---------------------------------------------------------------'
|
|
, '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])
|
|
|
|
c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
|
|
mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')
|
|
|
|
c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
|
|
mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')
|
|
|
|
if mcsm_ppi2_pos == mcsm_ppi2_pos2:
|
|
print('\nPASS: Affinity values scaled correctly')
|
|
else:
|
|
print('\nFAIL: Affinity values scaled numbers MISmatch'
|
|
, '\nExpected number:', mcsm_ppi2_pos
|
|
, '\nGot:', mcsm_ppi2_pos2
|
|
, '\n======================================================')
|
|
#%%=====================================================================
|
|
###################
|
|
# reorder columns
|
|
###################
|
|
mcsm_ppi2_data.columns
|
|
|
|
#---------------------
|
|
# Determine col order
|
|
#---------------------
|
|
|
|
core_cols = ['mutationinformation'
|
|
, 'mcsm_ppi2_affinity'
|
|
, 'mcsm_ppi2_scaled'
|
|
, 'mcsm_ppi2_outcome'
|
|
, 'interface_dist'
|
|
, 'wild_type'
|
|
, 'position'
|
|
, 'mutant_type'
|
|
, 'wt_upper'
|
|
, 'mut_upper'
|
|
, 'chain']
|
|
|
|
if gene_name.lower() in geneL_sp:
|
|
|
|
column_order = core_cols + ['5uhc_offset', '5uhc_position']
|
|
|
|
else:
|
|
|
|
column_order = core_cols.copy()
|
|
|
|
#--------------
|
|
# reorder now
|
|
#--------------
|
|
mcsm_ppi2_dataf = mcsm_ppi2_data[column_order]
|
|
|
|
#%%============================================================================
|
|
###################
|
|
# Sort df based on
|
|
# position columns
|
|
###################
|
|
mcsm_ppi2_dataf.sort_values(by = ['position', 'mutant_type'], inplace = True, ascending = True)
|
|
|
|
return(mcsm_ppi2_dataf)
|
|
#%%##################################################################### |