added script for formatting mcsm_na results
This commit is contained in:
parent
50cf6ca3ac
commit
93482df47a
2 changed files with 183 additions and 0 deletions
135
mcsm_na/format_results_mcsm_na.py
Normal file
135
mcsm_na/format_results_mcsm_na.py
Normal file
|
@ -0,0 +1,135 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Aug 19 14:33:51 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
#%%#####################################################################
|
||||
|
||||
def format_mcsm_na_output(mcsm_na_output_tsv):
|
||||
"""
|
||||
@param mcsm_na_na_outputcsv: file containing mcsm_na_na results for all muts
|
||||
which is the result of combining all mcsm_na_na batch results, and using
|
||||
bash scripts to combine all the batch results into one file.
|
||||
This is post run_get_results_mcsm_na_na.py
|
||||
Formatting df to a pandas df and output as tsv.
|
||||
@type string
|
||||
|
||||
@return (not true) formatted mcsm_na_na output
|
||||
@type pandas df
|
||||
|
||||
"""
|
||||
#############
|
||||
# Read file
|
||||
#############
|
||||
mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t')
|
||||
|
||||
# strip white space from both ends in all columns
|
||||
mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||
|
||||
dforig_shape = mcsm_na_data.shape
|
||||
print('dimensions of input file:', dforig_shape)
|
||||
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase and consistent colnames
|
||||
mcsm_na_data.columns
|
||||
print('Assigning meaningful colnames'
|
||||
, '\n=======================================================')
|
||||
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
|
||||
, 'CHAIN': 'chain' # {wild_type}<position>{mutant_type}
|
||||
, 'WILD_RES': 'wild_type' # one letter amino acid code
|
||||
, 'RES_POS': 'position' # number
|
||||
, 'MUT_RES': 'mutant_type' # one letter amino acid code
|
||||
, 'RSA': 'rsa' # single letter (caps)
|
||||
, 'PRED_DDG': 'mcsm_na_affinity'} # 3-letter code
|
||||
|
||||
mcsm_na_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
mcsm_na_data.columns
|
||||
|
||||
#%%============================================================================
|
||||
#############
|
||||
# create mutationinformation column
|
||||
#############
|
||||
mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
|
||||
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# Create col: mcsm_na_outcome
|
||||
#############
|
||||
# classification based on mcsm_na_affinity values
|
||||
print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity')
|
||||
print('Sanity check:')
|
||||
# count positive values in the mcsm_na_affinity column
|
||||
c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count()
|
||||
mcsm_na_pos = c.get(key = 'mcsm_na_affinity')
|
||||
|
||||
# Assign category based on sign (+ve : I_affinity, -ve: R_affinity)
|
||||
mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity')
|
||||
print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts())
|
||||
|
||||
#if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']:
|
||||
# print('PASS: mcsm_na_outcome assigned correctly')
|
||||
#else:
|
||||
# print('FAIL: mcsm_na_outcome assigned incorrectly'
|
||||
# , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos
|
||||
# , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']
|
||||
# , '\n======================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# scale mcsm_na values
|
||||
#############
|
||||
# Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min()
|
||||
mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max()
|
||||
|
||||
mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale)
|
||||
print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled'])
|
||||
|
||||
c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count()
|
||||
mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity')
|
||||
|
||||
if mcsm_na_pos == mcsm_na_pos2:
|
||||
print('\nPASS: Affinity values scaled correctly')
|
||||
else:
|
||||
print('\nFAIL: Affinity values scaled numbers MISmatch'
|
||||
, '\nExpected number:', mcsm_na_pos
|
||||
, '\nGot:', mcsm_na_pos2
|
||||
, '\n======================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# reorder columns
|
||||
#############
|
||||
mcsm_na_data.columns
|
||||
mcsm_na_data = mcsm_na_data[['mutationinformation'
|
||||
, 'mcsm_na_affinity'
|
||||
, 'mcsm_na_scaled'
|
||||
, 'mcsm_na_outcome'
|
||||
, 'rsa'
|
||||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
, 'chain'
|
||||
, 'pdb_file']]
|
||||
return(mcsm_na_data)
|
||||
#%%#####################################################################
|
||||
|
48
mcsm_na/run_format_results_mcsm_na.py
Normal file
48
mcsm_na/run_format_results_mcsm_na.py
Normal file
|
@ -0,0 +1,48 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Feb 12 12:15:26 2021
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
|
||||
from format_results_mcsm_na import *
|
||||
########################################################################
|
||||
# variables
|
||||
|
||||
# TODO: add cmd line args
|
||||
|
||||
#gene = 'gid'
|
||||
drug = 'streptomycin'
|
||||
datadir = homedir + '/git/Data'
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
outdir_na = outdir + '/mcsm_na_results/'
|
||||
|
||||
# Input file
|
||||
infile_mcsm_na = outdir_na + 'gid_output_combined_clean.tsv'
|
||||
|
||||
# Formatted output file
|
||||
outfile_mcsm_na_f = outdir_na + 'gid_complex_mcsm_na_norm.csv'
|
||||
|
||||
#==========================
|
||||
# CALL: format_results_mcsm_na()
|
||||
# Data: gid+streptomycin
|
||||
#==========================
|
||||
print('Formatting results for:', infile_mcsm_na)
|
||||
mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na)
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
mcsm_na_df_f.to_csv(outfile_mcsm_na_f, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile:', outfile_mcsm_na_f
|
||||
, '\nExpected no. of rows:', len(mcsm_na_df_f)
|
||||
, '\nExpected no. of cols:', len(mcsm_na_df_f.columns)
|
||||
, '\n=============================================================')
|
||||
|
||||
#%%#####################################################################
|
Loading…
Add table
Add a link
Reference in a new issue