bring in embb stuff which was in the wrong branch
This commit is contained in:
parent
0e44958585
commit
3368e949e8
6 changed files with 816 additions and 98 deletions
159
mcsm_ppi2/format_results_mcsm_ppi2.py
Executable file
159
mcsm_ppi2/format_results_mcsm_ppi2.py
Executable file
|
@ -0,0 +1,159 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Aug 19 14:33:51 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os,sys
|
||||
homedir = os.path.expanduser('~')
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
|
||||
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
|
||||
from reference_dict import up_3letter_aa_dict
|
||||
from reference_dict import oneletter_aa_dict
|
||||
|
||||
#%%#####################################################################
|
||||
|
||||
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
||||
"""
|
||||
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all muts
|
||||
which is the result of combining all mcsm_ppi2 batch results, and using
|
||||
bash scripts to combine all the batch results into one file.
|
||||
Formatting df to a pandas df and output as csv.
|
||||
@type string
|
||||
|
||||
@return (not true) formatted csv for mcsm_ppi2 output
|
||||
@type pandas df
|
||||
|
||||
"""
|
||||
#############
|
||||
# Read file
|
||||
#############
|
||||
mcsm_ppi2_data_raw = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')
|
||||
|
||||
# strip white space from both ends in all columns
|
||||
mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||
|
||||
dforig_shape = mcsm_ppi2_data.shape
|
||||
print('dimensions of input file:', dforig_shape)
|
||||
|
||||
#############
|
||||
# Map 3 letter
|
||||
# code to one
|
||||
#############
|
||||
# initialise a sub dict that is lookup dict for
|
||||
# 3-LETTER aa code to 1-LETTER aa code
|
||||
lookup_dict = dict()
|
||||
for k, v in up_3letter_aa_dict.items():
|
||||
lookup_dict[k] = v['one_letter_code']
|
||||
wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
|
||||
mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)
|
||||
mut = mcsm_ppi2_data['mutant'].squeeze()
|
||||
mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)
|
||||
|
||||
# #############
|
||||
# # CHECK
|
||||
# # Map 1 letter
|
||||
# # code to 3Upper
|
||||
# #############
|
||||
# # initialise a sub dict that is lookup dict for
|
||||
# # 3-LETTER aa code to 1-LETTER aa code
|
||||
# lookup_dict = dict()
|
||||
# for k, v in oneletter_aa_dict.items():
|
||||
# lookup_dict[k] = v['three_letter_code_upper']
|
||||
# wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
|
||||
# mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)
|
||||
# mut = mcsm_ppi2_data['m_type'].squeeze()
|
||||
# mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)
|
||||
|
||||
# # check
|
||||
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
|
||||
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
|
||||
#%%============================================================================
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase and consistent colnames
|
||||
mcsm_ppi2_data.columns
|
||||
print('Assigning meaningful colnames'
|
||||
, '\n=======================================================')
|
||||
|
||||
my_colnames_dict = {'chain': 'chain'
|
||||
, 'wild-type': 'wt_upper'
|
||||
, 'res-number': 'position'
|
||||
, 'mutant': 'mut_upper'
|
||||
, 'distance-to-interface': 'interface_dist'
|
||||
, 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
|
||||
, 'affinity': 'mcsm_ppi2_outcome'
|
||||
, 'w_type': 'wild_type' # one letter amino acid code
|
||||
, 'm_type': 'mutant_type' # one letter amino acid code
|
||||
}
|
||||
|
||||
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
mcsm_ppi2_data.columns
|
||||
|
||||
#############
|
||||
# create mutationinformation column
|
||||
#############
|
||||
#mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
|
||||
mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']
|
||||
|
||||
#%%=====================================================================
|
||||
#########################
|
||||
# scale mcsm_ppi2 values
|
||||
#########################
|
||||
# Rescale values in mcsm_ppi2_affinity col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min()
|
||||
mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max()
|
||||
|
||||
mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
|
||||
print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])
|
||||
|
||||
c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
|
||||
mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')
|
||||
|
||||
c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
|
||||
mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')
|
||||
|
||||
if mcsm_ppi2_pos == mcsm_ppi2_pos2:
|
||||
print('\nPASS: Affinity values scaled correctly')
|
||||
else:
|
||||
print('\nFAIL: Affinity values scaled numbers MISmatch'
|
||||
, '\nExpected number:', mcsm_ppi2_pos
|
||||
, '\nGot:', mcsm_ppi2_pos2
|
||||
, '\n======================================================')
|
||||
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# reorder columns
|
||||
#############
|
||||
mcsm_ppi2_data.columns
|
||||
mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
|
||||
, 'mcsm_ppi2_affinity'
|
||||
, 'mcsm_ppi2_scaled'
|
||||
, 'mcsm_ppi2_outcome'
|
||||
, 'interface_dist'
|
||||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
, 'wt_upper'
|
||||
, 'mut_upper'
|
||||
, 'chain']]
|
||||
return(mcsm_ppi2_dataf)
|
||||
#%%#####################################################################
|
79
mcsm_ppi2/run_format_results_mcsm_ppi2.py
Executable file
79
mcsm_ppi2/run_format_results_mcsm_ppi2.py
Executable file
|
@ -0,0 +1,79 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Feb 12 12:15:26 2021
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import sys, os
|
||||
homedir = os.path.expanduser('~')
|
||||
#sys.path.append(homedir + '/git/LSHTM_analysis/mcsm_ppi2')
|
||||
|
||||
from format_results_mcsm_ppi2 import *
|
||||
########################################################################
|
||||
# TODO: add cmd line args
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
|
||||
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
|
||||
|
||||
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#%%============================================================================
|
||||
# variable assignment: input and output paths & filenames
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
#outdir_ppi2 = args.mkdir_name
|
||||
make_dirs = args.make_dirs
|
||||
|
||||
#=======
|
||||
# dirs
|
||||
#=======
|
||||
if not datadir:
|
||||
datadir = homedir + '/git/Data/'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + drug + '/input/'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
||||
#if not mkdir_name:
|
||||
# outdir_ppi2 = outdir + 'mcsm_ppi2/'
|
||||
|
||||
outdir_ppi2 = outdir + 'mcsm_ppi2/'
|
||||
|
||||
# Input file
|
||||
infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
|
||||
|
||||
# Formatted output file
|
||||
outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
|
||||
|
||||
#==========================
|
||||
# CALL: format_results_mcsm_na()
|
||||
# Data: gid+streptomycin
|
||||
#==========================
|
||||
print('Formatting results for:', infile_mcsm_ppi2)
|
||||
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
mcsm_ppi2_df_f.to_csv(outfile_mcsm_ppi2_f, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile:', outfile_mcsm_ppi2_f
|
||||
, '\nExpected no. of rows:', len(mcsm_ppi2_df_f)
|
||||
, '\nExpected no. of cols:', len(mcsm_ppi2_df_f.columns)
|
||||
, '\n=============================================================')
|
||||
|
||||
#%%#####################################################################
|
Loading…
Add table
Add a link
Reference in a new issue