Compare commits
475 commits
SHA1 | Author | Date | |
---|---|---|---|
727ca1ee76 | |||
6550be3350 | |||
7fd5e2710d | |||
69b8ba9d08 | |||
45c48485f1 | |||
1ddc5045d5 | |||
34ee2519d3 | |||
246cd636a1 | |||
80f73a3697 | |||
7d6087c82e | |||
9c37dbee31 | |||
1e3670f935 | |||
0c16937b68 | |||
c0c30fd527 | |||
9cb33ed67b | |||
067fc85163 | |||
9b1d1d009d | |||
1ea42097ae | |||
2e9d142184 | |||
2eee69ee80 | |||
938dba7fcc | |||
59a370b45a | |||
3086972480 | |||
a641347f63 | |||
e48f215227 | |||
96277d78f6 | |||
7c0824d0f2 | |||
656639e871 | |||
718f92d7ff | |||
6759649c61 | |||
5eb07cdf86 | |||
4bf4650c88 | |||
a6f0832a42 | |||
b679068a5e | |||
c599d28377 | |||
9f5b983bc0 | |||
89e6b03673 | |||
a9f9cec494 | |||
29d9717abe | |||
20976c31bb | |||
0f983d2889 | |||
b614962e45 | |||
639ccf1cd7 | |||
f1a8fb583a | |||
e75cfd2665 | |||
71d874e350 | |||
5eba273a55 | |||
506e639a7b | |||
762b1a3931 | |||
e822f9f690 | |||
1c27bbff11 | |||
8f4daba98d | |||
7e6affea84 | |||
2aec79af31 | |||
c6d1260f74 | |||
13c61e7813 | |||
ac383165ec | |||
04a7cf15dc | |||
e10ab6a7c6 | |||
064182d784 | |||
920007cc83 | |||
8a301e8bb1 | |||
9534fc57d4 | |||
f79aea254e | |||
f6a2e029cb | |||
86ed1805fc | |||
ddb1a7a7aa | |||
57e4d8cd1e | |||
81ab3fe5ba | |||
ca1a0e10ca | |||
687adf0ec7 | |||
8fa9faa17d | |||
f88e2665e9 | |||
7686aa39b4 | |||
931f8ec2f9 | |||
b6df47a0cd | |||
acda9f13e5 | |||
e78707067c | |||
a2431b59e5 | |||
f6259aa517 | |||
0c3645705d | |||
dccdfe9742 | |||
5c018e23be | |||
4bee48f545 | |||
786eaabe1a | |||
225360fb93 | |||
3f58a5c64c | |||
776c4e0279 | |||
d45a9499a2 | |||
6f24fc1fac | |||
ce8abafdfe | |||
b25511a239 | |||
b8d0bc416a | |||
d21605b31f | |||
4f60e93abb | |||
7242b3516b | |||
d52534f676 | |||
18af246c24 | |||
8009c3fe3d | |||
c59e3f178d | |||
bbec97b00c | |||
9062751790 | |||
77efd0b76d | |||
88dad2696f | |||
34c0b808ea | |||
05562399ce | |||
9f03e6a6fd | |||
2995299179 | |||
f9249d7bf2 | |||
d683e971d4 | |||
8dc3a790c0 | |||
69b62e54a5 | |||
cfdd18086a | |||
9a0e98eb24 | |||
2168007f12 | |||
19d89230f5 | |||
a9a4483aee | |||
cd06a83e13 | |||
013bba2503 | |||
b69d9d729a | |||
7a74fecbda | |||
322979406c | |||
1f72001689 | |||
c99f1cac92 | |||
b2397ea99d | |||
9c221e6786 | |||
7f75b92553 | |||
56f5479c0b | |||
80f7e039ab | |||
4e19961283 | |||
7116b45bf8 | |||
28521104f8 | |||
1d8e6f0d75 | |||
2e047fd548 | |||
5d6ddb7639 | |||
cfe9028a9c | |||
2eab17cb9e | |||
d159a81cfb | |||
fad1526ce5 | |||
0fd3e75ab0 | |||
600f829972 | |||
d139342074 | |||
491b317752 | |||
98287b3c20 | |||
ab7bed9f4b | |||
56ca9db40d | |||
5e735af323 | |||
0c95b3a512 | |||
bcf4467c44 | |||
64018cce4c | |||
6b6921d45f | |||
534a6754cd | |||
4163ede798 | |||
8302d01867 | |||
725e9b53ca | |||
56150ae3c8 | |||
ca68996264 | |||
86670bbac3 | |||
9df3913a84 | |||
99b77434b5 | |||
fa25a30dcf | |||
1f8cfc2403 | |||
7a9b16255a | |||
08ad16adbb | |||
fc4313045f | |||
20bba2ad70 | |||
802522d1c6 | |||
ac5b86a9cd | |||
2ac4ea8f5c | |||
ccdd6029be | |||
f9fd74812a | |||
b0b9e91af7 | |||
b2284f7216 | |||
1f9ea3f789 | |||
59911687c8 | |||
2f1f02e1de | |||
667804ad83 | |||
7f5ca7f5a4 | |||
69f3629cc0 | |||
be50636b15 | |||
4285bbd59f | |||
18b6407539 | |||
9784cba232 | |||
e60b4c5492 | |||
9d2d6cfd84 | |||
a549e52825 | |||
5f441d09d9 | |||
f240c969ec | |||
07104a8c8e | |||
74c4ef16ae | |||
4c345ea9f4 | |||
9597997741 | |||
8a6c7968f5 | |||
a77b472dfa | |||
d2093e7a4c | |||
81796df71a | |||
c58fa8cd4d | |||
48050752db | |||
a3aab4556a | |||
6d08b646fc | |||
5579e9527b | |||
f7280ceada | |||
807876d919 | |||
baedea8c5b | |||
0eca5cf859 | |||
ac3c8a8086 | |||
5ceea2e7b7 | |||
63fa0c596a | |||
7239ab220b | |||
2297617af2 | |||
be8fa7e639 | |||
7e8d5c869e | |||
edabe0d776 | |||
771995d1ab | |||
093ae0d832 | |||
369c906a33 | |||
24b1cc2440 | |||
5e1c920a0c | |||
b8575c6e69 | |||
40e4ddd70a | |||
8ddca4a8b1 | |||
883207bc4b | |||
ea5d5bda44 | |||
f0ee1ff6c9 | |||
1b5280145b | |||
fb0646373b | |||
5f335a5051 | |||
63e04ae600 | |||
375cdc2068 | |||
a5b03e53e8 | |||
351e472e73 | |||
b36bfc9e9d | |||
25f2f9e4a2 | |||
ba02107e23 | |||
0f6bf3875d | |||
83deb64e1c | |||
445f3e2047 | |||
44d1f64e88 | |||
645827570f | |||
ee69445f11 | |||
09e20cf7b3 | |||
3612ef0f2d | |||
a5fdf01d25 | |||
e1da853cf1 | |||
968b57105f | |||
431e606448 | |||
fadd61bf57 | |||
7e4be21575 | |||
8d9ede186c | |||
ecbc7541e9 | |||
1262df40c9 | |||
078644c322 | |||
c124f49041 | |||
26d0d7f42d | |||
c1041ad273 | |||
e690f5beba | |||
c4225cec4f | |||
d4e75d5f64 | |||
8be1418a32 | |||
6934faca10 | |||
5102bbea1b | |||
f415b0b239 | |||
cf732a3bcc | |||
65841e4f5b | |||
68050a93b4 | |||
fdecc944fc | |||
d43ecfa1dc | |||
1708194912 | |||
fc47c58f91 | |||
9bee97052e | |||
f3f86d6651 | |||
2c2c2c1a60 | |||
f85b1bd902 | |||
e570454cf2 | |||
5025e47983 | |||
f424f4e2d6 | |||
080cd6375d | |||
19a984f228 | |||
31b98fb3d3 | |||
774b34ef00 | |||
09e4f7bfbd | |||
b7c7ffc018 | |||
46b43cf261 | |||
eb5491aad9 | |||
42986bb119 | |||
fe49a45447 | |||
5d9561f88a | |||
648be02665 | |||
b4affa0c94 | |||
2ef767f046 | |||
db87f98d32 | |||
7460c7c97f | |||
dd1158a66c | |||
645868ea27 | |||
ddefcd7841 | |||
bba3487829 | |||
3f8d6695a4 | |||
0220960975 | |||
89e881b5d4 | |||
0e3f9e584b | |||
482eeadb9a | |||
ed739aeb71 | |||
b754f26f9b | |||
73877942f4 | |||
75273cebbf | |||
54f9fd073b | |||
d76345c3de | |||
f468554427 | |||
a448d9276b | |||
d78626048c | |||
acd0b8355b | |||
841d18d10b | |||
48773a19ef | |||
f8f33abad8 | |||
2d8cb01cb7 | |||
dcd9a985ec | |||
13203e6fe0 | |||
61e41f1697 | |||
efe0178f4e | |||
7d1ecbb660 | |||
5e1b39cea0 | |||
1f44f8ec0a | |||
1e785a08a1 | |||
3cb33df009 | |||
55f03bc343 | |||
e41fb78e37 | |||
e4270b67c8 | |||
2bc5be20b9 | |||
7d36e0e36b | |||
46b1505fdf | |||
83383b4493 | |||
9e8469abe3 | |||
57a966c7c4 | |||
f9500d5324 | |||
5677175423 | |||
c80faef0bf | |||
aaf3f5e084 | |||
d3d82623d2 | |||
e4a7deae7b | |||
0379d3e241 | |||
91348aaae2 | |||
f8e345f5bc | |||
6402990154 | |||
01fbc2a87b | |||
0e71b23759 | |||
1fa0dc6ad4 | |||
c958cc1081 | |||
a4670b9944 | |||
a7f21cfb14 | |||
943513a338 | |||
5addb85851 | |||
a220288c5f | |||
262bd79204 | |||
90cbb49560 | |||
f758c01159 | |||
4d686e2933 | |||
af65a86ff9 | |||
3c6122a296 | |||
b82cc11dbe | |||
626ed3a57b | |||
a298071309 | |||
003b22ce3f | |||
a1cc7ee33d | |||
1e43ca8136 | |||
18998092f4 | |||
8f272bdc17 | |||
ada205962b | |||
0c3c6fd143 | |||
3497d1ef54 | |||
fa2bcb5f05 | |||
76ecb65a1a | |||
6c2c7e0a90 | |||
b33419c939 | |||
010ef133dd | |||
fdba990b80 | |||
8d1daabff4 | |||
e21635fe02 | |||
e2f319ba42 | |||
f6fc6e47ab | |||
3fe1d35df5 | |||
ca36e004c1 | |||
15dea0cbf6 | |||
548d9a5192 | |||
f7e371a585 | |||
01a7cbf26e | |||
65db4a090e | |||
3425d8fa2b | |||
7f66d5d19e | |||
b28d866237 | |||
a405aa17c3 | |||
e94da61871 | |||
e50466da39 | |||
7aafa72e10 | |||
45889990e7 | |||
7d2241ad81 | |||
398eccd246 | |||
f5241048b4 | |||
0550cfe0e2 | |||
7cee9b21e2 | |||
7a8bbc6595 | |||
fe3d431a3d | |||
c025a22343 | |||
30aa64fd2b | |||
49a38dd1ae | |||
569b7c6c7f | |||
811027e34d | |||
02488ea23e | |||
6afe202931 | |||
44577b4a0c | |||
24c7ade7c4 | |||
f690c75ca0 | |||
d161fcd0f3 | |||
b0e56328ef | |||
cc9cdbcad5 | |||
b5aa524914 | |||
34a2057d29 | |||
b1e4dcd376 | |||
e7f2a3aada | |||
ab541aa3de | |||
d1da203df0 | |||
82e96fcdba | |||
afd6ca8881 | |||
69e2567ffc | |||
c0bac6fd7b | |||
5bab99c15f | |||
0b7a938fbd | |||
4c2fa2b600 | |||
87a847109a | |||
de1822f491 | |||
96ebb85069 | |||
c184841951 | |||
dd91692673 | |||
22a0d38563 | |||
d42e6fbdb3 | |||
b4dbad7e54 | |||
b331227023 | |||
eb021349fe | |||
8df0b7d920 | |||
77cc5bf42c | |||
95e8205189 | |||
f9837b474c | |||
e9a95e9d3a | |||
ed8fc4d488 | |||
d7ef8ef51e | |||
b56c0b8b68 | |||
4ef68bdc1b | |||
b97712edb0 | |||
9e4b3c5dce | |||
0653a8c1e3 | |||
d12ef0ef00 | |||
d9519b6262 | |||
134dea609d | |||
8c7c389562 | |||
632b78320a | |||
c15d1a8a95 | |||
3390f80168 | |||
1d80186ab9 | |||
15daa6dfc1 | |||
ac34de9e79 | |||
f1584bddb1 | |||
6cbef0c3d7 | |||
1edfe3f8f8 | |||
8d2456f7f2 | |||
15391a5700 | |||
c3c50f65f2 | |||
4d2d03f634 | |||
bcf822d6e4 | |||
4f06e42ee4 | |||
4bcb81e9be | |||
be213cb7e9 | |||
cae9c550a4 | |||
2df031c02a | |||
c1ea688c5c | |||
ec37e3c1f6 | |||
50ade050c2 | |||
200 changed files with 506005 additions and 7016 deletions
.gitignore (vendored): 17 changes
@@ -1,6 +1,23 @@
*.xls
*.xlsx
*.ods
*.tar.gz
.Rhistory
*.pyc
__pycache__
*/__pycache__
manual_*
*temp*
mcsm_analysis_fixme
meta_data_analysis
del
example*
scratch
historic
test
plotting_test
*old*
foldx/test/
TO_DO
.RData
scratch_plots
README.md: 42 changes
@@ -1,35 +1,39 @@
mCSM Analysis
mCSM
=============

This repo does mCSM analysis using Python, bash and R.

Requires an additional 'Data' directory. Batteries not included.
This contains scripts that does the following:
1. mcsm.py: function for submitting mcsm job and extracting results
2. run_mcsm.py: wrapper to call mcsm.py

foldx
=============
This contains scripts that does the following:
1. runFoldx.py: submitting foldx requests and extracting results
2. runfoldx.sh: is wrapped by runFoldx.py

Requires an additional 'Data' directory. Batteries not included:-)

## Assumptions

1. git repos are cloned to `~/git`
2. Requires a `Data/` in `~/git` which has the struc created by `mk_drug_dirs.sh`
2. Requires a data directory with an `input` and `output` subdirs. Can be specified on the CLI with `--datadir`, and optionally can be created with `mk_drug_dirs.sh <DRUG_NAME>`

## LSHTM\_analysis:

subdirs within this repo

```
meta\_data\_analysis/
scripts
*.R
*.py

mcsm\_analysis/
<drug>/
scripts/
*.R
*.py
mcsm/
*.sh
*.py
*.R
plotting/
*.R
plotting/
*.R
mcsm
*.py
foldx
*.py
*.sh

```
More docs here as I write them.

More docs here as I write them.
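For orientation, a minimal sketch of the bare `Data/` skeleton these scripts assume; the actual `mk_drug_dirs.sh` may create more than this, and the drug name below is just an example.

```python
# Minimal sketch of the assumed layout; mk_drug_dirs.sh may create additional subdirs
import os

drug = 'streptomycin'                       # example drug name
datadir = os.path.expanduser('~/git/Data')
for sub in ('input', 'output'):
    os.makedirs(os.path.join(datadir, drug, sub), exist_ok=True)
```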
dynamut/format_results_dynamut.py (new executable file): 162 additions
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut_output(dynamut_output_csv):
    """
    @param dynamut_output_csv: file containing dynamut results for all muts
    which is the result of combining all dynamut_output batch results, and using
    bash scripts to combine all the batch results into one file.
    This is post run_get_results_dynamut.py
    Formatting df to a pandas df and output as csv.
    @type string

    @return (not true) formatted csv for dynamut output
    @type pandas df

    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all columns
    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    #%%============================================================================
    #####################################
    # create binary cols for each param
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet']

    # col test: ddg_dynamut
    #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
    #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)

        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)
        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  #, '\nNo. of stabilising muts: ', c1
                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )

        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    # Rename categ for: dds_encom
    len(dynamut_data[dynamut_data['dds_encom'] >= 0])
    dynamut_data['dds_encom_outcome'] = dynamut_data['dds_encom'].apply(lambda x: 'Increased_flexibility' if x >= 0 else 'Decreased_flexibility')
    dynamut_data['dds_encom_outcome'].value_counts()

    #%%=====================================================================
    ################################
    # scale all ddg param values
    #################################
    # Rescale values in all ddg cols col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive

    outcome_cols = ['ddg_dynamut', 'ddg_encom', 'ddg_mcsm','ddg_sdm', 'ddg_duet', 'dds_encom']

    for cols in outcome_cols:
        #print(cols)
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print( '\n===================='
               , '\nColname:', cols
               , '\n===================='
               , '\nMax: ', col_max
               , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)
        col_scale = lambda x : x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')

        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print( '\n===================='
               , '\nColname:', scaled_colname
               , '\n===================='
               , '\nMax: ', col_scaled_max
               , '\nMin: ', col_scaled_min)

    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_data.columns
    dynamut_data_f = dynamut_data[['mutationinformation'

                                   , 'ddg_dynamut'
                                   , 'ddg_dynamut_scaled'
                                   , 'ddg_dynamut_outcome'

                                   , 'ddg_encom'
                                   , 'ddg_encom_scaled'
                                   , 'ddg_encom_outcome'

                                   , 'ddg_mcsm'
                                   , 'ddg_mcsm_scaled'
                                   , 'ddg_mcsm_outcome'

                                   , 'ddg_sdm'
                                   , 'ddg_sdm_scaled'
                                   , 'ddg_sdm_outcome'

                                   , 'ddg_duet'
                                   , 'ddg_duet_scaled'
                                   , 'ddg_duet_outcome'

                                   , 'dds_encom'
                                   , 'dds_encom_scaled'
                                   , 'dds_encom_outcome']]

    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        sys.exit()

    return(dynamut_data_f)
#%%#####################################################################
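For clarity, a toy illustration of the sign-preserving scaling used in `format_dynamut_output()`: negative values are divided by |min| and non-negative values by max, so both ends land in [-1, 1]. The numbers below are made up.

```python
# Toy illustration of the scaling above; the values here are invented
import pandas as pd

s = pd.Series([-4.0, -1.0, 0.0, 2.0, 8.0])
col_min, col_max = s.min(), s.max()            # -4.0 and 8.0
scaled = s.apply(lambda x: x/abs(col_min) if x < 0 else x/col_max)
print(scaled.tolist())                         # [-1.0, -0.25, 0.0, 0.25, 1.0]
```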
dynamut/format_results_dynamut2.py (new executable file): 137 additions
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def format_dynamut2_output(dynamut_output_csv):
    """
    @param dynamut_output_csv: file containing dynamut2 results for all muts
    which is the result of combining all dynamut2_output batch results, and using
    bash scripts to combine all the batch results into one file.
    Dynamut2ran manually from batches
    Formatting df to a pandas df and output as csv.
    @type string

    @return (not true) formatted csv for dynamut output
    @type pandas df

    """
    #############
    # Read file
    #############
    dynamut_data_raw = pd.read_csv(dynamut_output_csv, sep = ',')

    # strip white space from both ends in all columns
    dynamut_data = dynamut_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    dforig_shape = dynamut_data.shape
    print('dimensions of input file:', dforig_shape)

    #%%============================================================================
    #####################################
    # create binary cols for ddg_dynamut2
    # >=0: Stabilising
    ######################################
    outcome_cols = ['ddg_dynamut2']

    # col test: ddg_dynamut
    #len(dynamut_data[dynamut_data['ddg_dynamut'] >= 0])
    #dynamut_data['ddg_dynamut_outcome'] = dynamut_data['ddg_dynamut'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    #len(dynamut_data[dynamut_data['ddg_dynamut_outcome'] == 'Stabilising'])

    print('\nCreating classification cols for', len(outcome_cols), 'columns'
          , '\nThese are:')

    for cols in outcome_cols:
        print(cols)

        tot_muts = dynamut_data[cols].count()
        print('\nTotal entries:', tot_muts)

        outcome_colname = cols + '_outcome'
        print(cols, ':', outcome_colname)
        c1 = len(dynamut_data[dynamut_data[cols] >= 0])
        dynamut_data[outcome_colname] = dynamut_data[cols].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
        c2 = len(dynamut_data[dynamut_data[outcome_colname] == 'Stabilising'])
        if c1 == c2:
            print('\nPASS: outcome classification column created successfully'
                  , '\nColumn created:', outcome_colname
                  #, '\nNo. of stabilising muts: ', c1
                  #, '\nNo. of DEstabilising muts: ', tot_muts-c1
                  , '\n\nCateg counts:\n', dynamut_data[outcome_colname].value_counts() )

        else:
            print('\nFAIL: outcome classification numbers MISmatch'
                  , '\nexpected length:', c1
                  , '\nGot:', c2)

    #%%=====================================================================
    ################################
    # scale all ddg_dynamut2 values
    #################################
    # Rescale values in all ddg_dynamut2 col col b/w -1 and 1 so negative numbers
    # stay neg and pos numbers stay positive

    outcome_cols = ['ddg_dynamut2']

    for cols in outcome_cols:
        #print(cols)
        col_max = dynamut_data[cols].max()
        col_min = dynamut_data[cols].min()
        print( '\n===================='
               , '\nColname:', cols
               , '\n===================='
               , '\nMax: ', col_max
               , '\nMin: ', col_min)

        scaled_colname = cols + '_scaled'
        print('\nCreated scaled colname for', cols, ':', scaled_colname)
        col_scale = lambda x : x/abs(col_min) if x < 0 else (x/col_max if x >= 0 else 'failed')

        dynamut_data[scaled_colname] = dynamut_data[cols].apply(col_scale)

        col_scaled_max = dynamut_data[scaled_colname].max()
        col_scaled_min = dynamut_data[scaled_colname].min()
        print( '\n===================='
               , '\nColname:', scaled_colname
               , '\n===================='
               , '\nMax: ', col_scaled_max
               , '\nMin: ', col_scaled_min)

    #%%=====================================================================
    #############
    # reorder columns
    #############
    dynamut_data.columns
    dynamut_data_f = dynamut_data[['mutationinformation'
                                   , 'chain'
                                   , 'ddg_dynamut2'
                                   , 'ddg_dynamut2_scaled'
                                   , 'ddg_dynamut2_outcome']]

    if len(dynamut_data.columns) == len(dynamut_data_f.columns) and sorted(dynamut_data.columns) == sorted(dynamut_data_f.columns):
        print('\nPASS: outcome_classification, scaling and column reordering completed')
    else:
        print('\nFAIL: Something went wrong...'
              , '\nExpected length: ', len(dynamut_data.columns)
              , '\nGot: ', len(dynamut_data_f.columns))
        sys.exit()

    return(dynamut_data_f)
#%%#####################################################################
dynamut/get_results_dynamut.py (new executable file): 98 additions
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################

def get_results(url_file, host_url, output_dir, outfile_suffix):
    # initilialise empty df
    dynamut_results_out_df = pd.DataFrame()
    with open(url_file, 'r') as f:
        for count, line in enumerate(f):
            line = line.strip()
            print('URL no.', count+1, '\n', line)
            #batch_response = requests.get(line, headers=headers)
            batch_response = requests.get(line)
            batch_soup = BeautifulSoup(batch_response.text, features = 'html.parser')

            # initilialise empty df
            #dynamut_results_df = pd.DataFrame()
            for a in batch_soup.find_all('a', href=True, attrs = {'class':'btn btn-default btn-sm'}):
                print ("Found the URL:", a['href'])
                single_result_url = host_url + a['href']
                snp = re.search(r'([A-Z]+[0-9]+[A-Z]+$)', single_result_url).group(0)
                print(snp)
                print('\nGetting results from:', single_result_url)

                result_response = requests.get(single_result_url)
                if result_response.status_code == 200:
                    print('\nFetching results for SNP:', snp)
                    # extract results using the html parser
                    soup = BeautifulSoup(result_response.text, features = 'html.parser')
                    #web_result_raw = soup.find(id = 'predictions').get_text()
                    ddg_dynamut = soup.find(id = 'ddg_dynamut').get_text()
                    ddg_encom = soup.find(id = 'ddg_encom').get_text()
                    ddg_mcsm = soup.find(id = 'ddg_mcsm').get_text()
                    ddg_sdm = soup.find(id = 'ddg_sdm').get_text()
                    ddg_duet = soup.find(id = 'ddg_duet').get_text()
                    dds_encom = soup.find(id = 'dds_encom').get_text()

                    param_dict = {"mutationinformation" : snp
                                  , "ddg_dynamut" : ddg_dynamut
                                  , "ddg_encom" : ddg_encom
                                  , "ddg_mcsm" : ddg_mcsm
                                  , "ddg_sdm" : ddg_sdm
                                  , "ddg_duet" : ddg_duet
                                  , "dds_encom" : dds_encom
                                  }
                    results_df = pd.DataFrame.from_dict(param_dict, orient = "index").T
                    print('Result DF:', results_df, 'for URL:', line)
                    #dynamut_results_df = dynamut_results_df.append(results_df)#!1 too many!:-)
                    dynamut_results_out_df = dynamut_results_out_df.append(results_df)
                    #print(dynamut_results_out_df)
            #============================
            # Writing results file: csv
            #============================
            dynamut_results_dir = output_dir + 'dynamut_results/'
            if not os.path.exists(dynamut_results_dir):
                print('\nCreating dir: dynamut_results within:', output_dir )
                os.makedirs(dynamut_results_dir)
            print('\nWriting dynamut results df')
            print('\nResults File:'
                  , '\nNo. of rows:', dynamut_results_out_df.shape[0]
                  , '\nNo. of cols:', dynamut_results_out_df.shape[1])
            print(dynamut_results_out_df)
            #dynamut_results_out_df.to_csv('/tmp/test_dynamut.csv', index = False)

            # build out filename
            out_filename = dynamut_results_dir + 'dynamut_output_' + outfile_suffix + '.csv'
            dynamut_results_out_df.to_csv(out_filename, index = False)

            # TODO: add as a cmd option
            # Download .tar.gz file
            prediction_number = re.search(r'([0-9]+$)', line).group(0)
            tgz_url = f"{host_url}/dynamut/results_file/results_" + prediction_number + '.tar.gz'
            tgz_filename = dynamut_results_dir + outfile_suffix + '_results_' + prediction_number + '.tar.gz'
            response_tgz = requests.get(tgz_url, stream = True)
            if response_tgz.status_code == 200:
                print('\nDownloading tar.gz file:', tgz_url
                      , '\n\nSaving file as:', tgz_filename)
                with open(tgz_filename, 'wb') as f:
                    f.write(response_tgz.raw.read())

#%%#####################################################################
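A small illustration of the SNP-id regex used in `get_results()`, run against a made-up result URL; only the trailing mutation id matters to the pattern.

```python
# The URL below is hypothetical; the regex picks out the trailing <WT><POS><MUT> id
import re

example_url = 'http://biosig.unimelb.edu.au/dynamut/results_prediction/123456/A123V'
snp = re.search(r'([A-Z]+[0-9]+[A-Z]+$)', example_url).group(0)
print(snp)  # A123V
```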
dynamut/run_format_results_dynamut.py (new executable file): 101 additions
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 12 12:15:26 2021

@author: tanu
"""
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME
# RE RUN when B07 completes!!!! as norm gets affected!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from format_results_dynamut import *
from format_results_dynamut2 import *
########################################################################
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assmumes homedir + git/Data')
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')

arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')

args = arg_parser.parse_args()
#%%============================================================================
# variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
datadir = args.datadir
indir = args.input_dir
outdir = args.output_dir
#outdir_dynamut2 = args.mkdir_name
make_dirs = args.make_dirs

#=======
# dirs
#=======
if not datadir:
    datadir = homedir + '/git/Data/'

if not indir:
    indir = datadir + drug + '/input/'

if not outdir:
    outdir = datadir + drug + '/output/'

#if not mkdir_name:
outdir_dynamut = outdir + 'dynamut_results/'
outdir_dynamut2 = outdir + 'dynamut_results/dynamut2/'

# Input file
infile_dynamut = outdir_dynamut + gene + '_dynamut_all_output_clean.csv'
infile_dynamut2 = outdir_dynamut2 + gene + '_dynamut2_output_combined_clean.csv'

# Formatted output filename
outfile_dynamut_f = outdir_dynamut2 + gene + '_dynamut_norm.csv'
outfile_dynamut2_f = outdir_dynamut2 + gene + '_dynamut2_norm.csv'
#%%========================================================================

#===============================
# CALL: format_results_dynamut
# DYNAMUT results
# #===============================
# print('Formatting results for:', infile_dynamut)
# dynamut_df_f = format_dynamut_output(infile_dynamut)
# # writing file
# print('Writing formatted dynamut df to csv')
# dynamut_df_f.to_csv(outfile_dynamut_f, index = False)

# print('Finished writing file:'
#       , '\nFile:', outfile_dynamut_f
#       , '\nExpected no. of rows:', len(dynamut_df_f)
#       , '\nExpected no. of cols:', len(dynamut_df_f.columns)
#       , '\n=============================================================')

#===============================
# CALL: format_results_dynamut2
# DYNAMUT2 results
#===============================
print('Formatting results for:', infile_dynamut2)
dynamut2_df_f = format_dynamut2_output(infile_dynamut2) # dynamut2

# writing file
print('Writing formatted dynamut2 df to csv')
dynamut2_df_f.to_csv(outfile_dynamut2_f, index = False)

print('Finished writing file:'
      , '\nFile:', outfile_dynamut2_f
      , '\nExpected no. of rows:', len(dynamut2_df_f)
      , '\nExpected no. of cols:', len(dynamut2_df_f.columns)
      , '\n=============================================================')

#%%#####################################################################
dynamut/run_get_results_dynamut.py (new executable file): 44 additions
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from get_results_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
# Needed if things try to block the 'requests' user agent
#headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}

# TODO: add cmd line args
#gene = ''
#drug = ''
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut_temp = outdir + 'dynamut_results/dynamut_temp/'
#==============================================================================
# batch 7 (previously 1b file): RETRIEVED 17 Aug 16:40
my_url_file = outdir_dynamut_temp + 'dynamut_result_url_gid_b7.txt'
my_suffix = 'gid_b7'
#==============================================================================

#==========================
# CALL: get_results()
# Data: gid+streptomycin
#==========================
# output file saves in dynamut_results/ (created if it doesn't exist) inside outdir
print('Fetching results from url file :', my_url_file, '\nsuffix:', my_suffix)

get_results(url_file = my_url_file
            , host_url = my_host
            , output_dir = outdir
            , outfile_suffix = my_suffix)

########################################################################
dynamut/run_submit_dynamut.py (new executable file): 58 additions
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os
homedir = os.path.expanduser('~')
os.chdir (homedir + '/git/LSHTM_analysis/dynamut')
from submit_dynamut import *
########################################################################
# variables
my_host = 'http://biosig.unimelb.edu.au'
my_prediction_url = f"{my_host}/dynamut/prediction_list"
print(my_prediction_url)

# TODO: add cmd line args
gene = 'gid'
drug = 'streptomycin'
datadir = homedir + '/git/Data/'
indir = datadir + drug + '/input/'
outdir = datadir + drug + '/output/'
outdir_dynamut = outdir + 'dynamut_results/'

my_chain = 'A'
my_email = 'tanushree.tunstall@lshtm.ac.uk'

#my_pdb_file = indir + 'gid_complex.pdb'
my_pdb_file = indir + gene + '_complex.pdb'
#==============================================================================
# Rerunnig batch 7: 07.txt, # RAN: 12 Aug 15:22, 0 bytes file from previous run!
my_mutation_list = outdir + 'snp_batches/50/snp_batch_07.txt'
my_suffix = 'gid_b7'
#==============================================================================

#==========================
# CALL: submit_dynamut()
# Data: gid+streptomycin
#==========================
print('\nSubmitting batch for:'
      , '\nFilename : ' , my_mutation_list
      , '\nbatch    : ' , my_suffix
      , '\ndrug     : ' , drug
      , '\ngene     : ' , gene
      , '\npdb file : ' , my_pdb_file)

submit_dynamut(host_url = my_host
               , pdb_file = my_pdb_file
               , mutation_list = my_mutation_list
               , chain = my_chain
               , email_address = my_email
               , prediction_url = my_prediction_url
               , output_dir = outdir_dynamut
               , outfile_suffix = my_suffix)

#%%#####################################################################
dynamut/split_csv.sh (new executable file): 19 additions
@@ -0,0 +1,19 @@
#!/bin/bash

# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA

# Usage: ~/git/LSHTM_analysis/dynamut/split_csv.sh <input file> <output dir> <chunk size in lines>
# copy your snp file to split into the dynamut dir

INFILE=$1
OUTDIR=$2
CHUNK=$3

mkdir -p ${OUTDIR}/${CHUNK}
cd ${OUTDIR}/${CHUNK}

split ../../${INFILE} -l ${CHUNK} -d snp_batch_

# use case
#~/git/LSHTM_analysis/dynamut/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
#~/git/LSHTM_analysis/dynamut/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
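A rough Python equivalent of what `split_csv.sh` does (one mutation per line, fixed-size batches, two-digit numeric suffixes as produced by `split -d`); the directory and batch-size defaults below simply mirror the use case above rather than anything the script enforces.

```python
# Rough sketch of split_csv.sh behaviour; batch size 50 mirrors the use case above
from pathlib import Path

def split_snps(infile, outdir='snp_batches', chunk=50):
    lines = Path(infile).read_text().splitlines()
    batch_dir = Path(outdir) / str(chunk)
    batch_dir.mkdir(parents=True, exist_ok=True)
    for i in range(0, len(lines), chunk):
        batch_file = batch_dir / f'snp_batch_{i // chunk:02d}'
        batch_file.write_text('\n'.join(lines[i:i + chunk]) + '\n')
```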
dynamut/submit_dynamut.py (new executable file): 89 additions
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 14:33:51 2020

@author: tanu
"""
#%% load packages
import os,sys
import subprocess
import argparse
import requests
import re
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#%%#####################################################################
def submit_dynamut(host_url
                   , pdb_file
                   , mutation_list
                   , chain
                   , email_address
                   , prediction_url
                   , output_dir
                   , outfile_suffix
                   ):
    """
    Makes a POST request for dynamut predictions.

    @param host_url: valid host url for submitting the job
    @type string

    @param pdb_file: valid path to pdb structure
    @type string

    @param mutation_list: list of mutations (1 per line) of the format: {WT}<POS>{Mut}
    @type string

    @param chain: single-letter(caps)
    @type chr

    @param email_address: email address to inform of results
    @type chr

    @param prediction_url: dynamut url for prediction
    @type string

    @param output_dir: output dir
    @type string

    @param outfile_suffix: to append to outfile
    @type string

    @return writes a .txt file containing url for the snps processed with user provided suffix in filename
    @type string
    """

    with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
        files = {"wild": pdb_file
                 , "mutation_list": mutation_list}
        body = {"chain": chain
                , "email": email_address}

        response = requests.post(prediction_url, files = files, data = body)
        print(response.status_code)
        if response.history:
            print('\nPASS: valid submission. Fetching result url')
            url_match = re.search('/dynamut/results_prediction/.+(?=")', response.text)
            url = host_url + url_match.group()
            print('\nURL for snp batch no ', str(outfile_suffix), ':', url)

            #===============
            # writing file: result urls
            #===============
            dynamut_temp_dir = output_dir + 'dynamut_temp/' # creates a temp dir within output_dir
            if not os.path.exists(dynamut_temp_dir):
                print('\nCreating dynamut_temp in output_dir', output_dir )
                os.makedirs(dynamut_temp_dir)

            out_url_file = dynamut_temp_dir + 'dynamut_result_url_' + str(outfile_suffix) + '.txt'
            print('\nWriting output url file:', out_url_file
                  , '\nNow we wait patiently...')

            myfile = open(out_url_file, 'a')
            myfile.write(url)
            myfile.close()
#%%#####################################################################
foldx/cmd_change (new file): 3 additions
@@ -0,0 +1,3 @@
sed -i s/'\/Users\/Charlotte\/Downloads\/foldxMacC11\/' '\/home\/tanu\/git\/LSHTM_analysis\/foldx\/\/'/g *.sh

rm *.txt *.fxout *Repai*pdb
foldx/deprecated_shell_scripts/mutrenamefiles_mac.sh (new executable file): 68 additions
@@ -0,0 +1,68 @@
PDB=$1
n=$2
#cd /home/tanu/git/LSHTM_analysis/foldx/
logger "Running mutrenamefiles_mac"
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,4d Matrix_Distances_${PDB}_Repair_${n}_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
sed -i .bak -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
foldx/deprecated_shell_scripts/mutruncomplex.sh (new executable file): 10 additions
@@ -0,0 +1,10 @@
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
foldx/deprecated_shell_scripts/renamefiles_mac.sh (new executable file): 68 additions
@@ -0,0 +1,68 @@
PDB=$1
logger "Running renamefiles_mac"
#cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
sed -i '.bak' -e 1,8d Dif_${PDB}_Repair.txt
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,4d Matrix_Distances_${PDB}_Repair_PN.txt
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Disulfide_${PDB}_Repair_PN.txt
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Electro_${PDB}_Repair_PN.txt
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Hbonds_${PDB}_Repair_PN.txt
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Partcov_${PDB}_Repair_PN.txt
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,2d AllAtoms_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Distances_${PDB}_Repair_PN.txt
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Electro_${PDB}_Repair_PN.txt
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Partcov_${PDB}_Repair_PN.txt
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
sed -i '.bak' -e 1,5d InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
foldx/deprecated_shell_scripts/repairPDB.sh (new executable file): 9 additions
@@ -0,0 +1,9 @@
INDIR=$1
PDB=$2
OUTDIR=$3

logger "Running repairPDB"

#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}

foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}
336
foldx/deprecated_shell_scripts/runFoldx.py
Executable file
336
foldx/deprecated_shell_scripts/runFoldx.py
Executable file
|
@ -0,0 +1,336 @@
|
|||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import os
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
import re
|
||||
import csv
|
||||
import argparse
|
||||
#https://realpython.com/python-pathlib/
|
||||
|
||||
# FIXME
|
||||
#strong dependency of file and path names
|
||||
#cannot pass file with path. Need to pass them separately
|
||||
#assumptions made for dir struc as standard
|
||||
#datadir + drug + input
|
||||
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
|
||||
os.getcwd()
|
||||
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
|
||||
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
|
||||
|
||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + <drug> + output')
|
||||
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assmes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
|
||||
|
||||
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assmumes a file called <gene>_complex.pdb in input_dir')
|
||||
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
|
||||
|
||||
# FIXME: Doesn't work with 2 chains yet!
|
||||
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
|
||||
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
#gene_match = gene + '_p.'
|
||||
#%%=====================================================================
|
||||
# Command line options
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
process_dir = args.process_dir
|
||||
|
||||
mut_filename = args.mutation_file
|
||||
chainA = args.chain1
|
||||
chainB = args.chain2
|
||||
pdb_filename = args.pdb_file
|
||||
|
||||
# os.path.splitext will fail interestingly with file.pdb.txt.zip
|
||||
#pdb_name = os.path.splitext(pdb_file)[0]
|
||||
# Just the filename, thanks
|
||||
#pdb_name = Path(in_filename_pdb).stem
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
if not datadir:
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
|
||||
if not process_dir:
|
||||
process_dir = datadir + '/' + drug +'/' + 'processing'
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
# FIXME
|
||||
if pdb_filename:
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
else:
|
||||
pdb_filename = gene.lower() + '_complex.pdb'
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
|
||||
infile_pdb = indir + '/' + pdb_filename
|
||||
actual_pdb_filename = Path(infile_pdb).name
|
||||
|
||||
if mut_filename:
|
||||
mutation_file = mut_filename
|
||||
else:
|
||||
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||
|
||||
infile_muts = outdir + '/' + mutation_file
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
out_filename = gene.lower() + '_foldx.csv'
|
||||
outfile_foldx = outdir + '/' + out_filename
|
||||
|
||||
print('Arguments being passed:'
|
||||
, '\nDrug:', args.drug
|
||||
, '\ngene:', args.gene
|
||||
, '\ninput dir:', indir
|
||||
, '\noutput dir:', outdir
|
||||
, '\npdb file:', infile_pdb
|
||||
, '\npdb name:', pdb_name
|
||||
, '\nactual pdb name:', actual_pdb_filename
|
||||
, '\nmutation file:', infile_muts
|
||||
, '\nchain1:', args.chain1
|
||||
, '\noutput file:', outfile_foldx
|
||||
, '\n=============================================================')
|
||||
#=======================================================================
|
||||
|
||||
def getInteractionEnergy(filename):
|
||||
data = pd.read_csv(filename,sep = '\t')
|
||||
return data['Interaction Energy'].loc[0]
|
||||
|
||||
def getInteractions(filename):
|
||||
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
|
||||
contactList = getIndexes(data,1)
|
||||
number = len(contactList)
|
||||
return number
|
||||
|
||||
def formatMuts(mut_file,pdbname):
|
||||
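# Builds FoldX's individual_list file: e.g. an input row 'L4W' with chainA = 'A'
|
||||
# is written as 'LA4W;' (wt residue + chain id + position + mutant residue, terminated with ';')
|
||||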
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
muts = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
muts.append(mut)
|
||||
|
||||
mut_list = []
|
||||
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
|
||||
with open(outfile, 'w') as output:
|
||||
for m in muts:
|
||||
print(m)
|
||||
mut = m[:1] + chainA + m[1:]
|
||||
mut_list.append(mut)
|
||||
mut = mut + ';'
|
||||
print(mut)
|
||||
output.write(mut)
|
||||
output.write('\n')
|
||||
return mut_list
|
||||
|
||||
def getIndexes(data, value):
|
||||
colnames = data.columns.values
|
||||
listOfPos = list()
|
||||
result = data.isin([value])
|
||||
result.columns = colnames
|
||||
seriesdata = result.any()
|
||||
columnNames = list(seriesdata[seriesdata==True].index)
|
||||
for col in columnNames:
|
||||
rows = list(result[col][result[col]==True].index)
|
||||
|
||||
for row in rows:
|
||||
listOfPos.append((row,col))
|
||||
|
||||
return listOfPos
|
||||
|
||||
def loadFiles(df):
|
||||
# load a text file in to np matrix
|
||||
resultList = []
|
||||
f = open(df,'r')
|
||||
for line in f:
|
||||
line = line.rstrip('\n')
|
||||
aVals = line.split('\t')
|
||||
fVals = list(map(np.float32, aVals))
|
||||
resultList.append(fVals)
|
||||
f.close()
|
||||
return np.asarray(resultList, dtype=np.float32)
|
||||
|
||||
#=======================================================================
|
||||
def main():
|
||||
pdbname = pdb_name
|
||||
comp = '' # for complex only
|
||||
mut_filename = infile_muts #pnca_mcsm_snps.csv
|
||||
mutlist = formatMuts(mut_filename, pdbname)
|
||||
|
||||
print(mutlist)
|
||||
nmuts = len(mutlist)
|
||||
print(nmuts)
|
||||
print(mutlist)
|
||||
print('start')
|
||||
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
|
||||
subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
|
||||
|
||||
print('end')
|
||||
output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
|
||||
|
||||
for n in range(1,nmuts+1):
|
||||
print(n)
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
|
||||
|
||||
for n in range(1,nmuts+1):
|
||||
print(n)
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
|
||||
|
||||
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
|
||||
|
||||
if comp=='y':
|
||||
chain1=chainA
|
||||
chain2=chainB
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
|
||||
for n in range(1,nmuts+1):
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
|
||||
|
||||
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
|
||||
'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
|
||||
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
||||
|
||||
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
|
||||
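# Dif_<pdb>_Repair.txt is produced by the BuildModel step; its 'total energy' column gives the ddG for each mutant
|
||||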
dGdata = pd.read_csv(dGdatafile, sep = '\t')
|
||||
|
||||
ddG=[]
|
||||
print('ddG')
|
||||
print(len(dGdata))
|
||||
for i in range(0,len(dGdata)):
|
||||
ddG.append(dGdata['total energy'].loc[i])
|
||||
|
||||
|
||||
nint = len(interactions)
|
||||
wt_int = []
|
||||
|
||||
for i in interactions:
|
||||
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
|
||||
wt_int.append(getInteractions(filename))
|
||||
print('wt')
|
||||
print(wt_int)
|
||||
|
||||
ntotal = nint+1
|
||||
print(ntotal)
|
||||
print(nmuts)
|
||||
data = np.empty((ntotal,nmuts))
|
||||
data[0] = ddG
|
||||
print(data)
|
||||
for i in range(0,len(interactions)):
|
||||
d=[]
|
||||
p=0
|
||||
for n in range(1, nmuts+1):
|
||||
print(i)
|
||||
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
|
||||
mut = getInteractions(filename)
|
||||
diff = wt_int[i] - mut
|
||||
print(diff)
|
||||
print(wt_int[i])
|
||||
print(mut)
|
||||
d.append(diff)
|
||||
print(d)
|
||||
data[i+1] = d
|
||||
|
||||
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
|
||||
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
||||
|
||||
print(interactions)
|
||||
|
||||
IE = []
|
||||
if comp=='y':
|
||||
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
||||
wtE = getInteractionEnergy(wtfilename)
|
||||
print(wtE)
|
||||
for n in range(1,nmuts+1):
|
||||
print(n)
|
||||
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
|
||||
mutE = getInteractionEnergy(filename)
|
||||
print(mutE)
|
||||
diff = wtE - mutE
|
||||
print(diff)
|
||||
IE.append(diff)
|
||||
print(IE)
|
||||
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
|
||||
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
|
||||
IEresults.to_csv(IEfilename)
|
||||
print(len(IE))
|
||||
data = np.append(data,[IE], axis = 0)
|
||||
print(data)
|
||||
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
|
||||
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
|
||||
|
||||
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
|
||||
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
mutlist = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
mutlist.append(mut)
|
||||
print(mutlist)
|
||||
print(len(mutlist))
|
||||
print(data)
|
||||
results = pd.DataFrame(data, columns = mutlist, index = interactions)
|
||||
results.append(ddG)
|
||||
#print(results.head())
|
||||
|
||||
# my style formatted results
|
||||
results2 = results.T # transpose df
|
||||
results2.index.name = 'mutationinformation' # assign name to index
|
||||
results2 = results2.reset_index() # turn it into a columns
|
||||
|
||||
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
|
||||
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
|
||||
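# e.g. an index entry 'LA4W;' becomes 'L 4W' (chain id and ';' dropped), then 'L4W' after the space is removed
|
||||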
|
||||
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
|
||||
|
||||
# lower case columns
|
||||
results2.columns = results2.columns.str.lower()
|
||||
|
||||
print('Writing file in the format below:\n'
|
||||
, results2.head()
|
||||
, '\nNo. of rows:', len(results2)
|
||||
, '\nNo. of cols:', len(results2.columns))
|
||||
|
||||
outputfilename = outfile_foldx
|
||||
#outputfilename = 'foldx_results_' + pdbname + '.csv'
|
||||
#results.to_csv(outputfilename)
|
||||
results2.to_csv(outputfilename, index = False)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
7
foldx/deprecated_shell_scripts/runPrintNetworks.sh
Executable file
|
@ -0,0 +1,7 @@
|
|||
PDB=$1
|
||||
n=$2
|
||||
OUTDIR=$3
|
||||
logger "Running runPrintNetworks"
|
||||
cd ${OUTDIR}
|
||||
|
||||
foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
|
9
foldx/deprecated_shell_scripts/runcomplex.sh
Executable file
|
@ -0,0 +1,9 @@
|
|||
PDB=$1
|
||||
A=$2
|
||||
B=$3
|
||||
OUTDIR=$4
|
||||
cd ${OUTDIR}
|
||||
logger "Running runcomplex"
|
||||
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
|
||||
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
|
||||
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
|
9
foldx/deprecated_shell_scripts/runfoldx.sh
Executable file
|
@ -0,0 +1,9 @@
|
|||
PDB=$1
|
||||
OUTDIR=$2
|
||||
cd ${OUTDIR}
|
||||
pwd
|
||||
ls
|
||||
logger "Running runfoldx"
|
||||
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
|
||||
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
|
||||
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
|
63
foldx/mutrenamefiles.sh
Executable file
|
@ -0,0 +1,63 @@
|
|||
PDB=$1
|
||||
n=$2
|
||||
OUTDIR=$3
|
||||
cd ${OUTDIR}
|
||||
|
||||
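# The Hbonds/Volumetric/Electro/Disulfide/Partcov/VdWClashes fxout files each stack four blocks (RR, MM, SM, SS); the sed ranges below split them into separate .txt files
|
||||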
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
|
64
foldx/renamefiles.sh
Executable file
|
@ -0,0 +1,64 @@
|
|||
PDB=$1
|
||||
OUTDIR=$2
|
||||
cd ${OUTDIR}
|
||||
|
||||
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
|
||||
sed -i '1,8d' Dif_${PDB}_Repair.txt
|
||||
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
|
||||
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
|
239965
foldx/rotabase.txt
Normal file
File diff suppressed because it is too large
466
foldx/runFoldx.py
Executable file
|
@ -0,0 +1,466 @@
|
|||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
import re
|
||||
import csv
|
||||
import argparse
|
||||
import shutil
|
||||
import time
|
||||
#https://realpython.com/python-pathlib/
|
||||
|
||||
# FIXME
|
||||
#strong dependency of file and path names
|
||||
#cannot pass file with path. Need to pass them separately
|
||||
#assumptions made for dir struc as standard
|
||||
#datadir + drug + input
|
||||
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
|
||||
#os.getcwd()
|
||||
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
|
||||
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
|
||||
|
||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
|
||||
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
|
||||
|
||||
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
|
||||
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
|
||||
|
||||
# FIXME: Doesn't work with 2 chains yet!
|
||||
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
|
||||
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
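# Example invocation (illustrative values): python3 runFoldx.py -d pyrazinamide -g pncA
|
||||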
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
#gene_match = gene + '_p.'
|
||||
#%%=====================================================================
|
||||
# Command line options
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
process_dir = args.process_dir
|
||||
|
||||
mut_filename = args.mutation_file
|
||||
chainA = args.chain1
|
||||
chainB = args.chain2
|
||||
pdb_filename = args.pdb_file
|
||||
|
||||
|
||||
# os.path.splitext will fail interestingly with file.pdb.txt.zip
|
||||
#pdb_name = os.path.splitext(pdb_file)[0]
|
||||
# Just the filename, thanks
|
||||
#pdb_name = Path(in_filename_pdb).stem
|
||||
|
||||
|
||||
# Handle the case where neither 'drug'
|
||||
# nor (indir,outdir,process_dir) are defined
|
||||
if not drug:
|
||||
if not indir or not outdir or not process_dir:
|
||||
print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
|
||||
sys.exit()
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
if not datadir:
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
|
||||
if not process_dir:
|
||||
process_dir = datadir + '/' + drug + '/processing'
|
||||
|
||||
# Make all paths absolute in case the user forgot
|
||||
indir = os.path.abspath(indir)
|
||||
process_dir = os.path.abspath(process_dir)
|
||||
outdir = os.path.abspath(outdir)
|
||||
datadir = os.path.abspath(datadir)
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
# FIXME
|
||||
if pdb_filename:
|
||||
pdb_filename = os.path.abspath(pdb_filename)
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
infile_pdb = pdb_filename
|
||||
else:
|
||||
pdb_filename = gene.lower() + '_complex.pdb'
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
infile_pdb = indir + '/' + pdb_filename
|
||||
|
||||
actual_pdb_filename = Path(infile_pdb).name
|
||||
|
||||
if mut_filename:
|
||||
mutation_file = os.path.abspath(mut_filename)
|
||||
infile_muts = mutation_file
|
||||
print('User-provided mutation file in use:', infile_muts)
|
||||
else:
|
||||
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||
infile_muts = outdir + '/' + mutation_file
|
||||
print('WARNING: Assuming default mutation file:', infile_muts)
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
out_filename = gene.lower() + '_foldx.csv'
|
||||
outfile_foldx = outdir + '/' + out_filename
|
||||
|
||||
print('Arguments being passed:'
|
||||
, '\nDrug:', args.drug
|
||||
, '\ngene:', args.gene
|
||||
, '\ninput dir:', indir
|
||||
, '\nprocess dir:', process_dir
|
||||
, '\noutput dir:', outdir
|
||||
, '\npdb file:', infile_pdb
|
||||
, '\npdb name:', pdb_name
|
||||
, '\nactual pdb name:', actual_pdb_filename
|
||||
, '\nmutation file:', infile_muts
|
||||
, '\nchain1:', args.chain1
|
||||
, '\noutput file:', outfile_foldx
|
||||
, '\n=============================================================')
|
||||
|
||||
#### Delay for 10 seconds to check the params ####
|
||||
print('Sleeping for 10 seconds to give you time to cancel')
|
||||
time.sleep(10)
|
||||
#=======================================================================
|
||||
|
||||
def getInteractionEnergy(filename):
|
||||
data = pd.read_csv(filename,sep = '\t')
|
||||
return data['Interaction Energy'].loc[0]
|
||||
|
||||
def getInteractions(filename):
|
||||
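# Counts residue pairs flagged with value 1 in a FoldX PrintNetworks matrix file (positions found via getIndexes)
|
||||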
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
|
||||
contactList = getIndexes(data,1)
|
||||
number = len(contactList)
|
||||
return number
|
||||
|
||||
def formatMuts(mut_file,pdbname):
|
||||
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
muts = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
muts.append(mut)
|
||||
|
||||
mut_list = []
|
||||
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
|
||||
with open(outfile, 'w') as output:
|
||||
for m in muts:
|
||||
print(m)
|
||||
mut = m[:1] + chainA + m[1:]
|
||||
mut_list.append(mut)
|
||||
mut = mut + ';'
|
||||
print(mut)
|
||||
output.write(mut)
|
||||
output.write('\n')
|
||||
return mut_list
|
||||
|
||||
def getIndexes(data, value):
|
||||
colnames = data.columns.values
|
||||
listOfPos = list()
|
||||
result = data.isin([value])
|
||||
result.columns = colnames
|
||||
seriesdata = result.any()
|
||||
columnNames = list(seriesdata[seriesdata==True].index)
|
||||
for col in columnNames:
|
||||
rows = list(result[col][result[col]==True].index)
|
||||
|
||||
for row in rows:
|
||||
listOfPos.append((row,col))
|
||||
|
||||
return listOfPos
|
||||
|
||||
def loadFiles(df):
|
||||
# load a text file in to np matrix
|
||||
resultList = []
|
||||
f = open(df,'r')
|
||||
for line in f:
|
||||
line = line.rstrip('\n')
|
||||
aVals = line.split('\t')
|
||||
fVals = list(map(np.float32, aVals))
|
||||
resultList.append(fVals)
|
||||
f.close()
|
||||
return np.asarray(resultList, dtype=np.float32)
|
||||
|
||||
# TODO: put the subprocess call in a 'def'
|
||||
#def repairPDB():
|
||||
# subprocess.call(['foldx'
|
||||
# , '--command=RepairPDB'
|
||||
# , '--pdb-dir=' + indir
|
||||
# , '--pdb=' + actual_pdb_filename
|
||||
# , '--ionStrength=0.05'#
|
||||
# , '--pH=7'
|
||||
# , '--water=PREDICT'
|
||||
# , '--vdwDesign=1'
|
||||
# , 'outPDB=true'
|
||||
# , '--output-dir=' + process_dir])
|
||||
|
||||
#=======================================================================
|
||||
def main():
|
||||
pdbname = pdb_name
|
||||
comp = '' # for complex only
|
||||
mut_filename = infile_muts #pnca_mcsm_snps.csv
|
||||
mutlist = formatMuts(mut_filename, pdbname)
|
||||
|
||||
print(mutlist)
|
||||
nmuts = len(mutlist)
|
||||
print(nmuts)
|
||||
print(mutlist)
|
||||
|
||||
print('start')
|
||||
# some common parameters for foldX
|
||||
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
|
||||
|
||||
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
|
||||
print('Running foldx RepairPDB for WT')
|
||||
subprocess.call(['foldx'
|
||||
, '--command=RepairPDB'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + os.path.dirname(pdb_filename)
|
||||
, '--pdb=' + actual_pdb_filename
|
||||
, 'outPDB=true'
|
||||
, '--output-dir=' + process_dir])
|
||||
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
|
||||
print('Running foldx BuildModel for WT')
|
||||
subprocess.call(['foldx'
|
||||
, '--command=BuildModel'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--mutant-file="individual_list_' + pdbname +'.txt"'
|
||||
, 'outPDB=true'
|
||||
, '--numberOfRuns=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
print('Running foldx PrintNetworks for WT')
|
||||
subprocess.call(['foldx'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
print('Running foldx SequenceDetail for WT')
|
||||
subprocess.call(['foldx'
|
||||
, '--command=SequenceDetail'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mNETWORK:\033[0m', n)
|
||||
print('Running foldx PrintNetworks for mutation', n)
|
||||
subprocess.call(['foldx'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mMUTATION:\033[0m', n)
|
||||
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
|
||||
#FIXME: bad design and needs to be done in a pythonic way
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
|
||||
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
|
||||
# FIXME: this is bad design and needs to be done in a pythonic way
|
||||
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
|
||||
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
if comp=='y':
|
||||
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
|
||||
chain1=chainA
|
||||
chain2=chainB
|
||||
subprocess.call(['foldx'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
|
||||
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
||||
shutil.copyfile(ac_source, ac_dest)
|
||||
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT:\033[0m', n)
|
||||
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
|
||||
subprocess.call(['foldx'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
|
||||
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
|
||||
shutil.copyfile(ac_mut_source, ac_mut_dest)
|
||||
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
|
||||
print('\n==========================================================')
|
||||
|
||||
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
||||
|
||||
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
|
||||
dGdata = pd.read_csv(dGdatafile, sep = '\t')
|
||||
|
||||
ddG=[]
|
||||
print('ddG')
|
||||
print(len(dGdata))
|
||||
for i in range(0,len(dGdata)):
|
||||
ddG.append(dGdata['total energy'].loc[i])
|
||||
|
||||
|
||||
nint = len(interactions)
|
||||
wt_int = []
|
||||
|
||||
for i in interactions:
|
||||
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
|
||||
wt_int.append(getInteractions(filename))
|
||||
print('wt')
|
||||
print(wt_int)
|
||||
|
||||
ntotal = nint+1
|
||||
print(ntotal)
|
||||
print(nmuts)
|
||||
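# data layout: row 0 holds ddG per mutation; rows 1..nint hold the (wt - mutant) interaction counts; one column per mutation
|
||||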
data = np.empty((ntotal,nmuts))
|
||||
data[0] = ddG
|
||||
print(data)
|
||||
for i in range(0,len(interactions)):
|
||||
d=[]
|
||||
p=0
|
||||
for n in range(1, nmuts+1):
|
||||
print(i)
|
||||
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
|
||||
mut = getInteractions(filename)
|
||||
diff = wt_int[i] - mut
|
||||
print(diff)
|
||||
print(wt_int[i])
|
||||
print(mut)
|
||||
d.append(diff)
|
||||
print(d)
|
||||
data[i+1] = d
|
||||
|
||||
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
||||
|
||||
print(interactions)
|
||||
|
||||
IE = []
|
||||
if comp=='y':
|
||||
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
||||
wtE = getInteractionEnergy(wtfilename)
|
||||
print(wtE)
|
||||
for n in range(1,nmuts+1):
|
||||
print(n)
|
||||
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
|
||||
mutE = getInteractionEnergy(filename)
|
||||
print(mutE)
|
||||
diff = wtE - mutE
|
||||
print(diff)
|
||||
IE.append(diff)
|
||||
print(IE)
|
||||
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
|
||||
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
|
||||
IEresults.to_csv(IEfilename)
|
||||
print(len(IE))
|
||||
data = np.append(data,[IE], axis = 0)
|
||||
print(data)
|
||||
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
|
||||
|
||||
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
|
||||
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
mutlist = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
mutlist.append(mut)
|
||||
print(mutlist)
|
||||
print(len(mutlist))
|
||||
print(data)
|
||||
results = pd.DataFrame(data, columns = mutlist, index = interactions)
|
||||
results.append(ddG)
|
||||
#print(results.head())
|
||||
|
||||
# my style formatted results
|
||||
results2 = results.T # transpose df
|
||||
results2.index.name = 'mutationinformation' # assign name to index
|
||||
results2 = results2.reset_index() # turn it into a columns
|
||||
|
||||
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
|
||||
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
|
||||
|
||||
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
|
||||
|
||||
# lower case columns
|
||||
results2.columns = results2.columns.str.lower()
|
||||
|
||||
print('Writing file in the format below:\n'
|
||||
, results2.head()
|
||||
, '\nNo. of rows:', len(results2)
|
||||
, '\nNo. of cols:', len(results2.columns))
|
||||
|
||||
outputfilename = outfile_foldx
|
||||
#outputfilename = 'foldx_results_' + pdbname + '.csv'
|
||||
#results.to_csv(outputfilename)
|
||||
results2.to_csv(outputfilename, index = False)
|
||||
print ('end')
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
466
foldx/runFoldx5.py
Executable file
|
@ -0,0 +1,466 @@
|
|||
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
import re
|
||||
import csv
|
||||
import argparse
|
||||
import shutil
|
||||
import time
|
||||
#https://realpython.com/python-pathlib/
|
||||
|
||||
# FIXME
|
||||
#strong dependency of file and path names
|
||||
#cannot pass file with path. Need to pass them separately
|
||||
#assumptions made for dir struc as standard
|
||||
#datadir + drug + input
|
||||
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
|
||||
#os.getcwd()
|
||||
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
|
||||
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
|
||||
|
||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
|
||||
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
|
||||
|
||||
arg_parser.add_argument('-P', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
|
||||
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
|
||||
|
||||
# FIXME: Doesn't work with 2 chains yet!
|
||||
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
|
||||
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
#gene_match = gene + '_p.'
|
||||
#%%=====================================================================
|
||||
# Command line options
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
process_dir = args.process_dir
|
||||
|
||||
mut_filename = args.mutation_file
|
||||
chainA = args.chain1
|
||||
chainB = args.chain2
|
||||
pdb_filename = args.pdb_file
|
||||
|
||||
|
||||
# os.path.splitext will fail interestingly with file.pdb.txt.zip
|
||||
#pdb_name = os.path.splitext(pdb_file)[0]
|
||||
# Just the filename, thanks
|
||||
#pdb_name = Path(in_filename_pdb).stem
|
||||
|
||||
|
||||
# Handle the case where neither 'drug'
|
||||
# nor (indir,outdir,process_dir) are defined
|
||||
if not drug:
|
||||
if not indir or not outdir or not process_dir:
|
||||
print('ERROR: if "drug" is not specified, you must specify Input, Output, and Process directories')
|
||||
sys.exit()
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
if not datadir:
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
|
||||
if not process_dir:
|
||||
process_dir = datadir + '/' + drug + '/processing'
|
||||
|
||||
# Make all paths absolute in case the user forgot
|
||||
indir = os.path.abspath(indir)
|
||||
process_dir = os.path.abspath(process_dir)
|
||||
outdir = os.path.abspath(outdir)
|
||||
datadir = os.path.abspath(datadir)
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
# FIXME
|
||||
if pdb_filename:
|
||||
pdb_filename = os.path.abspath(pdb_filename)
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
infile_pdb = pdb_filename
|
||||
else:
|
||||
pdb_filename = gene.lower() + '_complex.pdb'
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
infile_pdb = indir + '/' + pdb_filename
|
||||
|
||||
actual_pdb_filename = Path(infile_pdb).name
|
||||
|
||||
if mut_filename:
|
||||
mutation_file = os.path.abspath(mut_filename)
|
||||
infile_muts = mutation_file
|
||||
print('User-provided mutation file in use:', infile_muts)
|
||||
else:
|
||||
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||
infile_muts = outdir + '/' + mutation_file
|
||||
print('WARNING: Assuming default mutation file:', infile_muts)
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
out_filename = gene.lower() + '_foldx.csv'
|
||||
outfile_foldx = outdir + '/' + out_filename
|
||||
|
||||
print('Arguments being passed:'
|
||||
, '\nDrug:', args.drug
|
||||
, '\ngene:', args.gene
|
||||
, '\ninput dir:', indir
|
||||
, '\nprocess dir:', process_dir
|
||||
, '\noutput dir:', outdir
|
||||
, '\npdb file:', infile_pdb
|
||||
, '\npdb name:', pdb_name
|
||||
, '\nactual pdb name:', actual_pdb_filename
|
||||
, '\nmutation file:', infile_muts
|
||||
, '\nchain1:', args.chain1
|
||||
, '\noutput file:', outfile_foldx
|
||||
, '\n=============================================================')
|
||||
|
||||
#### Delay for 10 seconds to check the params ####
|
||||
print('Sleeping for 10 seconds to give you time to cancel')
|
||||
time.sleep(10)
|
||||
#=======================================================================
|
||||
|
||||
def getInteractionEnergy(filename):
|
||||
data = pd.read_csv(filename,sep = '\t')
|
||||
return data['Interaction Energy'].loc[0]
|
||||
|
||||
def getInteractions(filename):
|
||||
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
|
||||
contactList = getIndexes(data,1)
|
||||
number = len(contactList)
|
||||
return number
|
||||
|
||||
def formatMuts(mut_file,pdbname):
|
||||
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
muts = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
muts.append(mut)
|
||||
|
||||
mut_list = []
|
||||
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
|
||||
with open(outfile, 'w') as output:
|
||||
for m in muts:
|
||||
print(m)
|
||||
mut = m[:1] + chainA + m[1:]
|
||||
mut_list.append(mut)
|
||||
mut = mut + ';'
|
||||
print(mut)
|
||||
output.write(mut)
|
||||
output.write('\n')
|
||||
return mut_list
|
||||
|
||||
def getIndexes(data, value):
|
||||
colnames = data.columns.values
|
||||
listOfPos = list()
|
||||
result = data.isin([value])
|
||||
result.columns = colnames
|
||||
seriesdata = result.any()
|
||||
columnNames = list(seriesdata[seriesdata==True].index)
|
||||
for col in columnNames:
|
||||
rows = list(result[col][result[col]==True].index)
|
||||
|
||||
for row in rows:
|
||||
listOfPos.append((row,col))
|
||||
|
||||
return listOfPos
|
||||
|
||||
def loadFiles(df):
|
||||
# load a text file in to np matrix
|
||||
resultList = []
|
||||
f = open(df,'r')
|
||||
for line in f:
|
||||
line = line.rstrip('\n')
|
||||
aVals = line.split('\t')
|
||||
fVals = list(map(np.float32, aVals))
|
||||
resultList.append(fVals)
|
||||
f.close()
|
||||
return np.asarray(resultList, dtype=np.float32)
|
||||
|
||||
# TODO: put the subprocess call in a 'def'
|
||||
#def repairPDB():
|
||||
# subprocess.call(['foldx'
|
||||
# , '--command=RepairPDB'
|
||||
# , '--pdb-dir=' + indir
|
||||
# , '--pdb=' + actual_pdb_filename
|
||||
# , '--ionStrength=0.05'#
|
||||
# , '--pH=7'
|
||||
# , '--water=PREDICT'
|
||||
# , '--vdwDesign=1'
|
||||
# , 'outPDB=true'
|
||||
# , '--output-dir=' + process_dir])
|
||||
|
||||
#=======================================================================
|
||||
def main():
|
||||
pdbname = pdb_name
|
||||
comp = '' # for complex only
|
||||
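# Set comp = 'y' to also run the AnalyseComplex steps below on chain1/chain2 and collect interaction energies
|
||||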
mut_filename = infile_muts #pnca_mcsm_snps.csv
|
||||
mutlist = formatMuts(mut_filename, pdbname)
|
||||
|
||||
print(mutlist)
|
||||
nmuts = len(mutlist)
|
||||
print(nmuts)
|
||||
print(mutlist)
|
||||
|
||||
print('start')
|
||||
# some common parameters for foldX
|
||||
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
|
||||
|
||||
print('\033[95mSTAGE: repair PDB (foldx subprocess) \033[0m')
|
||||
print('Running foldx RepairPDB for WT')
|
||||
subprocess.call(['foldx5'
|
||||
, '--command=RepairPDB'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + os.path.dirname(pdb_filename)
|
||||
, '--pdb=' + actual_pdb_filename
|
||||
, 'outPDB=true'
|
||||
, '--output-dir=' + process_dir])
|
||||
print('\033[95mCOMPLETED STAGE: repair PDB\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Foldx commands BM, PN and SD (foldx subprocess) for WT\033[0m')
|
||||
print('Running foldx BuildModel for WT')
|
||||
subprocess.call(['foldx5'
|
||||
, '--command=BuildModel'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--mutant-file="individual_list_' + pdbname +'.txt"'
|
||||
, 'outPDB=true'
|
||||
, '--numberOfRuns=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
print('Running foldx PrintNetworks for WT')
|
||||
subprocess.call(['foldx5'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
print('Running foldx SequenceDetail for WT')
|
||||
subprocess.call(['foldx5'
|
||||
, '--command=SequenceDetail'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
print('\033[95mCOMPLETED STAGE: Foldx commands BM, PN and SD\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mNETWORK:\033[0m', n)
|
||||
print('Running foldx PrintNetworks for mutation', n)
|
||||
subprocess.call(['foldx5'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
print('\033[95mCOMPLETED STAGE: Print Networks (foldx subprocess) for MT\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mMUTATION:\033[0m', n)
|
||||
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
|
||||
#FIXME: bad design and needs to be done in a pythonic way
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
|
||||
print('\033[95mCOMPLETED STAGE: Rename Mutation Files (shell)\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
print('\033[95mSTAGE: Rename Files (shell) for WT\033[0m')
|
||||
# FIXME: this is bad design and needs to be done in a pythonic way
|
||||
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
|
||||
print('\033[95mCOMPLETED STAGE: Rename Files (shell) for WT\033[0m')
|
||||
print('\n==========================================================')
|
||||
|
||||
|
||||
if comp=='y':
|
||||
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for WT\033[0m')
|
||||
chain1=chainA
|
||||
chain2=chainB
|
||||
subprocess.call(['foldx5'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
|
||||
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
||||
shutil.copyfile(ac_source, ac_dest)
|
||||
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for WT:\033[0m', n)
|
||||
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mSTAGE: Running foldx AnalyseComplex (foldx subprocess) for mutation:\033[0m', n)
|
||||
subprocess.call(['foldx5'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
|
||||
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
|
||||
shutil.copyfile(ac_mut_source, ac_mut_dest)
|
||||
print('\033[95mCOMPLETED STAGE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
|
||||
print('\n==========================================================')
|
||||
|
||||
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
||||
|
||||
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
|
||||
dGdata = pd.read_csv(dGdatafile, sep = '\t')
|
||||
|
||||
ddG=[]
|
||||
print('ddG')
|
||||
print(len(dGdata))
|
||||
for i in range(0,len(dGdata)):
|
||||
ddG.append(dGdata['total energy'].loc[i])
|
||||
|
||||
|
||||
nint = len(interactions)
|
||||
wt_int = []
|
||||
|
||||
for i in interactions:
|
||||
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
|
||||
wt_int.append(getInteractions(filename))
|
||||
print('wt')
|
||||
print(wt_int)
|
||||
|
||||
ntotal = nint+1
|
||||
print(ntotal)
|
||||
print(nmuts)
|
||||
data = np.empty((ntotal,nmuts))
|
||||
data[0] = ddG
|
||||
print(data)
|
||||
for i in range(0,len(interactions)):
d=[]
p=0
for n in range(1, nmuts+1):
print(i)
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
mut = getInteractions(filename)
diff = wt_int[i] - mut
print(diff)
print(wt_int[i])
print(mut)
d.append(diff)
print(d)
data[i+1] = d

interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']

print(interactions)

IE = []
if comp=='y':
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
wtE = getInteractionEnergy(wtfilename)
print(wtE)
for n in range(1,nmuts+1):
print(n)
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
mutE = getInteractionEnergy(filename)
print(mutE)
diff = wtE - mutE
print(diff)
IE.append(diff)
print(IE)
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
IEresults.to_csv(IEfilename)
print(len(IE))
data = np.append(data,[IE], axis = 0)
print(data)
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']

mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
with open(mut_file) as csvfile:
readCSV = csv.reader(csvfile)
mutlist = []
for row in readCSV:
mut = row[0]
mutlist.append(mut)
print(mutlist)
print(len(mutlist))
print(data)
results = pd.DataFrame(data, columns = mutlist, index = interactions)
results.append(ddG)
#print(results.head())

# my style formatted results
results2 = results.T # transpose df
results2.index.name = 'mutationinformation' # assign name to index
results2 = results2.reset_index() # turn it into a column

results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
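# e.g. a FoldX-style id 'SA2C;' (wild type, chain, position, mutant) ends up as 'S2C' after the two steps above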

results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)

# lower case columns
results2.columns = results2.columns.str.lower()

print('Writing file in the format below:\n'
, results2.head()
, '\nNo. of rows:', len(results2)
, '\nNo. of cols:', len(results2.columns))

outputfilename = outfile_foldx
#outputfilename = 'foldx_results_' + pdbname + '.csv'
#results.to_csv(outputfilename)
results2.to_csv(outputfilename, index = False)
print ('end')

if __name__ == '__main__':
main()
10
foldx/test2/deprecated_shell/mutruncomplex.sh
Executable file
@@ -0,0 +1,10 @@
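# usage: mutruncomplex.sh <pdb name (no extension)> <chainA> <chainB> <mutation number n> <output dir>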
PDB=$1
A=$2
B=$3
n=$4
OUTDIR=$5
cd ${OUTDIR}
logger "Running mutruncomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair_${n}.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1
cp ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_${n}_AC.txt
9
foldx/test2/deprecated_shell/repairPDB.sh
Executable file
@@ -0,0 +1,9 @@
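# usage: repairPDB.sh <input dir> <pdb filename> <output dir>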
INDIR=$1
PDB=$2
OUTDIR=$3
cd ${OUTDIR}
logger "Running repairPDB"

#foldx --command=RepairPDB --pdb="${PDB}.pdb" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}

foldx --command=RepairPDB --pdb-dir=${INDIR} --pdb=${PDB} --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 outPDB=true --output-dir=${OUTDIR}
7
foldx/test2/deprecated_shell/runPrintNetworks.sh
Executable file
@@ -0,0 +1,7 @@
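# usage: runPrintNetworks.sh <pdb name (no extension)> <mutation number n> <output dir>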
PDB=$1
n=$2
OUTDIR=$3
logger "Running runPrintNetworks"
cd ${OUTDIR}

foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
9
foldx/test2/deprecated_shell/runcomplex.sh
Executable file
@@ -0,0 +1,9 @@
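# usage: runcomplex.sh <pdb name (no extension)> <chainA> <chainB> <output dir>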
PDB=$1
A=$2
B=$3
OUTDIR=$4
cd ${OUTDIR}
logger "Running runcomplex"
foldx --command=AnalyseComplex --pdb="${PDB}_Repair.pdb" --analyseComplexChains=${A},${B} --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
cp ${OUTDIR}/Summary_${PDB}_Repair_AC.fxout ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
#sed -i .bak -e 1,8d ${OUTDIR}/Summary_${PDB}_Repair_AC.txt
9
foldx/test2/deprecated_shell/runfoldx.sh
Executable file
@@ -0,0 +1,9 @@
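# usage: runfoldx.sh <pdb name (no extension)> <output dir>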
PDB=$1
OUTDIR=$2
cd ${OUTDIR}
pwd
ls -l
logger "Running runfoldx"
foldx --command=BuildModel --pdb="${PDB}_Repair.pdb" --mutant-file="individual_list_${PDB}.txt" --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 --out-pdb=true --numberOfRuns=1 --output-dir=${OUTDIR}
foldx --command=PrintNetworks --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
foldx --command=SequenceDetail --pdb="${PDB}_Repair.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
2
foldx/test2/gid_test_snps.csv
Normal file
@@ -0,0 +1,2 @@
S2C
S2F
63
foldx/test2/mutrenamefiles.sh
Executable file
@@ -0,0 +1,63 @@
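# usage: mutrenamefiles.sh <pdb name (no extension)> <mutation number n> <output dir>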
PDB=$1
|
||||
n=$2
|
||||
OUTDIR=$3
|
||||
cd ${OUTDIR}
|
||||
#cd /home/git/LSHTM_analysis/foldx/test2
|
||||
cp Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout Matrix_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_${n}_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Distances_${PDB}_Repair_${n}_PN.fxout Matrix_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout Matrix_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_${n}_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Electro_${PDB}_Repair_${n}_PN.fxout Matrix_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_${n}_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout Matrix_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_${n}_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout Matrix_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_${n}_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout Matrix_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_${n}_PN.txt
|
||||
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_${n}_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Electro_${PDB}_Repair_${n}_PN.fxout AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Partcov_${PDB}_Repair_${n}_PN.fxout AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
cp AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Distances_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Electro_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_${n}_PN.txt
|
||||
cp InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_${n}_PN.txt
|
64
foldx/test2/renamefiles.sh
Executable file
@@ -0,0 +1,64 @@
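# usage: renamefiles.sh <pdb name (no extension)> <output dir>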
PDB=$1
|
||||
OUTDIR=$2
|
||||
cd ${OUTDIR}
|
||||
#cd /home/git/LSHTM_analysis/foldx/test2
|
||||
cp Dif_${PDB}_Repair.fxout Dif_${PDB}_Repair.txt
|
||||
sed -i '1,8d' Dif_${PDB}_Repair.txt
|
||||
cp Matrix_Hbonds_${PDB}_Repair_PN.fxout Matrix_Hbonds_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Hbonds_${PDB}_Repair_PN.fxout > Matrix_Hbonds_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Distances_${PDB}_Repair_PN.fxout Matrix_Distances_${PDB}_Repair_PN.txt
|
||||
sed -i '1,4d' Matrix_Distances_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Volumetric_${PDB}_Repair_PN.fxout Matrix_Volumetric_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Volumetric_${PDB}_Repair_PN.fxout > Matrix_Volumetric_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Electro_${PDB}_Repair_PN.fxout Matrix_Electro_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Electro_${PDB}_Repair_PN.fxout > Matrix_Electro_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Disulfide_${PDB}_Repair_PN.fxout Matrix_Disulfide_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Disulfide_${PDB}_Repair_PN.fxout > Matrix_Disulfide_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_Partcov_${PDB}_Repair_PN.fxout Matrix_Partcov_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_Partcov_${PDB}_Repair_PN.fxout > Matrix_Partcov_SS_${PDB}_Repair_PN.txt
|
||||
cp Matrix_VdWClashes_${PDB}_Repair_PN.fxout Matrix_VdWClashes_${PDB}_Repair_PN.txt
|
||||
sed -n '5,190p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_RR_${PDB}_Repair_PN.txt
|
||||
sed -n '194,379p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_MM_${PDB}_Repair_PN.txt
|
||||
sed -n '383,568p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SM_${PDB}_Repair_PN.txt
|
||||
sed -n '572,757p' Matrix_VdWClashes_${PDB}_Repair_PN.fxout > Matrix_VdWClashes_SS_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Disulfide_${PDB}_Repair_PN.fxout AllAtoms_Disulfide_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Disulfide_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Electro_${PDB}_Repair_PN.fxout AllAtoms_Electro_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Electro_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Hbonds_${PDB}_Repair_PN.fxout AllAtoms_Hbonds_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Hbonds_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Partcov_${PDB}_Repair_PN.fxout AllAtoms_Partcov_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Partcov_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_VdWClashes_${PDB}_Repair_PN.fxout AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_VdWClashes_${PDB}_Repair_PN.txt
|
||||
cp AllAtoms_Volumetric_${PDB}_Repair_PN.fxout AllAtoms_Volumetric_${PDB}_Repair_PN.txt
|
||||
sed -i '1,2d' AllAtoms_Volumetric_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_VdWClashes_${PDB}_Repair_PN.fxout InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_VdWClashes_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Distances_${PDB}_Repair_PN.fxout InteractingResidues_Distances_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Distances_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Electro_${PDB}_Repair_PN.fxout InteractingResidues_Electro_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Electro_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Hbonds_${PDB}_Repair_PN.fxout InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Hbonds_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Partcov_${PDB}_Repair_PN.fxout InteractingResidues_Partcov_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Partcov_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Volumetric_${PDB}_Repair_PN.fxout InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Volumetric_${PDB}_Repair_PN.txt
|
||||
cp InteractingResidues_Disulfide_${PDB}_Repair_PN.fxout InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
|
||||
sed -i '1,5d' InteractingResidues_Disulfide_${PDB}_Repair_PN.txt
|
239965
foldx/test2/rotabase.txt
Normal file
File diff suppressed because it is too large
1
foldx/test2/runFoldx.py
Symbolic link
@@ -0,0 +1 @@
../runFoldx.py
250
foldx/test2/runFoldx_test.py
Executable file
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import os
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from contextlib import suppress
|
||||
import re
|
||||
import csv
|
||||
|
||||
def getInteractions(filename):
|
||||
data = pd.read_csv(filename, index_col=0, header =0, sep="\t")
|
||||
contactList = getIndexes(data,1)
|
||||
print(contactList)
|
||||
number = len(contactList)
|
||||
return number
|
||||
|
||||
def formatMuts(mut_file,pdbname):
|
||||
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
muts = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
muts.append(mut)
|
||||
|
||||
mut_list = []
|
||||
outfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/individual_list_"+pdbname+".txt"
|
||||
with open(outfile, "w") as output:
|
||||
for m in muts:
|
||||
print(m)
|
||||
mut = m[:1]+'A'+m[1:]
|
||||
mut_list.append(mut)
|
||||
mut = mut + ";"
|
||||
print(mut)
|
||||
output.write(mut)
|
||||
output.write("\n")
|
||||
return mut_list
|
||||
|
||||
def getIndexes(data, value):
|
||||
colnames = data.columns.values
|
||||
listOfPos = list()
|
||||
result = data.isin([value])
|
||||
result.columns=colnames
|
||||
seriesdata = result.any()
|
||||
columnNames = list(seriesdata[seriesdata==True].index)
|
||||
for col in columnNames:
|
||||
rows = list(result[col][result[col]==True].index)
|
||||
|
||||
for row in rows:
|
||||
listOfPos.append((row,col))
|
||||
|
||||
return listOfPos
|
||||
|
||||
def loadFiles(df):
|
||||
# load a text file in to np matrix
|
||||
resultList = []
|
||||
f = open(df,'r')
|
||||
for line in f:
|
||||
line = line.rstrip('\n')
|
||||
aVals = line.split("\t")
|
||||
fVals = list(map(np.float32, aVals))
|
||||
resultList.append(fVals)
|
||||
f.close()
|
||||
return np.asarray(resultList, dtype=np.float32)
|
||||
|
||||
#=======================================================================
|
||||
def main():
|
||||
pdbname = '3pl1'
|
||||
mut_filename = "pnca_muts_sample.csv"
|
||||
mutlist = formatMuts(mut_filename, pdbname)
|
||||
|
||||
print(mutlist)
|
||||
nmuts = len(mutlist)+1
|
||||
print(nmuts)
|
||||
print(mutlist)
|
||||
print("start")
|
||||
|
||||
output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname])
|
||||
print("end")
|
||||
for n in range(1,nmuts):
|
||||
print(n)
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname,str(n)])
|
||||
|
||||
for n in range(1,nmuts):
|
||||
print(n)
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname,str(n)])
|
||||
|
||||
|
||||
out = subprocess.check_output(['bash','renamefiles.sh',pdbname])
|
||||
|
||||
dGdatafile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Dif_"+pdbname+"_Repair.txt"
|
||||
dGdata = pd.read_csv(dGdatafile, sep="\t")
|
||||
print(dGdata)
|
||||
ddG=[]
|
||||
for i in range(0,len(dGdata)):
|
||||
ddG.append(dGdata['total energy'].loc[i])
|
||||
print(ddG)
|
||||
distfile = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nc = getInteractions(distfile)
|
||||
|
||||
elecfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_PN.txt"
|
||||
wt_neRR = getInteractions(elecfileRR)
|
||||
|
||||
elecfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_MM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_neMM = getInteractions(elecfileMM)
|
||||
|
||||
elecfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_neSM = getInteractions(elecfileSM)
|
||||
|
||||
elecfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_SS_"+pdbname+"_Repair_PN.txt"
|
||||
wt_neSS = getInteractions(elecfileSS)
|
||||
|
||||
disufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_PN.txt"
|
||||
wt_ndRR = getInteractions(disufileRR)
|
||||
|
||||
disufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_MM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_ndMM = getInteractions(disufileMM)
|
||||
|
||||
disufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_ndSM = getInteractions(disufileSM)
|
||||
|
||||
disufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_SS_"+pdbname+"_Repair_PN.txt"
|
||||
wt_ndSS = getInteractions(disufileSS)
|
||||
|
||||
hbndfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nhRR = getInteractions(hbndfileRR)
|
||||
|
||||
hbndfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_MM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nhMM = getInteractions(hbndfileMM)
|
||||
|
||||
hbndfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nhSM = getInteractions(hbndfileSM)
|
||||
|
||||
hbndfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_SS_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nhSS = getInteractions(hbndfileSS)
|
||||
|
||||
partfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_PN.txt"
|
||||
wt_npRR = getInteractions(partfileRR)
|
||||
|
||||
partfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_MM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_npMM = getInteractions(partfileMM)
|
||||
|
||||
partfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_npSM = getInteractions(partfileSM)
|
||||
|
||||
partfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_SS_"+pdbname+"_Repair_PN.txt"
|
||||
wt_npSS = getInteractions(partfileSS)
|
||||
|
||||
vdwcfileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvRR = getInteractions(vdwcfileRR)
|
||||
|
||||
vdwcfileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_MM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvMM = getInteractions(vdwcfileMM)
|
||||
|
||||
vdwcfileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvSM = getInteractions(vdwcfileSM)
|
||||
|
||||
vdwcfileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_SS_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvSS = getInteractions(vdwcfileSS)
|
||||
|
||||
volufileRR = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvoRR = getInteractions(volufileRR)
|
||||
|
||||
volufileMM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_MM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvoMM = getInteractions(volufileMM)
|
||||
|
||||
volufileSM = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SM_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvoSM = getInteractions(volufileSM)
|
||||
|
||||
volufileSS = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_SS_"+pdbname+"_Repair_PN.txt"
|
||||
wt_nvoSS = getInteractions(volufileSS)
|
||||
|
||||
dnc = []
|
||||
dneRR = []
|
||||
dneMM = []
|
||||
dneSM = []
|
||||
dneSS = []
|
||||
dndRR = []
|
||||
dndMM = []
|
||||
dndSM = []
|
||||
dndSS = []
|
||||
dnhRR = []
|
||||
dnhMM = []
|
||||
dnhSM = []
|
||||
dnhSS = []
|
||||
dnpRR = []
|
||||
dnpMM = []
|
||||
dnpSM = []
|
||||
dnpSS = []
|
||||
dnvRR = []
|
||||
dnvMM = []
|
||||
dnvSM = []
|
||||
dnvSS = []
|
||||
dnvoRR = []
|
||||
dnvoMM = []
|
||||
dnvoSM = []
|
||||
dnvoSS = []
|
||||
for n in range(1, nmuts):
|
||||
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Distances_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
|
||||
mut_nc = getInteractions(filename)
|
||||
diffc = wt_nc - mut_nc
|
||||
dnc.append(diffc)
|
||||
|
||||
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Electro_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
|
||||
mut_neRR = getInteractions(filename)
|
||||
diffeRR = wt_neRR - mut_neRR
|
||||
dneRR.append(diffeRR)
|
||||
|
||||
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Disulfide_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
|
||||
mut_ndRR = getInteractions(filename)
|
||||
diffdRR = wt_ndRR - mut_ndRR
|
||||
dndRR.append(diffdRR)
|
||||
|
||||
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Hbonds_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
|
||||
mut_nhRR = getInteractions(filename)
|
||||
diffhRR = wt_nhRR - mut_nhRR
|
||||
dnhRR.append(diffhRR)
|
||||
|
||||
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Partcov_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
|
||||
mut_npRR = getInteractions(filename)
|
||||
diffpRR = wt_npRR - mut_npRR
|
||||
dnpRR.append(diffpRR)
|
||||
|
||||
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_VdWClashes_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
|
||||
mut_nvRR = getInteractions(filename)
|
||||
diffvRR = wt_nvRR - mut_nvRR
|
||||
dnvRR.append(diffvRR)
|
||||
|
||||
filename = "/home/tanu/git/LSHTM_analysis/foldx/test2/Matrix_Volumetric_RR_"+pdbname+"_Repair_" + str(n)+"_PN.txt"
|
||||
mut_nvoRR = getInteractions(filename)
|
||||
diffvoRR = wt_nvoRR - mut_nvoRR
|
||||
dnvoRR.append(diffvoRR)
|
||||
print(dnc)
|
||||
print(dneRR)
|
||||
print(dndRR)
|
||||
print(dnhRR)
|
||||
print(dnpRR)
|
||||
print(dnvRR)
|
||||
print(dnvoRR)
|
||||
|
||||
results = pd.DataFrame([(ddG),(dnc),(dneRR),(dndRR),(dnhRR),(dnpRR),(dnvRR),(dnvoRR)], columns=mutlist, index=["ddG","contacts","electro","disulfide","hbonds","partcov","VdWClashes","volumetric"])
|
||||
results.append(ddG)
|
||||
print(results)
|
||||
results2 = results.T # transpose df
|
||||
outputfilename = "foldx_results_"+pdbname+".csv"
|
||||
# results.to_csv(outputfilename)
|
||||
results2.to_csv(outputfilename)
|
||||
if __name__ == "__main__":
|
||||
main()
|
456
foldx/test2/runFoldx_test2.py
Executable file
@@ -0,0 +1,456 @@
#!/usr/bin/env python3
|
||||
import subprocess
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
import re
|
||||
import csv
|
||||
import argparse
|
||||
import shutil
|
||||
#https://realpython.com/python-pathlib/
|
||||
|
||||
# FIXME
|
||||
#strong dependency of file and path names
|
||||
#cannot pass file with path. Need to pass them separately
|
||||
#assumptions made for dir struc as standard
|
||||
#datadir + drug + input
|
||||
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/foldx/')
|
||||
#os.getcwd()
|
||||
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
|
||||
arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
|
||||
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None)
|
||||
|
||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
|
||||
arg_parser.add_argument('-p', '--process_dir', help = 'Temp processing dir for running foldX. By default, it assumes homedir + <drug> + processing. Make sure it is somewhere with LOTS of storage as it writes all output!') #FIXME
|
||||
|
||||
arg_parser.add_argument('-pdb', '--pdb_file', help = 'PDB File to process. By default, it assumes a file called <gene>_complex.pdb in input_dir')
|
||||
arg_parser.add_argument('-m', '--mutation_file', help = 'Mutation list. By default, assumes a file called <gene>_mcsm_snps.csv exists')
|
||||
|
||||
# FIXME: Doesn't work with 2 chains yet!
|
||||
arg_parser.add_argument('-c1', '--chain1', help = 'Chain1 ID', default = 'A') # case sensitive
|
||||
arg_parser.add_argument('-c2', '--chain2', help = 'Chain2 ID', default = 'B') # case sensitive
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
#gene_match = gene + '_p.'
|
||||
#%%=====================================================================
|
||||
# Command line options
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
process_dir = args.process_dir
|
||||
|
||||
mut_filename = args.mutation_file
|
||||
chainA = args.chain1
|
||||
chainB = args.chain2
|
||||
pdb_filename = args.pdb_file
|
||||
|
||||
# os.path.splitext will fail interestingly with file.pdb.txt.zip
|
||||
#pdb_name = os.path.splitext(pdb_file)[0]
|
||||
# Just the filename, thanks
|
||||
#pdb_name = Path(in_filename_pdb).stem
|
||||
|
||||
#==============
|
||||
# directories
|
||||
#==============
|
||||
if not datadir:
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#TODO: perhaps better handled by refactoring code to prevent generating lots of output files!
|
||||
#if not process_dir:
|
||||
# process_dir = datadir + '/' + drug + '/processing'
|
||||
|
||||
# Make all paths absolute in case the user forgot
|
||||
indir = os.path.abspath(indir)
|
||||
process_dir = os.path.abspath(process_dir)
|
||||
outdir = os.path.abspath(outdir)
|
||||
datadir = os.path.abspath(datadir)
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
# FIXME
|
||||
if pdb_filename:
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
else:
|
||||
pdb_filename = gene.lower() + '_complex.pdb'
|
||||
pdb_name = Path(pdb_filename).stem
|
||||
|
||||
infile_pdb = indir + '/' + pdb_filename
|
||||
actual_pdb_filename = Path(infile_pdb).name
|
||||
#actual_pdb_filename = os.path.abspath(infile_pdb)
|
||||
|
||||
if mut_filename:
|
||||
mutation_file = os.path.abspath(mut_filename)
|
||||
infile_muts = mutation_file
|
||||
print('User-provided mutation file in use:', infile_muts)
|
||||
else:
|
||||
mutation_file = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||
infile_muts = outdir + '/' + mutation_file
|
||||
print('WARNING: Assuming default mutation file:', infile_muts)
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
out_filename = gene.lower() + '_foldx.csv'
|
||||
outfile_foldx = outdir + '/' + out_filename
|
||||
|
||||
print('Arguments being passed:'
|
||||
, '\nDrug:', args.drug
|
||||
, '\ngene:', args.gene
|
||||
, '\ninput dir:', indir
|
||||
, '\nprocess dir:', process_dir
|
||||
, '\noutput dir:', outdir
|
||||
, '\npdb file:', infile_pdb
|
||||
, '\npdb name:', pdb_name
|
||||
, '\nactual pdb name:', actual_pdb_filename
|
||||
, '\nmutation file:', infile_muts
|
||||
, '\nchain1:', args.chain1
|
||||
, '\noutput file:', outfile_foldx
|
||||
, '\n=============================================================')
|
||||
#=======================================================================
|
||||
|
||||
def getInteractionEnergy(filename):
|
||||
data = pd.read_csv(filename,sep = '\t')
|
||||
return data['Interaction Energy'].loc[0]
|
||||
|
||||
def getInteractions(filename):
|
||||
data = pd.read_csv(filename, index_col = 0, header = 0, sep = '\t')
|
||||
contactList = getIndexes(data,1)
|
||||
number = len(contactList)
|
||||
return number
|
||||
|
||||
def formatMuts(mut_file,pdbname):
|
||||
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
muts = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
muts.append(mut)
|
||||
|
||||
mut_list = []
|
||||
outfile = process_dir + '/individual_list_' + pdbname + '.txt'
|
||||
with open(outfile, 'w') as output:
|
||||
for m in muts:
|
||||
print(m)
|
||||
mut = m[:1] + chainA+ m[1:]
|
||||
mut_list.append(mut)
|
||||
mut = mut + ';'
|
||||
print(mut)
|
||||
output.write(mut)
|
||||
output.write('\n')
|
||||
return mut_list
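# e.g. with the default chain ID 'A', an input mutation 'S2C' is written to the FoldX
# individual list as 'SA2C;' and 'SA2C' is appended to the returned mut_list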
|
||||
|
||||
def getIndexes(data, value):
|
||||
colnames = data.columns.values
|
||||
listOfPos = list()
|
||||
result = data.isin([value])
|
||||
result.columns = colnames
|
||||
seriesdata = result.any()
|
||||
columnNames = list(seriesdata[seriesdata==True].index)
|
||||
for col in columnNames:
|
||||
rows = list(result[col][result[col]==True].index)
|
||||
|
||||
for row in rows:
|
||||
listOfPos.append((row,col))
|
||||
|
||||
return listOfPos
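# getIndexes returns (row label, column label) pairs for every cell in 'data' equal to 'value';
# getInteractions simply counts these pairs for the cells flagged 1 in a FoldX matrix file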
|
||||
|
||||
def loadFiles(df):
|
||||
# load a text file in to np matrix
|
||||
resultList = []
|
||||
f = open(df,'r')
|
||||
for line in f:
|
||||
line = line.rstrip('\n')
|
||||
aVals = line.split('\t')
|
||||
fVals = list(map(np.float32, aVals))
|
||||
resultList.append(fVals)
|
||||
f.close()
|
||||
return np.asarray(resultList, dtype=np.float32)
|
||||
|
||||
# TODO: use this code pattern rather than invoking bash
|
||||
#def repairPDB():
|
||||
# subprocess.call(['foldx'
|
||||
# , '--command=RepairPDB'
|
||||
# , '--pdb-dir=' + indir
|
||||
# , '--pdb=' + actual_pdb_filename
|
||||
# , '--ionStrength=0.05'#
|
||||
# , '--pH=7'
|
||||
# , '--water=PREDICT'
|
||||
# , '--vdwDesign=1'
|
||||
# , 'outPDB=true'
|
||||
# , '--output-dir=' + process_dir])
|
||||
|
||||
#=======================================================================
|
||||
def main():
|
||||
pdbname = pdb_name
|
||||
comp = '' # for complex only
|
||||
mut_filename = infile_muts #pnca_mcsm_snps.csv
|
||||
mutlist = formatMuts(mut_filename, pdbname)
|
||||
|
||||
print(mutlist)
|
||||
nmuts = len(mutlist)
|
||||
print(nmuts)
|
||||
print(mutlist)
|
||||
print('start')
|
||||
#subprocess.check_output(['bash','repairPDB.sh', pdbname, process_dir])
|
||||
print('\033[95mSTAGE: repair PDB\033[0m')
|
||||
print('EXECUTING: repairPDB.sh %s %s %s' % (indir, actual_pdb_filename, process_dir))
|
||||
#subprocess.check_output(['bash','repairPDB.sh', indir, actual_pdb_filename, process_dir])
|
||||
# once you decide to use the function
|
||||
# repairPDB(pdbname)
|
||||
|
||||
# FIXME: put this hack elsewhere
|
||||
foldx_common=' --ionStrength=0.05 --pH=7 --water=PREDICT --vdwDesign=1 '
|
||||
|
||||
subprocess.call(['foldx'
|
||||
, '--command=RepairPDB'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + indir
|
||||
, '--pdb=' + actual_pdb_filename
|
||||
, 'outPDB=true'
|
||||
, '--output-dir=' + process_dir])
|
||||
print('\033[95mCOMPLETE: repair PDB\033[0m')
|
||||
print('\033[95mSTAGE: run FoldX (subprocess)\033[0m')
|
||||
print('EXECUTING: runfoldx.sh %s %s ' % (pdbname, process_dir))
|
||||
#output = subprocess.check_output(['bash', 'runfoldx.sh', pdbname, process_dir])
|
||||
|
||||
print('Running foldx BuildModel')
|
||||
subprocess.call(['foldx'
|
||||
, '--command=BuildModel'
|
||||
, foldx_common
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--mutant-file="individual_list_' + pdbname +'.txt"'
|
||||
, 'outPDB=true'
|
||||
, '--numberOfRuns=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
print('Running foldx PrintNetworks')
|
||||
subprocess.call(['foldx'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
print('Running foldx SequenceDetail')
|
||||
subprocess.call(['foldx'
|
||||
, '--command=SequenceDetail'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
|
||||
print('\033[95mCOMPLETE: run FoldX (subprocess)\033[0m')
|
||||
|
||||
print('\033[95mSTAGE: Print Networks (shell)\033[0m')
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mNETWORK:\033[0m', n)
|
||||
#print('\033[96mCommand:\033[0m runPrintNetworks.sh %s %s %s' % (pdbname, str(n), process_dir ))
|
||||
#with suppress(Exception):
|
||||
#foldx --command=PrintNetworks --pdb="${PDB}_Repair_${n}.pdb" --water=PREDICT --vdwDesign=1 --output-dir=${OUTDIR}
|
||||
print('Running foldx PrintNetworks for mutation', n)
|
||||
subprocess.call(['foldx'
|
||||
, '--command=PrintNetworks'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
#subprocess.check_output(['bash', 'runPrintNetworks.sh', pdbname, str(n), process_dir])
|
||||
print('\033[95mCOMPLETE: Print Networks (shell)\033[0m')
|
||||
|
||||
print('\033[95mSTAGE: Rename Mutation Files (shell)\033[0m')
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mMUTATION:\033[0m', n)
|
||||
print('\033[96mCommand:\033[0m mutrenamefiles.sh %s %s %s' % (pdbname, str(n), process_dir ))
|
||||
# FIXME: this is bad design and needs to be done in a pythonic way
|
||||
with suppress(Exception):
|
||||
subprocess.check_output(['bash', 'mutrenamefiles.sh', pdbname, str(n), process_dir])
|
||||
print('\033[95mCOMPLETE: Rename Mutation Files (shell)\033[0m')
|
||||
|
||||
print('\033[95mSTAGE: Rename Files (shell)\033[0m')
|
||||
# FIXME: this is bad design and needs to be done in a pythonic way
|
||||
out = subprocess.check_output(['bash','renamefiles.sh', pdbname, process_dir])
|
||||
print('\033[95mCOMPLETE: Rename Files (shell)\033[0m')
|
||||
|
||||
if comp=='y':
|
||||
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess)\033[0m')
|
||||
chain1=chainA
|
||||
chain2=chainB
|
||||
#with suppress(Exception):
|
||||
#subprocess.check_output(['bash','runcomplex.sh', pdbname, chain1, chain2, process_dir])
|
||||
subprocess.call(['foldx'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_source = process_dir + '/Summary_' + pdbname + '_Repair_AC.fxout'
|
||||
ac_dest = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
||||
shutil.copyfile(ac_source, ac_dest)
|
||||
|
||||
for n in range(1,nmuts+1):
|
||||
print('\033[95mSTAGE: Running foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
|
||||
#with suppress(Exception):
|
||||
# subprocess.check_output(['bash','mutruncomplex.sh', pdbname, chain1, chain2, str(n), process_dir])
|
||||
subprocess.call(['foldx'
|
||||
, '--command=AnalyseComplex'
|
||||
, '--pdb-dir=' + process_dir
|
||||
, '--pdb=' + pdbname + '_Repair_' + str(n) + '.pdb'
|
||||
, '--analyseComplexChains=' + chain1 + ',' + chain2
|
||||
, '--water=PREDICT'
|
||||
, '--vdwDesign=1'
|
||||
, '--output-dir=' + process_dir], cwd=process_dir)
|
||||
|
||||
# FIXME why would we ever need to do this?!? Cargo-culted from runcomplex.sh
|
||||
ac_mut_source = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.fxout'
|
||||
ac_mut_dest = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) +'_AC.txt'
|
||||
shutil.copyfile(ac_mut_source, ac_mut_dest)
|
||||
print('\033[95mCOMPLETE: foldx AnalyseComplex (subprocess) for mutation:\033[0m', n)
|
||||
|
||||
interactions = ['Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS',
|
||||
'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM',
|
||||
'VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
||||
|
||||
dGdatafile = process_dir + '/Dif_' + pdbname + '_Repair.txt'
|
||||
dGdata = pd.read_csv(dGdatafile, sep = '\t')
|
||||
|
||||
ddG=[]
|
||||
print('ddG')
|
||||
print(len(dGdata))
|
||||
for i in range(0,len(dGdata)):
|
||||
ddG.append(dGdata['total energy'].loc[i])
|
||||
|
||||
|
||||
nint = len(interactions)
|
||||
wt_int = []
|
||||
|
||||
for i in interactions:
|
||||
filename = process_dir + '/Matrix_' + i + '_'+ pdbname + '_Repair_PN.txt'
|
||||
wt_int.append(getInteractions(filename))
|
||||
print('wt')
|
||||
print(wt_int)
|
||||
|
||||
ntotal = nint+1
|
||||
print(ntotal)
|
||||
print(nmuts)
|
||||
data = np.empty((ntotal,nmuts))
|
||||
data[0] = ddG
|
||||
print(data)
|
||||
for i in range(0,len(interactions)):
|
||||
d=[]
|
||||
p=0
|
||||
for n in range(1, nmuts+1):
|
||||
print(i)
|
||||
filename = process_dir + '/Matrix_' + interactions[i] + '_' + pdbname + '_Repair_' + str(n) + '_PN.txt'
|
||||
mut = getInteractions(filename)
|
||||
diff = wt_int[i] - mut
|
||||
print(diff)
|
||||
print(wt_int[i])
|
||||
print(mut)
|
||||
d.append(diff)
|
||||
print(d)
|
||||
data[i+1] = d
|
||||
|
||||
interactions = ['ddG', 'Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS', 'Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS']
|
||||
|
||||
print(interactions)
|
||||
|
||||
IE = []
|
||||
if comp=='y':
|
||||
wtfilename = process_dir + '/Summary_' + pdbname + '_Repair_AC.txt'
|
||||
wtE = getInteractionEnergy(wtfilename)
|
||||
print(wtE)
|
||||
for n in range(1,nmuts+1):
|
||||
print(n)
|
||||
filename = process_dir + '/Summary_' + pdbname + '_Repair_' + str(n) + '_AC.txt'
|
||||
mutE = getInteractionEnergy(filename)
|
||||
print(mutE)
|
||||
diff = wtE - mutE
|
||||
print(diff)
|
||||
IE.append(diff)
|
||||
print(IE)
|
||||
IEresults = pd.DataFrame(IE,columns = ['Interaction Energy'], index = mutlist)
|
||||
IEfilename = 'foldx_complexresults_'+pdbname+'.csv'
|
||||
IEresults.to_csv(IEfilename)
|
||||
print(len(IE))
|
||||
data = np.append(data,[IE], axis = 0)
|
||||
print(data)
|
||||
interactions = ['ddG','Distances','Electro_RR','Electro_MM','Electro_SM','Electro_SS','Disulfide_RR','Disulfide_MM','Disulfide_SM','Disulfide_SS','Hbonds_RR','Hbonds_MM','Hbonds_SM','Hbonds_SS','Partcov_RR','Partcov_MM','Partcov_SM','Partcov_SS','VdWClashes_RR','VdWClashes_MM','VdWClashes_SM','VdWClashes_SS','Volumetric_RR','Volumetric_MM','Volumetric_SM','Volumetric_SS','Interaction Energy']
|
||||
|
||||
mut_file = process_dir + '/individual_list_' + pdbname + '.txt'
|
||||
with open(mut_file) as csvfile:
|
||||
readCSV = csv.reader(csvfile)
|
||||
mutlist = []
|
||||
for row in readCSV:
|
||||
mut = row[0]
|
||||
mutlist.append(mut)
|
||||
print(mutlist)
|
||||
print(len(mutlist))
|
||||
print(data)
|
||||
results = pd.DataFrame(data, columns = mutlist, index = interactions)
|
||||
results.append(ddG)
|
||||
#print(results.head())
|
||||
|
||||
# my style formatted results
|
||||
results2 = results.T # transpose df
|
||||
results2.index.name = 'mutationinformation' # assign name to index
|
||||
results2 = results2.reset_index() # turn it into a column
|
||||
|
||||
results2['mutationinformation'] = results2['mutationinformation'].replace({r'([A-Z]{1})[A-Z]{1}([0-9]+[A-Z]{1});' : r'\1 \2'}, regex = True) # capture mcsm style muts (i.e not the chain id)
|
||||
results2['mutationinformation'] = results2['mutationinformation'].str.replace(' ', '') # remove empty space
|
||||
|
||||
results2.rename(columns = {'Distances': 'Contacts'}, inplace = True)
|
||||
|
||||
# lower case columns
|
||||
results2.columns = results2.columns.str.lower()
|
||||
|
||||
print('Writing file in the format below:\n'
|
||||
, results2.head()
|
||||
, '\nNo. of rows:', len(results2)
|
||||
, '\nNo. of cols:', len(results2.columns))
|
||||
|
||||
outputfilename = outfile_foldx
|
||||
#outputfilename = 'foldx_results_' + pdbname + '.csv'
|
||||
#results.to_csv(outputfilename)
|
||||
results2.to_csv(outputfilename, index = False)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
3
foldx/test2/test2_output/gid_foldx.csv
Normal file
@@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
S2C,0.30861700000000003,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0
S2F,-0.6481899999999999,-8.0,-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0
3
foldx/test2/test2_output/pnca_foldx.csv
Normal file
@@ -0,0 +1,3 @@
mutationinformation,ddg,contacts,electro_rr,electro_mm,electro_sm,electro_ss,disulfide_rr,disulfide_mm,disulfide_sm,disulfide_ss,hbonds_rr,hbonds_mm,hbonds_sm,hbonds_ss,partcov_rr,partcov_mm,partcov_sm,partcov_ss,vdwclashes_rr,vdwclashes_mm,vdwclashes_sm,vdwclashes_ss,volumetric_rr,volumetric_mm,volumetric_sm,volumetric_ss
L4S,5.7629,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,4.0
L159R,1.66524,-56.0,-26.0,0.0,-2.0,-24.0,0.0,0.0,0.0,0.0,-2.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-1.0,-4.0,0.0,-4.0,0.0
34
foldx/test2/testing_foldx_cmds
Normal file
@@ -0,0 +1,34 @@
./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p /home/tanu/git/LSHTM_analysis/foldx/test2/test2_process -pdb 3pl1.pdb -m pnca_muts_sample.csv -c1 A

============
# Example 1: pnca
# Delete processing output, copy rotabase.txt and individual_list_3pl1.txt in place, run a test
# get files from test/
============
#
clear; rm -rf test2_process/*; cp individual_list_3pl1.txt test2_process/ ; cp rotabase.txt test2_process/; ./runFoldx_test2.py -g pncA --datadir /home/tanu/git/LSHTM_analysis/foldx/test2 -i /home/tanu/git/LSHTM_analysis/foldx/test2 -o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output -p ./test2_process -pdb 3pl1.pdb -m /tmp/pnca_test_muts.csv -c1 A

============
# Example 2: gidb
============
clear
rm Unrecognized_molecules.txt
rm -rf test2_process/*
cp rotabase.txt test2_process/

./runFoldx.py \
-g gid \
--datadir /home/tanu/git/LSHTM_analysis/foldx/test2 \
-i /home/tanu/git/LSHTM_analysis/foldx/test2 \
-o /home/tanu/git/LSHTM_analysis/foldx/test2/test2_output \
-p ./test2_process \
-pdb gid_test2.pdb \
-m gid_test_snps.csv \
-c1 A


#==========
clear dir
#==========
rm Unrecognized_molecules.txt
find ~/git/LSHTM_analysis/foldx/test2/test2_process -type f -delete
361
mcsm/ind_scripts/format_results.py
Executable file
@@ -0,0 +1,361 @@
#!/usr/bin/env python3
|
||||
#=======================================================================
|
||||
#TASK:
|
||||
#=======================================================================
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
#import requests
|
||||
import re
|
||||
#import time
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
import numpy as np
|
||||
from mcsm import *
|
||||
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
||||
os.getcwd()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
|
||||
drug = 'isoniazid'
|
||||
gene = 'KatG'
|
||||
|
||||
#drug = args.drug
|
||||
#gene = args.gene
|
||||
|
||||
gene_match = gene + '_p.'
|
||||
#==========
|
||||
# data dir
|
||||
#==========
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
#=======
|
||||
# input:
|
||||
#=======
|
||||
# 1) result_urls (from outdir)
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
in_filename = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
|
||||
infile = outdir + '/' + in_filename
|
||||
print('Input filename:', in_filename
|
||||
, '\nInput path(from output dir):', outdir
|
||||
, '\n=============================================================')
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
out_filename = gene.lower() + '_complex_mcsm_results.csv'
|
||||
outfile = outdir + '/' + out_filename
|
||||
print('Output filename:', out_filename
|
||||
, '\nOutput path:', outdir
|
||||
, '\n=============================================================')
|
||||
#%%=====================================================================
|
||||
def format_mcsm_output(mcsm_outputcsv):
|
||||
"""
|
||||
@param mcsm_outputcsv: file containing mcsm results for all muts
|
||||
which is the result of build_result_dict() being called for each
|
||||
mutation and then converting to a pandas df and output as csv.
|
||||
@type string
|
||||
|
||||
@return formatted mcsm output
|
||||
@type pandas df
|
||||
|
||||
"""
|
||||
#############
|
||||
# Read file
|
||||
#############
|
||||
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
|
||||
|
||||
# strip white space from both ends in all columns
|
||||
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||
|
||||
dforig_shape = mcsm_data.shape
|
||||
print('dimensions of input file:', dforig_shape)
|
||||
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase, remove spaces and use '_' to join
|
||||
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
|
||||
, '\n===================================================================')
|
||||
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
||||
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
|
||||
, 'Wild-type': 'wild_type' # one letter amino acid code
|
||||
, 'Position': 'position' # number
|
||||
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
||||
, 'Chain': 'chain' # single letter (caps)
|
||||
, 'Ligand ID': 'ligand_id' # 3-letter code
|
||||
, 'Distance to ligand': 'ligand_distance' # angstroms
|
||||
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
|
||||
|
||||
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
#%%===========================================================================
|
||||
#################################
|
||||
# populate mutationinformation
|
||||
# col which is currently blank
|
||||
#################################
|
||||
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
|
||||
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
|
||||
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
||||
print('checking after populating:\n', mcsm_data['mutationinformation']
|
||||
, '\n===================================================================')
|
||||
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within column: mutationinformation')
|
||||
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
|
||||
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
|
||||
, '\n===================================================================')
|
||||
#%%===========================================================================
|
||||
#############
|
||||
# sanity check: drop dupliate muts
|
||||
#############
|
||||
# shouldn't exist as this should be eliminated at the time of running mcsm
|
||||
print('Sanity check:'
|
||||
, '\nChecking duplicate mutations')
|
||||
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
|
||||
print('PASS: No duplicate mutations detected (as expected)'
|
||||
, '\nDim of data:', mcsm_data.shape
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL (but not fatal): Duplicate mutations detected'
|
||||
, '\nDim of df with duplicates:', mcsm_data.shape
|
||||
, 'Removing duplicate entries')
|
||||
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
|
||||
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
||||
, '\n===============================================================')
|
||||
#%%===========================================================================
|
||||
#############
|
||||
# Create col: duet_outcome
|
||||
#############
|
||||
# classification based on DUET stability values
|
||||
print('Assigning col: duet_outcome based on DUET stability values')
|
||||
print('Sanity check:')
|
||||
# count positive values in the DUET column
|
||||
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
|
||||
DUET_pos = c.get(key = 'duet_stability_change')
|
||||
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
|
||||
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
|
||||
mcsm_data['duet_outcome'].value_counts()
|
||||
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
|
||||
print('PASS: DUET outcome assigned correctly')
|
||||
else:
|
||||
print('FAIL: DUET outcome assigned incorrectly'
|
||||
, '\nExpected no. of stabilising mutations:', DUET_pos
|
||||
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
|
||||
, '\n===============================================================')
|
||||
#%%===========================================================================
|
||||
#############
|
||||
# Extract numeric
|
||||
# part of ligand_distance col
|
||||
#############
|
||||
# Extract only the numeric part from col: ligand_distance
|
||||
# number: '-?\d+\.?\d*'
|
||||
mcsm_data['ligand_distance']
|
||||
print('extracting numeric part of col: ligand_distance')
|
||||
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)')
|
||||
mcsm_data['ligand_distance']
|
||||
#%%===========================================================================
|
||||
#############
|
||||
# Create 2 columns:
|
||||
# ligand_affinity_change and ligand_outcome
|
||||
#############
|
||||
# the numerical and categorical parts need to be extracted from column: PredAffLog
|
||||
# regex used
|
||||
# numerical part: '-?\d+\.?\d*'
|
||||
# categorical part: '\b(\w+ing)\b'
|
||||
print('Extracting numerical and categorical parts from the col: PredAffLog')
|
||||
print('to create two columns: ligand_affinity_change and ligand_outcome'
|
||||
, '\n===================================================================')
|
||||
|
||||
# 1) Extracting the predicted affinity change (numerical part)
|
||||
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
|
||||
print(mcsm_data['ligand_affinity_change'])
|
||||
|
||||
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
|
||||
#aff_regex = re.compile(r'\b(\w+ing)\b')
|
||||
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
|
||||
print(mcsm_data['ligand_outcome'])
|
||||
print(mcsm_data['ligand_outcome'].value_counts())
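#-----------------------------------------------------------------------
# Illustrative sketch (not part of the original script): how the two regexes
# behave on a hypothetical PredAffLog value; the sample string below is an
# assumption for demonstration only and may differ from real mCSM output.
# >>> s = pd.Series(['-0.75 log(affinity fold change) (Destabilizing)'])
# >>> s.str.extract(r'(-?\d+\.?\d*)', expand = False)[0]   # -> '-0.75' (still a string)
# >>> s.str.extract(r'(\b\w+ing\b)', expand = False)[0]    # -> 'Destabilizing'
#-----------------------------------------------------------------------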
|
||||
|
||||
#############
|
||||
# changing spelling: British
|
||||
#############
|
||||
# ensuring spellings are consistent
|
||||
american_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||
print('Changing to British spellings for col: ligand_outcome')
|
||||
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
|
||||
print(mcsm_data['ligand_outcome'].value_counts())
|
||||
british_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||
# compare series values since index will differ from spelling change
|
||||
check = american_spl.values == british_spl.values
|
||||
if check.all():
|
||||
print('PASS: spelling change successful'
|
||||
, '\nNo. of predicted affinity changes:\n', british_spl
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: spelling change unsuccessful'
|
||||
, '\nExpected:\n', american_spl
|
||||
, '\nGot:\n', british_spl
|
||||
, '\n===============================================================')
|
||||
#%%===========================================================================
|
||||
#############
|
||||
# ensuring correct dtypes in columns
|
||||
#############
|
||||
# check dtype in cols
|
||||
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
|
||||
, '\n===================================================================')
|
||||
print('Converting the following cols to numeric:'
|
||||
, '\nligand_distance'
|
||||
, '\nduet_stability_change'
|
||||
, '\nligand_affinity_change'
|
||||
, '\n===================================================================')
|
||||
|
||||
# using apply method to change stability and affinity values to numeric
|
||||
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
|
||||
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
|
||||
# check dtype in cols
|
||||
print('checking dtype after conversion')
|
||||
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
|
||||
if cols_check.all():
|
||||
print('PASS: dtypes for selected cols:', numeric_cols
|
||||
, '\nchanged to numeric'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: dtype change to numeric for selected cols unsuccessful'
|
||||
, '\n===============================================================')
|
||||
print(mcsm_data.dtypes)
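#-----------------------------------------------------------------------
# Illustrative sketch (not part of the original script): pd.to_numeric turns
# the extracted strings into floats; adding the optional errors = 'coerce'
# argument would convert anything unparseable to NaN instead of raising.
# >>> pd.to_numeric(pd.Series(['2.25', '-0.5']))                    # -> float64: 2.25, -0.5
# >>> pd.to_numeric(pd.Series(['2.25', 'n/a']), errors = 'coerce')  # -> float64: 2.25, NaN
#-----------------------------------------------------------------------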
|
||||
#%%===========================================================================
|
||||
|
||||
#############
|
||||
# scale duet values
|
||||
#############
|
||||
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
duet_min = mcsm_data['duet_stability_change'].min()
|
||||
duet_max = mcsm_data['duet_stability_change'].max()
|
||||
|
||||
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
|
||||
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
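#-----------------------------------------------------------------------
# Illustrative sketch (not part of the original script): with hypothetical
# extremes duet_min = -2.0 and duet_max = 0.5, negatives are divided by
# abs(duet_min) and non-negatives by duet_max, so the sign is preserved:
# >>> scale = lambda x: x/abs(-2.0) if x < 0 else x/0.5
# >>> [scale(v) for v in (-2.0, -1.0, 0.0, 0.25, 0.5)]
# [-1.0, -0.5, 0.0, 0.5, 1.0]
#-----------------------------------------------------------------------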
|
||||
|
||||
#%%===========================================================================
|
||||
#############
|
||||
# scale affinity values
|
||||
#############
|
||||
# rescale values in affinity change col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
aff_min = mcsm_data['ligand_affinity_change'].min()
|
||||
aff_max = mcsm_data['ligand_affinity_change'].max()
|
||||
|
||||
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
|
||||
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
|
||||
#=============================================================================
|
||||
# Adding colname: wild_pos: sometimes useful for plotting and db
|
||||
print('Creating column: wild_pos')
|
||||
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
|
||||
print(mcsm_data['wild_pos'].head())
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within column: wild_pos')
|
||||
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
|
||||
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
|
||||
, '\n===================================================================')
|
||||
#=============================================================================
|
||||
# Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
|
||||
print('Creating column: wild_chain_pos')
|
||||
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
|
||||
print(mcsm_data['wild_chain_pos'].head())
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within column: wild_chain_pos')
|
||||
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
|
||||
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
|
||||
, '\n===================================================================')
|
||||
#=============================================================================
|
||||
#%% ensuring dtypes are string for the non-numeric cols
|
||||
# char cols
|
||||
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
|
||||
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
|
||||
|
||||
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
|
||||
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
|
||||
|
||||
if cols_check_char.all():
|
||||
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: dtype for char cols is not string'
|
||||
, '\n===============================================================')
|
||||
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
|
||||
print(mcsm_data.dtypes)
|
||||
#=============================================================================
|
||||
# Removing PredAffLog column as it is no longer needed
|
||||
print('Removing col: PredAffLog since relevant info has been extracted from it')
|
||||
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
|
||||
#=============================================================================
|
||||
#sort df by position for convenience
|
||||
print('Sorting df by position')
|
||||
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
|
||||
print('sorted df:\n', mcsm_data_fs.head())
|
||||
|
||||
# Ensuring column names are lowercase before output
|
||||
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
|
||||
#%%===========================================================================
|
||||
#############
|
||||
# sanity check before writing file
|
||||
#############
|
||||
expected_ncols_toadd = 5 # beware of hardcoded numbers
|
||||
dforig_len = dforig_shape[1]
|
||||
expected_cols = dforig_len + expected_ncols_toadd
|
||||
if len(mcsm_data_fs.columns) == expected_cols:
|
||||
print('PASS: formatting successful'
|
||||
, '\nformatted df has expected no. of cols:', expected_cols
|
||||
, '\ncolnames:', mcsm_data_fs.columns
|
||||
, '\n----------------------------------------------------------------'
|
||||
, '\ndtypes in cols:', mcsm_data_fs.dtypes
|
||||
, '\n----------------------------------------------------------------'
|
||||
, '\norig data shape:', dforig_shape
|
||||
, '\nformatted df shape:', mcsm_data_fs.shape
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: something went wrong in formatting df'
|
||||
, '\nLen of orig df:', dforig_len
|
||||
, '\nExpected number of cols to add:', expected_ncols_toadd
|
||||
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
|
||||
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
|
||||
, '\nCheck formatting:'
|
||||
, '\ncheck hardcoded value:', expected_ncols_toadd
|
||||
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
|
||||
, '\n===============================================================')
|
||||
|
||||
return mcsm_data_fs
|
||||
#=======================================================================
|
||||
# call function
|
||||
mcsm_df_formatted = format_mcsm_output(infile)
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
mcsm_df_formatted.to_csv(outfile, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile', outfile
|
||||
, '\nExpected no. of rows:', len(mcsm_df_formatted)
|
||||
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
|
||||
, '\n=============================================================')
|
||||
#%%
|
||||
#End of script
|
310 mcsm/ind_scripts/format_results_notdef.py (Executable file)
|
@ -0,0 +1,310 @@
|
|||
#!/usr/bin/env python3
|
||||
#=======================================================================
|
||||
#TASK:
|
||||
#=======================================================================
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
#import requests
|
||||
import re
|
||||
#import time
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
import numpy as np
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
||||
os.getcwd()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
drug = 'pyrazinamide'
|
||||
gene = 'pncA'
|
||||
gene_match = gene + '_p.'
|
||||
#==========
|
||||
# dirs
|
||||
#==========
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
indir = datadir + '/' + drug + '/' + 'input'
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
|
||||
#=======
|
||||
# input:
|
||||
#=======
|
||||
# 1) result_urls (from outdir)
|
||||
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
|
||||
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
|
||||
print('Input file:', infile_mcsm_output
|
||||
, '\n=============================================================')
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
out_filename_mcsm_norm = gene.lower() + '_complex_mcsm_norm.csv'
|
||||
outfile_mcsm_norm = outdir + '/' + out_filename_mcsm_norm
|
||||
print('Output file:', out_filename_mcsm_norm
|
||||
, '\n=============================================================')
|
||||
|
||||
#=======================================================================
|
||||
print('Reading input file')
|
||||
mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',')
|
||||
|
||||
# strip white space from both ends in all columns
|
||||
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
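#-----------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the apply/lambda above
# strips leading/trailing whitespace in object (string) columns only and leaves
# numeric columns untouched. Toy frame with hypothetical values:
# >>> toy = pd.DataFrame({'mut': [' A102P ', 'M1T'], 'dist': [3.5, 7.1]})
# >>> toy.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)['mut'].tolist()
# ['A102P', 'M1T']
#-----------------------------------------------------------------------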
|
||||
|
||||
# PredAffLog = affinity_change_log
|
||||
# "DUETStability_Kcalpermol = DUET_change_kcalpermol
|
||||
dforig_shape = mcsm_data.shape
|
||||
print('dim of infile:', dforig_shape)
|
||||
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase, remove spaces and use '_' to join
|
||||
print('Assigning meaningful colnames i.e. without spaces or hyphens, and reflecting units'
|
||||
, '\n===================================================================')
|
||||
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
||||
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
|
||||
, 'Wild-type': 'wild_type' # one letter amino acid code
|
||||
, 'Position': 'position' # number
|
||||
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
||||
, 'Chain': 'chain' # single letter (caps)
|
||||
, 'Ligand ID': 'ligand_id' # 3-letter code
|
||||
, 'Distance to ligand': 'ligand_distance' # angstroms
|
||||
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
|
||||
|
||||
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
#%%===========================================================================
|
||||
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
|
||||
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
|
||||
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
||||
print('checking after populating:\n', mcsm_data['mutationinformation']
|
||||
, '\n===================================================================')
|
||||
|
||||
# Remove spaces b/w pasted columns: not needed as white space removed at the time of import
|
||||
#print('removing white space within column: \mutationinformation')
|
||||
#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
|
||||
#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
|
||||
# , '\n===================================================================')
|
||||
#%% Remove whitespace from column
|
||||
#orig_dtypes = mcsm_data.dtypes
|
||||
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
|
||||
#mcsm_data.columns = mcsm_data.columns.str.strip()
|
||||
#new_dtypes = mcsm_data.dtypes
|
||||
#%%===========================================================================
|
||||
# very important
|
||||
print('Sanity check:'
|
||||
, '\nChecking duplicate mutations')
|
||||
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
|
||||
print('PASS: No duplicate mutations detected (as expected)'
|
||||
, '\nDim of data:', mcsm_data.shape
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL (but not fatal): Duplicate mutations detected'
|
||||
, '\nDim of df with duplicates:', mcsm_data.shape
|
||||
, 'Removing duplicate entries')
|
||||
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
|
||||
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
||||
, '\n===============================================================')
|
||||
#%%===========================================================================
|
||||
# create duet_outcome column: classification based on DUET stability values
|
||||
print('Assigning col: duet_outcome based on DUET stability values')
|
||||
print('Sanity check:')
|
||||
# count positive values in the DUET column
|
||||
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
|
||||
DUET_pos = c.get(key = 'duet_stability_change')
|
||||
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
|
||||
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
|
||||
mcsm_data['duet_outcome'].value_counts()
|
||||
if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
|
||||
print('PASS: DUET outcome assigned correctly')
|
||||
else:
|
||||
print('FAIL: DUET outcome assigned incorrectly'
|
||||
, '\nExpected no. of stabilising mutations:', DUET_pos
|
||||
, '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
|
||||
, '\n===============================================================')
|
||||
#%%===========================================================================
|
||||
# Extract only the numeric part from col: ligand_distance
|
||||
# number: '-?\d+\.?\d*'
|
||||
mcsm_data['ligand_distance']
|
||||
print('extracting numeric part of col: ligand_distance')
|
||||
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)')
|
||||
mcsm_data['ligand_distance']
|
||||
#%%===========================================================================
|
||||
# create ligand_outcome column: classification based on affinity change values
|
||||
# the numerical and categorical parts need to be extracted from column: PredAffLog
|
||||
# regex used
|
||||
# number: '-?\d+\.?\d*'
|
||||
# category: '\b(\w+ing)\b'
|
||||
print('Extracting numerical and categorical parts from the col: PredAffLog')
|
||||
print('to create two columns: ligand_affinity_change and ligand_outcome'
|
||||
, '\n===================================================================')
|
||||
# Extracting the predicted affinity change (numerical part)
|
||||
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
|
||||
print(mcsm_data['ligand_affinity_change'])
|
||||
# Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
|
||||
#aff_regex = re.compile(r'\b(\w+ing)\b')
|
||||
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
|
||||
print(mcsm_data['ligand_outcome'])
|
||||
print(mcsm_data['ligand_outcome'].value_counts())
|
||||
|
||||
# ensuring spellings are consistent
|
||||
american_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||
print('Changing to British spellings for col: ligand_outcome')
|
||||
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
|
||||
print(mcsm_data['ligand_outcome'].value_counts())
|
||||
british_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||
# compare series values since index will differ from spelling change
|
||||
check = american_spl.values == british_spl.values
|
||||
if check.all():
|
||||
print('PASS: spelling change successful'
|
||||
, '\nNo. of predicted affinity changes:\n', british_spl
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: spelling change unsuccessful'
|
||||
, '\nExpected:\n', american_spl
|
||||
, '\nGot:\n', british_spl
|
||||
, '\n===============================================================')
|
||||
#%%===========================================================================
|
||||
# check dtype in cols: ensure correct dtypes for cols
|
||||
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
|
||||
, '\n===================================================================')
|
||||
#1) numeric cols
|
||||
print('Converting the following cols to numeric:'
|
||||
, '\nligand_distance'
|
||||
, '\nduet_stability_change'
|
||||
, '\nligand_affinity_change'
|
||||
, '\n===================================================================')
|
||||
# using apply method to change stability and affinity values to numeric
|
||||
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
|
||||
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
|
||||
|
||||
# check dtype in cols
|
||||
print('checking dtype after conversion')
|
||||
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
|
||||
if cols_check.all():
|
||||
print('PASS: dtypes for selected cols:', numeric_cols
|
||||
, '\nchanged to numeric'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: dtype change to numeric for selected cols unsuccessful'
|
||||
, '\n===============================================================')
|
||||
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
|
||||
print(mcsm_data.dtypes)
|
||||
#%%===========================================================================
|
||||
# Normalise values in DUET_change col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
duet_min = mcsm_data['duet_stability_change'].min()
|
||||
duet_max = mcsm_data['duet_stability_change'].max()
|
||||
|
||||
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
|
||||
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
|
||||
#%%===========================================================================
|
||||
# Normalise values in affinity change col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
aff_min = mcsm_data['ligand_affinity_change'].min()
|
||||
aff_max = mcsm_data['ligand_affinity_change'].max()
|
||||
|
||||
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_data['ligand_affinity_change']
|
||||
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
|
||||
mcsm_data['affinity_scaled']
|
||||
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
|
||||
#=============================================================================
|
||||
# Adding colname: wild_pos: sometimes useful for plotting and db
|
||||
print('Creating column: wild_pos')
|
||||
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
|
||||
print(mcsm_data['wild_pos'].head())
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within column: wild_pos')
|
||||
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
|
||||
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
|
||||
, '\n===================================================================')
|
||||
#=============================================================================
|
||||
#%% Adding colname: wild_chain_pos: sometimes useful for plotting and db and is explicit
|
||||
print('Creating column: wild_chain_pos')
|
||||
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
|
||||
print(mcsm_data['wild_chain_pos'].head())
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within column: wild_chain_pos')
|
||||
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
|
||||
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
|
||||
, '\n===================================================================')
|
||||
#=============================================================================
|
||||
#%% ensuring dtypes are string for the non-numeric cols
|
||||
#2) char cols
|
||||
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
|
||||
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
|
||||
|
||||
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
|
||||
cols_check_char = mcsm_data.select_dtypes(include='object').columns.isin(char_cols)
|
||||
|
||||
if cols_check_char.all():
|
||||
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: dtype for char cols is not string'
|
||||
, '\n===============================================================')
|
||||
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
|
||||
print(mcsm_data.dtypes)
|
||||
#%%
|
||||
#=============================================================================
|
||||
#%% Removing PredAffLog column as it is no longer needed
|
||||
print('Removing col: PredAffLog since relevant info has been extracted from it')
|
||||
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
|
||||
print(mcsm_data_f.head())
|
||||
#=============================================================================
|
||||
#%% sort df by position for convenience
|
||||
print('Sorting df by position')
|
||||
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
|
||||
print('sorted df:\n', mcsm_data_fs.head())
|
||||
#%%===========================================================================
|
||||
expected_ncols_toadd = 6 # beware of hardcoded numbers
|
||||
dforig_len = dforig_shape[1]
|
||||
expected_cols = dforig_len + expected_ncols_toadd
|
||||
if len(mcsm_data_fs.columns) == expected_cols:
|
||||
print('PASS: formatting successful'
|
||||
, '\nformatted df has expected no. of cols:', expected_cols
|
||||
, '\ncolnames:', mcsm_data_fs.columns
|
||||
, '\n----------------------------------------------------------------'
|
||||
, '\ndtypes in cols:', mcsm_data_fs.dtypes
|
||||
, '\n----------------------------------------------------------------'
|
||||
, '\norig data shape:', dforig_shape
|
||||
, '\nformatted df shape:', mcsm_data_fs.shape
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: something went wrong in formatting df'
|
||||
, '\nLen of orig df:', dforig_len
|
||||
, '\nExpected number of cols to add:', expected_ncols_toadd
|
||||
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
|
||||
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
|
||||
, '\nCheck formatting:'
|
||||
, '\ncheck hardcoded value:', expected_ncols_toadd
|
||||
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
|
||||
, '\n===============================================================')
|
||||
#%%============================================================================
|
||||
# Ensuring column names are lowercase before output
|
||||
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
mcsm_data_fs.to_csv(outfile_mcsm_norm, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile:', outfile_mcsm_norm
|
||||
, '\nExpected no. of rows:', len(mcsm_data_fs)
|
||||
, '\nExpected no. of cols:', len(mcsm_data_fs.columns)
|
||||
, '\n=============================================================')
|
||||
#%%
|
||||
#End of script
|
149 mcsm/ind_scripts/mcsm_results.py (Executable file)
|
@ -0,0 +1,149 @@
|
|||
#!/usr/bin/env python3
|
||||
#=======================================================================
|
||||
#TASK:
|
||||
#=======================================================================
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
#import beautifulsoup4
|
||||
from csv import reader
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
||||
os.getcwd()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
|
||||
#drug = 'isoniazid'
|
||||
#gene = 'KatG'
|
||||
|
||||
drug = 'cycloserine'
|
||||
gene = 'alr'
|
||||
|
||||
#drug = args.drug
|
||||
#gene = args.gene
|
||||
|
||||
gene_match = gene + '_p.'
|
||||
#==========
|
||||
# data dir
|
||||
#==========
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
#=======
|
||||
# input:
|
||||
#=======
|
||||
# 1) result_urls (from outdir)
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
in_filename_url = gene.lower() + '_result_urls.txt' #(outfile, sub write_result_url)
|
||||
infile_url = outdir + '/' + in_filename_url
|
||||
print('Input filename:', in_filename_url
|
||||
, '\nInput path(from output dir):', outdir
|
||||
, '\n=============================================================')
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
out_filename = gene.lower() + '_mcsm_output.csv'
|
||||
outfile = outdir + '/' + out_filename
|
||||
print('Output filename:', out_filename
|
||||
, '\nOutput path:', outdir
|
||||
, '\n=============================================================')
|
||||
#=======================================================================
|
||||
def scrape_results(out_result_url):
|
||||
"""
|
||||
Extract results data using the result url
|
||||
|
||||
@params out_result_url: txt file containing result url
|
||||
one per line for each mutation
|
||||
@type string
|
||||
|
||||
returns: mcsm prediction results (raw)
|
||||
@type chr
|
||||
"""
|
||||
result_response = requests.get(out_result_url)
|
||||
# if results_response is not None:
|
||||
# page = results_page.text
|
||||
if result_response.status_code == 200:
|
||||
print('SUCCESS: Fetching results')
|
||||
else:
|
||||
print('FAIL: Could not fetch results'
|
||||
, '\nCheck if url is valid')
|
||||
# extract results using the html parser
|
||||
soup = BeautifulSoup(result_response.text, features = 'html.parser')
|
||||
# print(soup)
|
||||
web_result_raw = soup.find(class_ = 'span4').get_text()
|
||||
|
||||
return web_result_raw
|
||||
|
||||
|
||||
def build_result_dict(web_result_raw):
|
||||
"""
|
||||
Build dict of mcsm output for a single mutation
|
||||
Format the web result (which is preformatted) to enable building the result dict
|
||||
# preformatted string object: Problematic!
|
||||
# make format consistent
|
||||
|
||||
@params web_result_raw: directly from html parser extraction
|
||||
@type string
|
||||
|
||||
@returns result dict
|
||||
@type {}
|
||||
"""
|
||||
|
||||
# remove blank lines from web_result_raw
|
||||
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
|
||||
|
||||
# affinity change and DUET stability change cols are split over
|
||||
# multiple lines and Mutation information is empty!
|
||||
mytext = mytext.replace('ange:\n', 'ange: ')
|
||||
#print(mytext)
|
||||
|
||||
# initialise result_dict
|
||||
result_dict = {}
|
||||
for line in mytext.split('\n'):
|
||||
fields = line.split(':')
|
||||
# print(fields)
|
||||
if len(fields) > 1: # since Mutation information is empty
|
||||
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
|
||||
result_dict.update(dict_entry)
|
||||
|
||||
return result_dict
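#-----------------------------------------------------------------------
# Illustrative sketch (not part of the original function): each cleaned line is
# split on ':' and the pairwise zip turns it into one dict entry; a line without
# a ':' gives len(fields) == 1 and is skipped. Hypothetical result line:
# >>> fields = 'DUET stability change: -1.5 Kcal/mol'.split(':')
# >>> dict(zip(fields[::2], fields[1::2]))
# {'DUET stability change': ' -1.5 Kcal/mol'}
#-----------------------------------------------------------------------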
|
||||
#=====================================================================
|
||||
#%% call function
|
||||
#request_results(infile_url)
|
||||
#response = requests.get('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1586364780.41')
|
||||
results_interim = scrape_results('http://biosig.unimelb.edu.au/mcsm_lig/results_prediction/1587053996.55')
|
||||
result_dict = build_result_dict(results_interim)
|
||||
|
||||
output_df = pd.DataFrame()
|
||||
|
||||
url_counter = 1 # counter starts at 1
|
||||
infile_len = os.popen('wc -l < %s' % infile_url).read() # quicker than using Python :-)
|
||||
print('Total URLs:',infile_len)
|
||||
|
||||
with open(infile_url, 'r') as urlfile:
|
||||
for line in urlfile:
|
||||
url_line = line.strip()
|
||||
# response = request_results(url_line)
|
||||
#response = requests.get(url_line)
|
||||
results_interim = scrape_results(url_line)
|
||||
result_dict = build_result_dict(results_interim)
|
||||
print('Processing URL: %s of %s' % (url_counter, infile_len))
|
||||
df = pd.DataFrame(result_dict, index=[url_counter])
|
||||
url_counter += 1
|
||||
output_df = output_df.append(df)
|
||||
|
||||
#print(output_df)
|
||||
output_df.to_csv(outfile, index = None, header = True)
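#-----------------------------------------------------------------------
# Note (not part of the original script): DataFrame.append() inside a loop is
# deprecated in newer pandas releases (removed in pandas 2.x). A hedged
# alternative sketch: collect one frame per URL in a list and concatenate once.
# frames = []
# for url_counter, line in enumerate(open(infile_url), start = 1):
#     frames.append(pd.DataFrame(build_result_dict(scrape_results(line.strip())), index = [url_counter]))
# output_df = pd.concat(frames)
#-----------------------------------------------------------------------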
|
240 mcsm/ind_scripts/run_mcsm.py (Executable file)
|
@ -0,0 +1,240 @@
|
|||
#!/usr/bin/env python3
|
||||
#=======================================================================
|
||||
#TASK:
|
||||
#=======================================================================
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
#from csv import reader
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
||||
os.getcwd()
|
||||
#=======================================================================
|
||||
#%% command line args
|
||||
#arg_parser = argparse.ArgumentParser()
|
||||
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
|
||||
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
|
||||
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'TESTDRUG')
|
||||
#arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = 'testGene') # case sensitive
|
||||
#args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
|
||||
#drug = 'isoniazid'
|
||||
#gene = 'KatG'
|
||||
|
||||
drug = 'cycloserine'
|
||||
gene = 'alr'
|
||||
|
||||
|
||||
#drug = args.drug
|
||||
#gene = args.gene
|
||||
|
||||
gene_match = gene + '_p.'
|
||||
#==========
|
||||
# data dir
|
||||
#==========
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
|
||||
#==========
|
||||
# input dir
|
||||
#==========
|
||||
indir = datadir + '/' + drug + '/' + 'input'
|
||||
|
||||
#==========
|
||||
# output dir
|
||||
#==========
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
|
||||
#=======
|
||||
# input files:
|
||||
#=======
|
||||
# 1) pdb file
|
||||
in_filename_pdb = gene.lower() + '_complex.pdb'
|
||||
infile_pdb = indir + '/' + in_filename_pdb
|
||||
print('Input pdb file:', infile_pdb
|
||||
, '\n=============================================================')
|
||||
|
||||
# 2) mcsm snps
|
||||
in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile2, from data_extraction.py)
|
||||
infile_snps = outdir + '/' + in_filename_snps
|
||||
print('Input mutation file:', infile_snps
|
||||
, '\n=============================================================')
|
||||
|
||||
#=======
|
||||
# output files
|
||||
#=======
|
||||
|
||||
# 1) result urls file
|
||||
#result_urls_filename = gene.lower() + '_result_urls.txt'
|
||||
#result_urls = outdir + '/' + result_urls_filename
|
||||
|
||||
# 2) invalid mutations file
|
||||
#invalid_muts_filename = gene.lower() + '_invalid_mutations.txt'
|
||||
#outfile_invalid_muts = outdir + '/' + invalid_muts_filename
|
||||
|
||||
#print('Result url file:', result_urls
|
||||
# , '\n==================================================================='
|
||||
# , '\nOutput invalid muations file:', outfile_invalid_muts
|
||||
# , '\n===================================================================')
|
||||
|
||||
#%% global variables
|
||||
host = "http://biosig.unimelb.edu.au"
|
||||
prediction_url = f"{host}/mcsm_lig/prediction"
|
||||
#=======================================================================
|
||||
def format_data(data_file):
|
||||
"""
|
||||
Read file containing SNPs for mcsm analysis and remove duplicates
|
||||
|
||||
@param data_file: csv file containing nsSNPs for given drug and gene.
|
||||
csv file format:
|
||||
single column with no headers with nsSNP format as below:
|
||||
A1B
|
||||
B2C
|
||||
@type data_file: string
|
||||
|
||||
@return unique SNPs
|
||||
@type list
|
||||
"""
|
||||
data = pd.read_csv(data_file, header = None, index_col = False)
|
||||
data = data.drop_duplicates()
|
||||
mutation_list = data[0].tolist()
|
||||
# print(data.head())
|
||||
return mutation_list
|
||||
|
||||
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name):
|
||||
"""
|
||||
Makes a POST request for a ligand affinity prediction.
|
||||
|
||||
@param pdb_file: valid path to pdb structure
|
||||
@type string
|
||||
|
||||
@param mutation: single mutation of the format: {WT}<POS>{Mut}
|
||||
@type string
|
||||
|
||||
@param chain: single-letter(caps)
|
||||
@type chr
|
||||
|
||||
@param lig_id: 3-letter code (should match pdb file)
|
||||
@type string
|
||||
|
||||
@param wt_affinity: in nM
|
||||
@type number
|
||||
|
||||
@param prediction_url: mcsm url for prediction
|
||||
@type string
|
||||
|
||||
@return response object
|
||||
@type object
|
||||
"""
|
||||
with open(pdb_file, "rb") as pdb_file:
|
||||
files = {"wild": pdb_file}
|
||||
body = {
|
||||
"mutation": mutation,
|
||||
"chain": chain,
|
||||
"lig_id": ligand_id,
|
||||
"affin_wt": wt_affinity
|
||||
}
|
||||
|
||||
response = requests.post(prediction_url, files = files, data = body)
|
||||
# print(response.status_code)
|
||||
# result_status = response.raise_for_status()
|
||||
if response.history:
|
||||
# if result_status is not None: # doesn't work!
|
||||
print('PASS: valid mutation submitted. Fetching result url')
|
||||
# response = requests.post(prediction_url, files = files, data = body)
|
||||
# return response
|
||||
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
|
||||
url = host + url_match.group()
|
||||
#===============
|
||||
# writing file: result urls
|
||||
#===============
|
||||
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
|
||||
myfile = open(out_url_file, 'a')
|
||||
myfile.write(url + '\n')
|
||||
myfile.close()
|
||||
|
||||
else:
|
||||
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
|
||||
, '\nSkipping to the next mutation in file...')
|
||||
#===============
|
||||
# writing file: invalid mutations
|
||||
#===============
|
||||
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
|
||||
failed_muts = open(out_error_file, 'a')
|
||||
failed_muts.write(mutation + '\n')
|
||||
failed_muts.close()
|
||||
|
||||
#def write_result_url(holding_page, out_result_url, host):
|
||||
# """
|
||||
# Extract and write results url from the holding page returned after
|
||||
# requesting a calculation.
|
||||
|
||||
# @param holding_page: response object containinig html content
|
||||
# @type object
|
||||
|
||||
# @param out_result_url: txt file containing urls for mcsm results
|
||||
# @type string
|
||||
|
||||
# @param host: mcsm server name
|
||||
# @type string
|
||||
|
||||
# @return None, writes a file containing result urls (= total no. of muts)
|
||||
# """
|
||||
# if holding_page:
|
||||
# url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', holding_page.text)
|
||||
# url = host + url_match.group()
|
||||
#===============
|
||||
# writing file
|
||||
#===============
|
||||
# myfile = open(out_result_url, 'a')
|
||||
# myfile.write(url+'\n')
|
||||
# myfile.close()
|
||||
# print(myfile)
|
||||
# return url
|
||||
#%%
|
||||
#=======================================================================
|
||||
# variables to run mcsm lig predictions
|
||||
#pdb_file = infile_snps_pdb
|
||||
my_chain = 'A'
|
||||
my_ligand_id = 'DCS'
|
||||
my_affinity = 10
|
||||
|
||||
print('Result urls and error file (if any) will be written in: ', outdir)
|
||||
|
||||
# call function to format data to remove duplicate snps before submitting job
|
||||
mcsm_muts = format_data(infile_snps)
|
||||
mut_count = 1 # counter starts at 1
|
||||
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
|
||||
print('Total SNPs for', gene, ':', infile_snps_len)
|
||||
for mcsm_mut in mcsm_muts:
|
||||
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
|
||||
print('Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
|
||||
# function call: to request mcsm prediction
|
||||
# which writes file containing url for valid submissions and invalid muts to respective files
|
||||
holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
|
||||
# holding_page = request_calculation(infile_pdb, mcsm_mut, my_chain, my_ligand_id, my_affinity, prediction_url, outdir, gene)
|
||||
time.sleep(1)
|
||||
mut_count += 1
|
||||
# result_url = write_result_url(holding_page, result_urls, host)
|
||||
|
||||
print('Request submitted'
|
||||
, '\nCAUTION: Processing will take at least ten'
|
||||
, 'minutes, but will be longer for more mutations.')
|
||||
|
||||
#%%
|
||||
|
||||
|
||||
|
494 mcsm/mcsm.py (Normal file)
|
@ -0,0 +1,494 @@
|
|||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
import numpy as np
|
||||
#from csv import reader
|
||||
from mcsm import *
|
||||
#==============================
|
||||
#%% global variables for defs
|
||||
#==============================
|
||||
#%%
|
||||
|
||||
def format_data(data_file):
|
||||
"""
|
||||
Read file containing SNPs for mcsm analysis and remove duplicates
|
||||
|
||||
@param data_file: csv file containing nsSNPs for given drug and gene.
|
||||
csv file format:
|
||||
single column with no headers with nsSNP format as below:
|
||||
A1B
|
||||
B2C
|
||||
@type data_file: string
|
||||
|
||||
@return unique SNPs
|
||||
@type list
|
||||
"""
|
||||
data = pd.read_csv(data_file, header = None, index_col = False)
|
||||
data = data.drop_duplicates()
|
||||
mutation_list = data[0].tolist()
|
||||
# print(data.head())
|
||||
return mutation_list
|
||||
|
||||
# FIXME: documentation
|
||||
def request_calculation(pdb_file, mutation, chain, ligand_id, wt_affinity, prediction_url, output_dir, gene_name, host):
|
||||
"""
|
||||
Makes a POST request for a ligand affinity prediction.
|
||||
|
||||
@param pdb_file: valid path to pdb structure
|
||||
@type string
|
||||
|
||||
@param mutation: single mutation of the format: {WT}<POS>{Mut}
|
||||
@type string
|
||||
|
||||
@param chain: single-letter(caps)
|
||||
@type chr
|
||||
|
||||
@param lig_id: 3-letter code (should match pdb file)
|
||||
@type string
|
||||
|
||||
@param wt_affinity: in nM
|
||||
@type number
|
||||
|
||||
@param prediction_url: mcsm url for prediction
|
||||
@type string
|
||||
|
||||
@return response object
|
||||
@type object
|
||||
"""
|
||||
with open(pdb_file, "rb") as pdb_file:
|
||||
files = {"wild": pdb_file}
|
||||
body = {
|
||||
"mutation": mutation,
|
||||
"chain": chain,
|
||||
"lig_id": ligand_id,
|
||||
"affin_wt": wt_affinity
|
||||
}
|
||||
|
||||
response = requests.post(prediction_url, files = files, data = body)
|
||||
#print(response.status_code)
|
||||
#result_status = response.raise_for_status()
|
||||
if response.history:
|
||||
# if result_status is not None: # doesn't work!
|
||||
print('PASS: valid mutation submitted. Fetching result url')
|
||||
|
||||
#return response
|
||||
url_match = re.search('/mcsm_lig/results_prediction/.+(?=")', response.text)
|
||||
url = host + url_match.group()
|
||||
#===============
|
||||
# writing file: result urls
|
||||
#===============
|
||||
out_url_file = output_dir + '/' + gene_name.lower() + '_result_urls.txt'
|
||||
myfile = open(out_url_file, 'a')
|
||||
myfile.write(url + '\n')
|
||||
myfile.close()
|
||||
|
||||
else:
|
||||
print('ERROR: invalid mutation! Wild-type residue doesn\'t match pdb file.'
|
||||
, '\nSkipping to the next mutation in file...')
|
||||
#===============
|
||||
# writing file: invalid mutations
|
||||
#===============
|
||||
out_error_file = output_dir + '/' + gene_name.lower() + '_errors.txt'
|
||||
failed_muts = open(out_error_file, 'a')
|
||||
failed_muts.write(mutation + '\n')
|
||||
failed_muts.close()
|
||||
|
||||
#=======================================================================
|
||||
def scrape_results(result_url):
|
||||
"""
|
||||
Extract results data using the result url
|
||||
|
||||
@params result_url: txt file containing result url
|
||||
one per line for each mutation
|
||||
@type string
|
||||
|
||||
returns: mcsm prediction results (raw)
|
||||
@type chr
|
||||
"""
|
||||
result_response = requests.get(result_url)
|
||||
# if results_response is not None:
|
||||
# page = results_page.text
|
||||
if result_response.status_code == 200:
|
||||
print('Fetching results')
|
||||
# extract results using the html parser
|
||||
soup = BeautifulSoup(result_response.text, features = 'html.parser')
|
||||
# print(soup)
|
||||
web_result_raw = soup.find(class_ = 'span4').get_text()
|
||||
#metatags = soup.find_all('meta')
|
||||
metatags = soup.find_all('meta', attrs={'http-equiv':'refresh'})
|
||||
#print('meta tags:', metatags)
|
||||
if metatags:
|
||||
print('WARNING: Submission not ready for URL:', result_url)
|
||||
# TODO: Add logging
|
||||
#if debug:
|
||||
# debug.warning('submission not ready for URL:', result_url)
|
||||
else:
|
||||
return web_result_raw
|
||||
else:
|
||||
sys.exit('FAIL: Could not fetch results'
|
||||
+ '\nCheck if url is valid')
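#-----------------------------------------------------------------------
# Hedged sketch (not part of the original function): because the function falls
# through (returning None) while the results page still carries the meta refresh
# tag, a caller could poll until the result is ready; the 30 s wait is arbitrary.
# web_result_raw = scrape_results(result_url)
# while web_result_raw is None:
#     time.sleep(30)
#     web_result_raw = scrape_results(result_url)
#-----------------------------------------------------------------------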
|
||||
|
||||
|
||||
def build_result_dict(web_result_raw):
|
||||
"""
|
||||
Build dict of mcsm output for a single mutation
|
||||
Format the web result (which is preformatted) to enable building the result dict
|
||||
# preformatted string object: Problematic!
|
||||
# make format consistent
|
||||
|
||||
@params web_result_raw: directly from html parser extraction
|
||||
@type string
|
||||
|
||||
@returns result dict
|
||||
@type {}
|
||||
"""
|
||||
# remove blank lines from web_result_raw
|
||||
mytext = os.linesep.join([s for s in web_result_raw.splitlines() if s])
|
||||
|
||||
# affinity change and DUET stability change cols are split over
|
||||
# multiple lines and Mutation information is empty!
|
||||
mytext = mytext.replace('ange:\n', 'ange: ')
|
||||
#print(mytext)
|
||||
|
||||
# initialise result_dict
|
||||
result_dict = {}
|
||||
for line in mytext.split('\n'):
|
||||
fields = line.split(':')
|
||||
#print(fields)
|
||||
if len(fields) > 1: # since Mutation information is empty
|
||||
dict_entry = dict([(x, y) for x, y in zip(fields[::2], fields[1::2])])
|
||||
result_dict.update(dict_entry)
|
||||
print(result_dict)
|
||||
return result_dict
|
||||
#%%
|
||||
#=======================================================================
|
||||
def format_mcsm_output(mcsm_outputcsv):
|
||||
"""
|
||||
@param mcsm_outputcsv: file containing mcsm results for all muts
|
||||
which is the result of build_result_dict() being called for each
|
||||
mutation and then converting to a pandas df and output as csv.
|
||||
@type string
|
||||
|
||||
@return formatted mcsm output
|
||||
@type pandas df
|
||||
|
||||
"""
|
||||
#############
|
||||
# Read file
|
||||
#############
|
||||
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
|
||||
|
||||
# strip white space from both ends in all columns
|
||||
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||
|
||||
dforig_shape = mcsm_data.shape
|
||||
print('dimensions of input file:', dforig_shape)
|
||||
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase, remove spaces and use '_' to join
|
||||
print('Assigning meaningful colnames i.e. without spaces or hyphens, and reflecting units'
|
||||
, '\n=======================================================')
|
||||
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
|
||||
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
|
||||
, 'Wild-type': 'wild_type' # one letter amino acid code
|
||||
, 'Position': 'position' # number
|
||||
, 'Mutant-type': 'mutant_type' # one letter amino acid code
|
||||
, 'Chain': 'chain' # single letter (caps)
|
||||
, 'Ligand ID': 'ligand_id' # 3-letter code
|
||||
, 'Distance to ligand': 'ligand_distance' # angstroms
|
||||
, 'DUET stability change': 'duet_stability_change'} # in kcal/mol
|
||||
|
||||
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
#%%=====================================================================
|
||||
#################################
|
||||
# populate mutationinformation
|
||||
# col which is currently blank
|
||||
#################################
|
||||
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
|
||||
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
|
||||
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
|
||||
print('checking after populating:\n', mcsm_data['mutationinformation']
|
||||
, '\n=======================================================')
|
||||
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within column: mutationinformation')
|
||||
mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
|
||||
print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
|
||||
, '\n=======================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# sanity check: drop duplicate muts
|
||||
#############
|
||||
# shouldn't exist as this should be eliminated at the time of running mcsm
|
||||
print('Sanity check:'
|
||||
, '\nChecking duplicate mutations')
|
||||
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
|
||||
print('PASS: No duplicate mutations detected (as expected)'
|
||||
, '\nDim of data:', mcsm_data.shape
|
||||
, '\n===================================================')
|
||||
else:
|
||||
print('WARNING: Duplicate mutations detected'
|
||||
, '\nDim of df with duplicates:', mcsm_data.shape
|
||||
, 'Removing duplicate entries')
|
||||
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
|
||||
print('Dim of data after removing duplicate muts:', mcsm_data.shape
|
||||
, '\n===========================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# Create col: duet_outcome
|
||||
#############
|
||||
# classification based on DUET stability values
|
||||
print('Assigning col: duet_outcome based on DUET stability values')
|
||||
print('Sanity check:')
|
||||
# count positive values in the DUET column
|
||||
c = mcsm_data[mcsm_data['duet_stability_change']>=0].count()
|
||||
DUET_pos = c.get(key = 'duet_stability_change')
|
||||
# Assign category based on sign (+ve : Stabilising, -ve: Destabilising, Mind the spelling (British spelling))
|
||||
mcsm_data['duet_outcome'] = np.where(mcsm_data['duet_stability_change']>=0, 'Stabilising', 'Destabilising')
|
||||
print('DUET Outcome:', mcsm_data['duet_outcome'].value_counts())
|
||||
#if DUET_pos == mcsm_data['duet_outcome'].value_counts()['Stabilising']:
|
||||
# print('PASS: DUET outcome assigned correctly')
|
||||
#else:
|
||||
# print('FAIL: DUET outcome assigned incorrectly'
|
||||
# , '\nExpected no. of stabilising mutations:', DUET_pos
|
||||
# , '\nGot no. of stabilising mutations', mcsm_data['duet_outcome'].value_counts()['Stabilising']
|
||||
# , '\n======================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# Extract numeric
|
||||
# part of ligand_distance col
|
||||
#############
|
||||
# Extract only the numeric part from col: ligand_distance
|
||||
# number: '-?\d+\.?\d*'
|
||||
mcsm_data['ligand_distance']
|
||||
print('extracting numeric part of col: ligand_distance')
|
||||
mcsm_data['ligand_distance'] = mcsm_data['ligand_distance'].str.extract(r'(\d+\.?\d*)')
|
||||
print('Ligand Distance:',mcsm_data['ligand_distance'])
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# Create 2 columns:
|
||||
# ligand_affinity_change and ligand_outcome
|
||||
#############
|
||||
# the numerical and categorical parts need to be extracted from column: PredAffLog
|
||||
# regex used
|
||||
# numerical part: '-?\d+\.?\d*'
|
||||
# categorical part: '\b(\w+ing)\b'
|
||||
print('Extracting numerical and categorical parts from the col: PredAffLog')
|
||||
print('to create two columns: ligand_affinity_change and ligand_outcome'
|
||||
, '\n=======================================================')
|
||||
|
||||
# 1) Extracting the predicted affinity change (numerical part)
|
||||
mcsm_data['ligand_affinity_change'] = mcsm_data['PredAffLog'].str.extract(r'(-?\d+\.?\d*)', expand = True)
|
||||
print(mcsm_data['ligand_affinity_change'])
|
||||
|
||||
# 2) Extracting the categorical part (Destabilizing and Stabilizing) using word boundary ('ing')
|
||||
#aff_regex = re.compile(r'\b(\w+ing)\b')
|
||||
mcsm_data['ligand_outcome']= mcsm_data['PredAffLog'].str.extract(r'(\b\w+ing\b)', expand = True)
|
||||
print(mcsm_data['ligand_outcome'])
|
||||
print(mcsm_data['ligand_outcome'].value_counts())
|
||||
|
||||
#############
|
||||
# changing spelling: British
|
||||
#############
|
||||
# ensuring spellings are consistent
|
||||
american_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||
print('Changing to British spellings for col: ligand_outcome')
|
||||
mcsm_data['ligand_outcome'].replace({'Destabilizing': 'Destabilising', 'Stabilizing': 'Stabilising'}, inplace = True)
|
||||
print(mcsm_data['ligand_outcome'].value_counts())
|
||||
british_spl = mcsm_data['ligand_outcome'].value_counts()
|
||||
# compare series values since index will differ from spelling change
|
||||
check = american_spl.values == british_spl.values
|
||||
if check.all():
|
||||
print('PASS: spelling change successful'
|
||||
, '\nNo. of predicted affinity changes:\n', british_spl
|
||||
, '\n===================================================')
|
||||
else:
|
||||
print('FAIL: spelling change unsuccessful'
|
||||
, '\nExpected:\n', american_spl
|
||||
, '\nGot:\n', british_spl
|
||||
, '\n===================================================')
sys.exit()
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# ensuring correct dtype for numeric columns
|
||||
#############
|
||||
# check dtype in cols
|
||||
print('Checking dtypes in all columns:\n', mcsm_data.dtypes
|
||||
, '\n=======================================================')
|
||||
print('Converting the following cols to numeric:'
|
||||
, '\nligand_distance'
|
||||
, '\nduet_stability_change'
|
||||
, '\nligand_affinity_change'
|
||||
, '\n=======================================================')
|
||||
|
||||
# using apply method to change stability and affinity values to numeric
|
||||
numeric_cols = ['duet_stability_change', 'ligand_affinity_change', 'ligand_distance']
|
||||
mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric)
|
||||
# check dtype in cols
|
||||
print('checking dtype after conversion')
|
||||
cols_check = mcsm_data.select_dtypes(include='float64').columns.isin(numeric_cols)
|
||||
if cols_check.all():
|
||||
print('PASS: dtypes for selected cols:', numeric_cols
|
||||
, '\nchanged to numeric'
|
||||
, '\n===================================================')
|
||||
else:
|
||||
print('FAIL: dtype change to numeric for selected cols unsuccessful'
|
||||
, '\n===================================================')
sys.exit()
|
||||
print(mcsm_data.dtypes)
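# A possible hardening step (kept commented out so the pipeline's behaviour is unchanged):
# errors='coerce' would turn unparsable strings into NaN instead of raising an exception
#mcsm_data[numeric_cols] = mcsm_data[numeric_cols].apply(pd.to_numeric, errors = 'coerce')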
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# scale duet values
|
||||
#############
|
||||
# Rescale values in DUET_change col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
duet_min = mcsm_data['duet_stability_change'].min()
|
||||
duet_max = mcsm_data['duet_stability_change'].max()
|
||||
|
||||
duet_scale = lambda x : x/abs(duet_min) if x < 0 else (x/duet_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_data['duet_scaled'] = mcsm_data['duet_stability_change'].apply(duet_scale)
|
||||
print('Raw duet scores:\n', mcsm_data['duet_stability_change']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled duet scores:\n', mcsm_data['duet_scaled'])
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# additional check added
|
||||
c2 = mcsm_data[mcsm_data['duet_scaled']>=0].count()
|
||||
DUET_pos2 = c2.get(key = 'duet_scaled')
|
||||
|
||||
if DUET_pos == DUET_pos2:
|
||||
print('\nPASS: DUET values scaled correctly')
|
||||
else:
|
||||
print('\nFAIL: DUET values scaled numbers MISmatch'
|
||||
, '\nExpected number:', DUET_pos
|
||||
, '\nGot:', DUET_pos2
|
||||
, '\n======================================================')
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
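# An equivalent vectorised form of the scaling above (an editorial sketch, kept commented out;
# it assumes duet_min < 0 <= duet_max, as implied by the lambda):
#mcsm_data['duet_scaled'] = np.where(mcsm_data['duet_stability_change'] < 0
#                                    , mcsm_data['duet_stability_change']/abs(duet_min)
#                                    , mcsm_data['duet_stability_change']/duet_max)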
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# scale affinity values
|
||||
#############
|
||||
# rescale values in affinity change col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
aff_min = mcsm_data['ligand_affinity_change'].min()
|
||||
aff_max = mcsm_data['ligand_affinity_change'].max()
|
||||
|
||||
aff_scale = lambda x : x/abs(aff_min) if x < 0 else (x/aff_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_data['affinity_scaled'] = mcsm_data['ligand_affinity_change'].apply(aff_scale)
|
||||
print('Raw affinity scores:\n', mcsm_data['ligand_affinity_change']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled affinity scores:\n', mcsm_data['affinity_scaled'])
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# additional check added
|
||||
c_lig = mcsm_data[mcsm_data['ligand_affinity_change']>=0].count()
|
||||
Lig_pos = c_lig.get(key = 'ligand_affinity_change')
|
||||
|
||||
c_lig2 = mcsm_data[mcsm_data['affinity_scaled']>=0].count()
|
||||
Lig_pos2 = c_lig2.get(key = 'affinity_scaled')
|
||||
|
||||
if Lig_pos == Lig_pos2:
|
||||
print('\nPASS: Ligand affinity values scaled correctly')
|
||||
else:
|
||||
print('\nFAIL: Ligand affinity values scaled numbers MISmatch'
|
||||
, '\nExpected number:', Lig_pos
|
||||
, '\nGot:', Lig_pos2
|
||||
, '\n======================================================')
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# adding column: wild_pos
|
||||
# useful for plots and db
|
||||
#############
|
||||
print('Creating column: wild_pos')
|
||||
mcsm_data['wild_pos'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str)
|
||||
print(mcsm_data['wild_pos'].head())
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within created column: wild_pos')
|
||||
mcsm_data['wild_pos'] = mcsm_data['wild_pos'].str.replace(' ', '')
|
||||
print('Correctly formatted column: wild_pos\n', mcsm_data['wild_pos'].head()
|
||||
, '\n=========================================================')
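# Illustrative example (hypothetical values): wild_type 'M' at position 1 gives wild_pos 'M1'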
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# adding column: wild_chain_pos
|
||||
# useful for plots and db, and it is explicit
|
||||
#############
|
||||
print('Creating column: wild_chain_pos')
|
||||
mcsm_data['wild_chain_pos'] = mcsm_data['wild_type'] + mcsm_data['chain'] + mcsm_data['position'].astype(str)
|
||||
print(mcsm_data['wild_chain_pos'].head())
|
||||
# Remove spaces b/w pasted columns
|
||||
print('removing white space within created column: wild_chain_pos')
|
||||
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
|
||||
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
|
||||
, '\n=========================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# ensuring correct dtype in non-numeric cols
|
||||
#############
|
||||
# char cols
|
||||
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain', 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
|
||||
|
||||
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
|
||||
cols_check_char = mcsm_data.select_dtypes(include = 'object').columns.isin(char_cols)
|
||||
|
||||
if cols_check_char.all():
|
||||
print('PASS: dtypes for char cols:', char_cols, 'are indeed string'
|
||||
, '\n===================================================')
|
||||
else:
|
||||
print('FAIL: dtype check for char cols unsuccessful - not all are string/object'
|
||||
, '\n===================================================')
sys.exit()
|
||||
#mcsm_data['ligand_distance', 'ligand_affinity_change'].apply(is_numeric_dtype(mcsm_data['ligand_distance', 'ligand_affinity_change']))
|
||||
print(mcsm_data.dtypes)
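# The check above only asks whether every object-dtype column is listed in char_cols;
# a complementary check (assumption: each listed col should itself be object dtype):
print('char cols are object dtype:', mcsm_data[char_cols].dtypes.eq('object').all())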
|
||||
#%%=====================================================================
|
||||
# Removing PredAffLog column as it is no longer needed
|
||||
print('Removing col: PredAffLog since relevant info has been extracted from it')
|
||||
mcsm_data_f = mcsm_data.drop(columns = ['PredAffLog'])
|
||||
#%%=====================================================================
|
||||
# sort df by position for convenience
|
||||
print('Sorting df by position')
|
||||
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
|
||||
print('sorted df:\n', mcsm_data_fs.head())
|
||||
|
||||
# Ensuring column names are lowercase before output
|
||||
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# sanity check before writing file
|
||||
#############
|
||||
expected_ncols_toadd = 6 # beware hardcoding!
|
||||
dforig_len = dforig_shape[1]
|
||||
expected_cols = dforig_len + expected_ncols_toadd
|
||||
if len(mcsm_data_fs.columns) == expected_cols:
|
||||
print('PASS: formatting successful'
|
||||
, '\nformatted df has expected no. of cols:', expected_cols
|
||||
, '\n---------------------------------------------------'
|
||||
, '\ncolnames:', mcsm_data_fs.columns
|
||||
, '\n---------------------------------------------------'
|
||||
, '\ndtypes in cols:', mcsm_data_fs.dtypes
|
||||
, '\n---------------------------------------------------'
|
||||
, '\norig data shape:', dforig_shape
|
||||
, '\nformatted df shape:', mcsm_data_fs.shape
|
||||
, '\n===================================================')
|
||||
else:
|
||||
print('FAIL: something went wrong in formatting df'
|
||||
, '\nLen of orig df:', dforig_len
|
||||
, '\nExpected number of cols to add:', expected_ncols_toadd
|
||||
, '\nExpected no. of cols:', expected_cols, '(', dforig_len, '+', expected_ncols_toadd, ')'
|
||||
, '\nGot no. of cols:', len(mcsm_data_fs.columns)
|
||||
, '\nCheck formatting:'
|
||||
, '\ncheck hardcoded value:', expected_ncols_toadd
|
||||
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
|
||||
, '\n===================================================')
|
||||
sys.exit()
|
||||
|
||||
return mcsm_data_fs
|
||||
|
219
mcsm/run_mcsm.py
Executable file
219
mcsm/run_mcsm.py
Executable file
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/env python3
|
||||
# mCSM Wrapper
|
||||
import os,sys
|
||||
import subprocess
import time # used by submit_mcsm() below
|
||||
import argparse
|
||||
import pandas as pd
|
||||
|
||||
from mcsm import *
|
||||
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
arg_parser.add_argument('-d', '--drug', help='drug name' , required=True)
|
||||
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', required=True) # case sensitive
|
||||
arg_parser.add_argument('-s', '--stage', help='mCSM Pipeline Stage', default = 'get', choices=['submit', 'get', 'format'], required=True)
|
||||
arg_parser.add_argument('-H', '--host', help='mCSM Server', default = 'http://biosig.unimelb.edu.au')
|
||||
arg_parser.add_argument('-U', '--url', help='mCSM Server URL', default = 'http://biosig.unimelb.edu.au/mcsm_lig/prediction')
|
||||
arg_parser.add_argument('-c', '--chain', help='Chain ID as per PDB, Case sensitive', default = 'A')
|
||||
arg_parser.add_argument('-l','--ligand', help='Ligand ID as per PDB, Case sensitive. REQUIRED only in "submit" stage', default = None)
|
||||
arg_parser.add_argument('-a','--affinity', help='Affinity in nM. REQUIRED only in "submit" stage', default = 10) #0.99 for pnca, gid, embb. For SP targets (alr,katg, rpob), use 10.
|
||||
|
||||
arg_parser.add_argument('-pdb','--pdb_file', help = 'PDB File')
|
||||
arg_parser.add_argument('-m','--mutation_file', help = 'Mutation File, mcsm style')
|
||||
|
||||
arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
|
||||
|
||||
# stage: submit, output url file
|
||||
arg_parser.add_argument('--url_file', help = 'Output results url file. The result of stage "submit". By default, it creates an output result url file in the output dir: "output_dir + gene.lower() + _result_urls.txt" ')
|
||||
|
||||
# stage: get, intermediate mcsm output file
|
||||
arg_parser.add_argument('--outfile_scraped', help = 'Output mcsm results scraped. The result of stage "get". By default, it creates an interim output file in the output dir: "output_dir + gene.lower() +_mcsm_output.csv" ')
|
||||
|
||||
# stage: format, formatted output with scaled values, etc
|
||||
# FIXME: Don't call this stage until you have ALL the interim results for your snps as the normalisation will be affected!
|
||||
arg_parser.add_argument('--outfile_formatted', help = 'Output mcsm results formatted. The result of stage "format". By default, it creates a formatted output file in the output dir: "output_dir + gene.lower() + _complex_mcsm_norm.csv" ')
|
||||
|
||||
arg_parser.add_argument('--debug', action='store_true', help = 'Debug Mode')
|
||||
|
||||
args = arg_parser.parse_args()
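# Example invocation (an illustrative sketch; chain/ligand/affinity are the example values
# mentioned in submit_mcsm() below, not a recommendation for any particular target):
# ./run_mcsm.py --drug <drug> --gene <gene> --stage submit --chain A --ligand RMP --affinity 10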
|
||||
#=======================================================================
|
||||
#%% variables
|
||||
#host = "http://biosig.unimelb.edu.au"
|
||||
#prediction_url = f"{host}/mcsm_lig/prediction"
|
||||
#drug = ''
|
||||
#gene = ''
|
||||
#%%=====================================================================
|
||||
# Command line options
|
||||
gene = args.gene
|
||||
drug = args.drug
|
||||
stage = args.stage
|
||||
chain = args.chain
|
||||
ligand = args.ligand
|
||||
affinity = args.affinity
|
||||
pdb_filename = args.pdb_file
|
||||
mutation_filename = args.mutation_file
|
||||
|
||||
result_urls = args.url_file
|
||||
mcsm_output = args.outfile_scraped
|
||||
outfile_format = args.outfile_formatted
|
||||
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
|
||||
DEBUG = args.debug
|
||||
|
||||
# Actual Globals :-)
|
||||
host = args.host
|
||||
prediction_url = args.url
|
||||
|
||||
# submit_mcsm globals
|
||||
homedir = os.path.expanduser('~')
|
||||
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/mcsm')
|
||||
gene_match = gene + '_p.'
|
||||
|
||||
#============
|
||||
# directories
|
||||
#============
|
||||
if not datadir:
|
||||
datadir = homedir + '/git/Data/'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + drug + '/input/'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + drug + '/output/'
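# A more defensive way to build these defaults (an editorial sketch, kept commented out;
# equivalent to the above when drug carries no stray path separators):
#indir = os.path.join(datadir, drug, 'input', '')
#outdir = os.path.join(datadir, drug, 'output', '')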
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
if pdb_filename:
|
||||
in_filename_pdb = pdb_filename
|
||||
else:
|
||||
in_filename_pdb = gene.lower() + '_complex.pdb'
|
||||
|
||||
infile_pdb = indir + in_filename_pdb
|
||||
|
||||
#in_filename_snps = gene.lower() + '_mcsm_snps.csv' #(outfile_mcsm_snps, from data_extraction.py)
|
||||
#infile_snps = outdir + '/' + in_filename_snps
|
||||
|
||||
if mutation_filename:
|
||||
in_filename_snps = mutation_filename
|
||||
else:
|
||||
in_filename_snps = gene.lower() + '_mcsm_formatted_snps.csv'
|
||||
|
||||
infile_snps = outdir + in_filename_snps
|
||||
|
||||
#=======
|
||||
# output
|
||||
#=======
|
||||
# mcsm_results globals
|
||||
if not result_urls:
|
||||
result_urls_filename = gene.lower() + '_result_urls.txt'
|
||||
result_urls = outdir + result_urls_filename
|
||||
if DEBUG:
|
||||
print('DEBUG: Result URLs:', result_urls)
|
||||
|
||||
if not mcsm_output:
|
||||
mcsm_output_filename = gene.lower() + '_mcsm_output.csv'
|
||||
mcsm_output = outdir + mcsm_output_filename
|
||||
if DEBUG:
|
||||
print('DEBUG: mCSM output CSV file:', mcsm_output)
|
||||
|
||||
# format_results globals
|
||||
#out_filename_format = gene.lower() + '_mcsm_processed.csv'
|
||||
if not outfile_format:
|
||||
out_filename_format = gene.lower() + '_complex_mcsm_norm.csv'
|
||||
outfile_format = outdir + out_filename_format
|
||||
if DEBUG:
|
||||
print('DEBUG: formatted CSV output:', outfile_format)
|
||||
#%%=====================================================================
|
||||
def submit_mcsm():
|
||||
# Example:
|
||||
# chain = 'A'
|
||||
# ligand_id = 'RMP'
|
||||
# affinity = 10
|
||||
|
||||
print('Result urls and error file (if any) will be written in: ', outdir)
|
||||
|
||||
# call function to format data to remove duplicate snps before submitting job
|
||||
mcsm_muts = format_data(infile_snps)
|
||||
mut_count = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
|
||||
infile_snps_len = os.popen('wc -l < %s' % infile_snps).read() # quicker than using Python :-)
|
||||
print('Total SNPs for', gene, ':', infile_snps_len)
|
||||
for mcsm_mut in mcsm_muts:
|
||||
print('Processing mutation: %s of %s' % (mut_count, infile_snps_len), mcsm_mut)
|
||||
if DEBUG:
|
||||
print('DEBUG: Parameters for mcsm_lig:', in_filename_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene)
|
||||
# function call: to request mcsm prediction
|
||||
# which writes file containing url for valid submissions and invalid muts to respective files
|
||||
holding_page = request_calculation(infile_pdb, mcsm_mut, chain, ligand, affinity, prediction_url, outdir, gene, host)
|
||||
time.sleep(1)
|
||||
mut_count += 1
|
||||
# result_url = write_result_url(holding_page, result_urls, host)
|
||||
|
||||
print('Request submitted'
|
||||
, '\nCAUTION: Processing will take at least ten'
|
||||
, 'minutes, but will be longer for more mutations.')
|
||||
#%%=====================================================================
|
||||
def get_results():
|
||||
output_df = pd.DataFrame()
|
||||
url_counter = 1 # HURR DURR COUNT STARTEDS AT ONE1`!1
|
||||
success_counter = 1
|
||||
infile_len = os.popen('wc -l < %s' % result_urls).read() # quicker than using Python :-)
|
||||
|
||||
print('Total URLs:', infile_len)
|
||||
|
||||
with open(result_urls, 'r') as urlfile:
|
||||
for line in urlfile:
|
||||
url_line = line.strip()
|
||||
# call functions
|
||||
results_interim = scrape_results(url_line)
|
||||
if results_interim is not None:
|
||||
print('Processing URL: %s of %s' % (url_counter, infile_len))
|
||||
result_dict = build_result_dict(results_interim)
|
||||
df = pd.DataFrame(result_dict, index=[url_counter])
|
||||
output_df = pd.concat([output_df, df]) # DataFrame.append was removed in pandas 2.x
|
||||
success_counter += 1
|
||||
url_counter += 1
|
||||
|
||||
print('Total URLs: %s Successful: %s Failed: %s' % (url_counter-1, success_counter-1, (url_counter - success_counter)))
|
||||
#print('\nOutput file created:', output_dir + gene.lower() + '_mcsm_output.csv')
|
||||
output_df.to_csv(mcsm_output, index = None, header = True)
|
||||
#%%=====================================================================
|
||||
def format_results():
|
||||
print('Input file:', mcsm_output
|
||||
, '\n============================================================='
|
||||
, '\nOutput file:', outfile_format
|
||||
, '\n=============================================================')
|
||||
|
||||
# call function
|
||||
mcsm_df_formatted = format_mcsm_output(mcsm_output)
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
mcsm_df_formatted.to_csv(outfile_format, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile:', outfile_format
|
||||
, '\nExpected no. of rows:', len(mcsm_df_formatted)
|
||||
, '\nExpected no. of cols:', len(mcsm_df_formatted.columns)
|
||||
, '\n=============================================================')
|
||||
#%%=====================================================================
|
||||
def main():
|
||||
if stage == 'submit':
|
||||
print('mCSM stage: submit mutations for mcsm analysis')
|
||||
submit_mcsm()
|
||||
elif stage == 'get':
|
||||
print('mCSM stage: get results')
|
||||
get_results()
|
||||
elif stage == 'format':
|
||||
print('mCSM stage: format results')
|
||||
format_results()
|
||||
else:
|
||||
print('ERROR: invalid stage')
|
||||
|
||||
main()
|
|
@ -1,512 +0,0 @@
|
|||
###########################
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
###########################
|
||||
# Data for bfactor figure
|
||||
# PS average
|
||||
# Lig average
|
||||
###########################
|
||||
head(my_df$Position)
|
||||
head(my_df$ratioDUET)
|
||||
# order data frame
|
||||
df = my_df[order(my_df$Position),]
|
||||
head(df$Position)
|
||||
head(df$ratioDUET)
|
||||
#***********
|
||||
# PS: average by position
|
||||
#***********
|
||||
mean_DUET_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.DUET = mean(ratioDUET))
|
||||
#***********
|
||||
# Lig: average by position
|
||||
#***********
|
||||
mean_Lig_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.Lig = mean(ratioPredAff))
|
||||
#***********
|
||||
# cbind:mean_DUET_by_position and mean_Lig_by_position
|
||||
#***********
|
||||
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
|
||||
# sanity check
|
||||
# mean_PS_Lig_Bfactor
|
||||
colnames(combined)
|
||||
colnames(combined) = c("Position"
|
||||
, "average_DUETR"
|
||||
, "Position2"
|
||||
, "average_PredAffR")
|
||||
colnames(combined)
|
||||
identical(combined$Position, combined$Position2)
|
||||
n = which(colnames(combined) == "Position2"); n
|
||||
combined_df = combined[,-n]
|
||||
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
|
||||
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
|
||||
#=============
|
||||
# output csv
|
||||
#============
|
||||
outDir = "~/Data/pyrazinamide/input/processed/"
|
||||
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
|
||||
print(paste0("Output file with path will be:","", outFile))
|
||||
head(combined_df$Position); tail(combined_df$Position)
|
||||
write.csv(combined_df, outFile
|
||||
, row.names = F)
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
require(data.table)
|
||||
require(dplyr)
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
source("../combining_two_df.R")
|
||||
###########################
|
||||
# This will return:
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
###########################
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
###########################
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
###########################
|
||||
# Data for bfactor figure
|
||||
# PS average
|
||||
# Lig average
|
||||
###########################
|
||||
head(my_df$Position)
|
||||
head(my_df$ratioDUET)
|
||||
# order data frame
|
||||
df = my_df[order(my_df$Position),]
|
||||
head(df$Position)
|
||||
head(df$ratioDUET)
|
||||
#***********
|
||||
# PS: average by position
|
||||
#***********
|
||||
mean_DUET_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.DUET = mean(ratioDUET))
|
||||
#***********
|
||||
# Lig: average by position
|
||||
#***********
|
||||
mean_Lig_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.Lig = mean(ratioPredAff))
|
||||
#***********
|
||||
# cbind:mean_DUET_by_position and mean_Lig_by_position
|
||||
#***********
|
||||
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
|
||||
# sanity check
|
||||
# mean_PS_Lig_Bfactor
|
||||
colnames(combined)
|
||||
colnames(combined) = c("Position"
|
||||
, "average_DUETR"
|
||||
, "Position2"
|
||||
, "average_PredAffR")
|
||||
colnames(combined)
|
||||
identical(combined$Position, combined$Position2)
|
||||
n = which(colnames(combined) == "Position2"); n
|
||||
combined_df = combined[,-n]
|
||||
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
|
||||
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
|
||||
#=============
|
||||
# output csv
|
||||
#============
|
||||
outDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
|
||||
print(paste0("Output file with path will be:","", outFile))
|
||||
head(combined_df$Position); tail(combined_df$Position)
|
||||
write.csv(combined_df, outFile
|
||||
, row.names = F)
|
||||
# read in pdb file complex1
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
# read in pdb file complex1
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
#########################
|
||||
#3: Read complex pdb file
|
||||
##########################
|
||||
source("Header_TT.R")
|
||||
# list of 8
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
rm(inDir, inFile)
|
||||
#====== end of script
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
complex1 = inFile
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
inFile
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
|
||||
#complex2 = inFile2
|
||||
# list of 8
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
rm(inDir, inFile, complex1)
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
source("Header_TT.R")
|
||||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
source("Header_TT.R")
|
||||
#########################################################
|
||||
# TASK: replace B-factors in the pdb file with normalised values
|
||||
# use the complex file with no water as mCSM lig was
|
||||
# performed on this file. You can check it in the script: read_pdb file.
|
||||
#########################################################
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
source("read_pdb.R") # list of 8
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
#1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: DUET scores
|
||||
hist(my_df$average_DUETR
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
plot(density(my_df$average_DUETR)
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
#1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: DUET scores
|
||||
hist(my_df$average_DUETR
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
plot(density(my_df$average_DUETR)
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
#=========
|
||||
# step 1_P1
|
||||
#=========
|
||||
# Be brave and replace in place now (don't run sanity check)
|
||||
# this makes all the B-factor values in the non-matched positions as NA
|
||||
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
|
||||
#=========
|
||||
# step 2_P1
|
||||
#=========
|
||||
# count NA in Bfactor
|
||||
b_na = sum(is.na(d$b)) ; b_na
|
||||
# count number of 0's in B-factor
|
||||
sum(d$b == 0)
|
||||
# replace all NA in b factor with 0
|
||||
d$b[is.na(d$b)] = 0
|
||||
# sanity check: should be 0
|
||||
sum(is.na(d$b))
|
||||
# sanity check: should be True
|
||||
if (sum(d$b == 0) == b_na){
|
||||
print ("Sanity check passed: NA's replaced with 0's successfully")
|
||||
} else {
|
||||
print("Error: NA replacement NOT successful, Debug code!")
|
||||
}
|
||||
max(d$b); min(d$b)
|
||||
# sanity checks: should be True
|
||||
if(max(d$b) == max(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
if (min(d$b) == min(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
#=========
|
||||
# step 3_P1
|
||||
#=========
|
||||
# sanity check: dim should be same before reassignment
|
||||
# should be TRUE
|
||||
dim(d) == dim(d2)
|
||||
#=========
|
||||
# step 4_P1
|
||||
#=========
|
||||
# assign it back to the pdb file
|
||||
my_pdb[[1]] = d
|
||||
max(d$b); min(d$b)
|
||||
#=========
|
||||
# step 5_P1
|
||||
#=========
|
||||
# output dir
|
||||
getwd()
|
||||
outDir = "~/git/Data/pyrazinamide/output/"
|
||||
getwd()
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb")
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure"
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
|
||||
write.pdb(my_pdb, outFile)
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "repalced-B")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "replaced-B")
|
||||
# graph titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
mtext(text = "DUET_stability"
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B values with PredAff Scores
|
||||
#=========================================================
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B values with PredAff Scores
|
||||
#=========================================================
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
my_df <- read.csv("../Data/mean_PS_Lig_Bfactor.csv"
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B factor with mean ratioLig scores
|
||||
#=========================================================
|
||||
#########################
|
||||
# 3: Read complex pdb file
|
||||
# from the R script
|
||||
##########################
|
||||
source("read_pdb.R") # list of 8
|
||||
# extract atom list into a variable
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
# 1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: Pred Aff scores
|
||||
hist(my_df$average_PredAffR
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
plot(density(my_df$average_PredAffR)
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
# 1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
# 2: Pred Aff scores
|
||||
hist(my_df$average_PredAffR
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
plot(density(my_df$average_PredAffR)
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
#=========
|
||||
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
|
||||
#=========
|
||||
# this makes all the B-factor values in the non-matched positions as NA
|
||||
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
|
||||
#=========
|
||||
# step 2_P2
|
||||
#=========
|
||||
# count NA in Bfactor
|
||||
b_na = sum(is.na(d$b)) ; b_na
|
||||
# count number of 0's in B-factor
|
||||
sum(d$b == 0)
|
||||
# replace all NA in b factor with 0
|
||||
d$b[is.na(d$b)] = 0
|
||||
# sanity check: should be 0
|
||||
sum(is.na(d$b))
|
||||
if (sum(d$b == 0) == b_na){
|
||||
print ("Sanity check passed: NA's replaced with 0's successfully")
|
||||
} else {
|
||||
print("Error: NA replacement NOT successful, Debug code!")
|
||||
}
|
||||
max(d$b); min(d$b)
|
||||
# sanity checks: should be True
|
||||
if (max(d$b) == max(my_df$average_PredAffR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
if (min(d$b) == min(my_df$average_PredAffR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
#=========
|
||||
# step 3_P2
|
||||
#=========
|
||||
# sanity check: dim should be same before reassignment
|
||||
# should be TRUE
|
||||
dim(d) == dim(d2)
|
||||
#=========
|
||||
# step 4_P2
|
||||
#=========
|
||||
# assign it back to the pdb file
|
||||
my_pdb[[1]] = d
|
||||
max(d$b); min(d$b)
|
||||
#=========
|
||||
# step 5_P2
|
||||
#=========
|
||||
write.pdb(my_pdb, "Plotting/structure/complex1_BwithNormLIG.pdb")
|
||||
# output dir
|
||||
getwd()
|
||||
# output dir
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb")
|
||||
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
|
||||
write.pdb(my_pdb, outFile)
|
|
@ -1,299 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
|
||||
getwd()
|
||||
|
||||
#########################################################
|
||||
# TASK: To combine mcsm and meta data with af and or
|
||||
#########################################################
|
||||
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("Header_TT.R")
|
||||
#require(data.table)
|
||||
#require(arsenal)
|
||||
#require(compare)
|
||||
#library(tidyverse)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
||||
|
||||
mcsm_data = read.csv(inFile
|
||||
, row.names = 1
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
rm(inDir, inFile)
|
||||
|
||||
str(mcsm_data)
|
||||
|
||||
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
|
||||
|
||||
# spelling Correction 1: DUET
|
||||
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
# checks: should be the same as above
|
||||
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
|
||||
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
|
||||
|
||||
# spelling Correction 2: Ligand
|
||||
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
|
||||
|
||||
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
# checks: should be the same as above
|
||||
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
|
||||
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
|
||||
|
||||
# count na in each column
|
||||
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
|
||||
|
||||
# sort by Mutationinformation
|
||||
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
|
||||
head(mcsm_data$Mutationinformation)
|
||||
|
||||
# get freq count of positions and add to the df
|
||||
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
|
||||
|
||||
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
|
||||
|
||||
###########################
|
||||
# 2: Read file: meta data with AFandOR
|
||||
###########################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
|
||||
|
||||
meta_with_afor <- read.csv(inFile2
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
|
||||
rm(inDir, inFile2)
|
||||
|
||||
str(meta_with_afor)
|
||||
|
||||
# sort by Mutationinformation
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
|
||||
# sanity check: should be True for all the mentioned columns
|
||||
#is.numeric(meta_with_afor$OR)
|
||||
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
|
||||
|
||||
c1 = NULL
|
||||
for (i in na_var){
|
||||
print(i)
|
||||
c0 = is.numeric(meta_with_afor[,i])
|
||||
c1 = c(c0, c1)
|
||||
if ( all(c1) ){
|
||||
print("Sanity check passed: These are all numeric cols")
|
||||
} else{
|
||||
print("Error: Please check your respective data types")
|
||||
}
|
||||
}
|
||||
|
||||
# If OR and P value are not numeric, convert them to numeric before counting NAs,
|
||||
# otherwise the NA count for those columns will report 0
|
||||
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
|
||||
str(na_count)
|
||||
|
||||
# compare if the No of "NA" are the same for all these cols
|
||||
na_len = NULL
|
||||
for (i in na_var){
|
||||
temp = na_count[[i]]
|
||||
na_len = c(na_len, temp)
|
||||
}
|
||||
|
||||
# extract how many NAs:
|
||||
# should be all TRUE
|
||||
# should be a single number since
|
||||
# all the cols should have "equal" and "same" no. of NAs
|
||||
|
||||
my_nrows = NULL
|
||||
for ( i in 1: (length(na_len)-1) ){
|
||||
#print(compare(na_len[i]), na_len[i+1])
|
||||
c = compare(na_len[i], na_len[i+1])
|
||||
if ( c$result ) {
|
||||
my_nrows = na_len[i] }
|
||||
else {
|
||||
print("Error: Please check your numbers")
|
||||
}
|
||||
}
|
||||
|
||||
my_nrows
|
||||
|
||||
#=#=#=#=#=#=#=#=#
|
||||
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
|
||||
# these are the same 7 ones
|
||||
#=#=#=#=#=#=#=#=#
|
||||
|
||||
# sanity check
|
||||
#which(is.na(meta_with_afor$OR))
|
||||
|
||||
# initialise an empty df with nrows as extracted above
|
||||
na_count_df = data.frame(matrix(vector(mode = 'numeric'
|
||||
# , length = length(na_var)
|
||||
)
|
||||
, nrow = my_nrows
|
||||
# , ncol = length(na_var)
|
||||
))
|
||||
|
||||
# populate the df with the indices of the cols that are NA
|
||||
for (i in na_var){
|
||||
print(i)
|
||||
na_i = which(is.na(meta_with_afor[i]))
|
||||
na_count_df = cbind(na_count_df, na_i)
|
||||
colnames(na_count_df)[which(na_var == i)] <- i
|
||||
}
|
||||
|
||||
# Now compare these indices to ensure these are the same
|
||||
c2 = NULL
|
||||
for ( i in 1: ( length(na_count_df)-1 ) ) {
|
||||
# print(na_count_df[i] == na_count_df[i+1])
|
||||
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
|
||||
c2 = c(c1, c2)
|
||||
if ( all(c2) ) {
|
||||
print("Sanity check passed: The indices for AF, OR, etc are all the same")
|
||||
} else {
|
||||
print ("Error: Please check indices which are NA")
|
||||
}
|
||||
}
|
||||
|
||||
rm( c, c0, c1, c2, i, my_nrows
|
||||
, na_count, na_i, na_len
|
||||
, na_var, temp
|
||||
, na_count_df
|
||||
, pos_count_check )
|
||||
|
||||
###########################
|
||||
# 3:merging two dfs: with NA
|
||||
###########################
|
||||
|
||||
# link col name = Mutationinformation
|
||||
head(mcsm_data$Mutationinformation)
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
|
||||
#########
|
||||
# merge 1a: meta data with mcsm
|
||||
#########
|
||||
merged_df2 = merge(x = meta_with_afor
|
||||
,y = mcsm_data
|
||||
, by = "Mutationinformation"
|
||||
, all.y = T)
|
||||
|
||||
head(merged_df2$Position)
|
||||
|
||||
# sort by Position
|
||||
head(merged_df2$Position)
|
||||
merged_df2 = merged_df2[order(merged_df2$Position),]
|
||||
head(merged_df2$Position)
|
||||
|
||||
merged_df2v2 = merge(x = meta_with_afor
|
||||
,y = mcsm_data
|
||||
, by = "Mutationinformation"
|
||||
, all.x = T)
|
||||
#!=!=!=!=!=!=!=!
|
||||
# COMMENT: used all.y since position 186 is not part of the struc,
|
||||
# hence doesn't have a mcsm value
|
||||
# but 186 is associated with a mutation
|
||||
#!=!=!=!=!=!=!=!
|
||||
|
||||
# should be False
|
||||
identical(merged_df2, merged_df2v2)
|
||||
table(merged_df2$Position%in%merged_df2v2$Position)
|
||||
|
||||
rm(merged_df2v2)
|
||||
|
||||
#########
|
||||
# merge 1b:remove duplicate mutation information
|
||||
#########
|
||||
|
||||
#==#=#=#=#=#=#
|
||||
# Cannot trust lineage, country from this df as the same mutation
|
||||
# can have many different lineages
|
||||
# but this should be good for the numerical corr plots
|
||||
#=#=#=#=#=#=#=
|
||||
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
|
||||
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
|
||||
|
||||
# sanity checks
|
||||
# nrows of merged_df3 should be the same as the nrows of mcsm_data
|
||||
if(nrow(mcsm_data) == nrow(merged_df3)){
|
||||
print("sanity check: Passed")
|
||||
} else {
|
||||
print("Error!: check data, nrows is not as expected")
|
||||
}
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# uncomment as necessary
|
||||
# only need to run this if merged_df2v2 i.e non structural pos included
|
||||
#mcsm = mcsm_data$Mutationinformation
|
||||
#my_merged = merged_df3$Mutationinformation
|
||||
|
||||
# find the index where it differs
|
||||
#diff_n = which(!my_merged%in%mcsm)
|
||||
|
||||
#check if it is indeed pos 186
|
||||
#merged_df3[diff_n,]
|
||||
|
||||
# remove this entry
|
||||
#merged_df3 = merged_df3[-diff_n,]]
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
###########################
|
||||
# 3b :merging two dfs: without NA
|
||||
###########################
|
||||
|
||||
#########
|
||||
# merge 2a:same as merge 1 but excluding NA
|
||||
#########
|
||||
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
|
||||
|
||||
#########
|
||||
# merge 2b: remove duplicate mutation information
|
||||
#########
|
||||
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
|
||||
|
||||
# alternate way of deriving merged_df3_comp
|
||||
foo = merged_df3[!is.na(merged_df3$AF),]
|
||||
# compare dfs: foo and merged_df3_comp
|
||||
all.equal(foo, merged_df3_comp)
|
||||
|
||||
summary(comparedf(foo, merged_df3_comp))
|
||||
|
||||
#=============== end of combining df
|
||||
#clear variables
|
||||
rm(mcsm_data
|
||||
, meta_with_afor
|
||||
, foo)
|
||||
|
||||
#rm(diff_n, my_merged, mcsm)
|
||||
|
||||
#=====================
|
||||
# write_output files
|
||||
#=====================
|
||||
# output dir
|
||||
outDir = "~/git/Data/pyrazinamide/output/"
|
||||
getwd()
|
||||
|
||||
outFile1 = paste0(outDir, "merged_df3.csv"); outFile1
|
||||
write.csv(merged_df3, outFile1)
|
||||
|
||||
#outFile2 = paste0(outDir, "merged_df3_comp.csv"); outFile2
|
||||
#write.csv(merged_df3_comp, outFile2)
|
||||
|
||||
rm(outDir
|
||||
, outFile1
|
||||
# , outFile2
|
||||
)
|
||||
#============================= end of script
|
||||
|
|
@ -1,348 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/")
|
||||
getwd()
|
||||
|
||||
#########################################################
|
||||
# TASK: To combine mcsm and meta data with af and or
|
||||
# by filtering for distance to ligand (<10Ang)
|
||||
#########################################################
|
||||
|
||||
#########################################################
|
||||
# Installing and loading required packages
|
||||
#########################################################
|
||||
|
||||
#source("Header_TT.R")
|
||||
#require(data.table)
|
||||
#require(arsenal)
|
||||
#require(compare)
|
||||
#library(tidyverse)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
||||
|
||||
mcsm_data = read.csv(inFile
|
||||
, row.names = 1
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
rm(inDir, inFile)
|
||||
|
||||
str(mcsm_data)
|
||||
|
||||
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
|
||||
|
||||
# spelling Correction 1: DUET
|
||||
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
# checks
|
||||
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
|
||||
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
|
||||
|
||||
# spelling Correction 2: Ligand
|
||||
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
|
||||
|
||||
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
# checks: should be the same as above
|
||||
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
|
||||
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
|
||||
|
||||
########################### !!! only for mcsm_lig
|
||||
# 4: Filter/subset data
|
||||
# Lig plots < 10Ang
|
||||
# Filter the lig plots for Dis_to_lig < 10Ang
|
||||
###########################
|
||||
|
||||
# check range of distances
|
||||
max(mcsm_data$Dis_lig_Ang)
|
||||
min(mcsm_data$Dis_lig_Ang)
|
||||
|
||||
# count
|
||||
table(mcsm_data$Dis_lig_Ang<10)
|
||||
|
||||
# subset data to have only values less than 10 Ang
|
||||
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
|
||||
|
||||
# sanity checks
|
||||
max(mcsm_data2$Dis_lig_Ang)
|
||||
min(mcsm_data2$Dis_lig_Ang)
|
||||
|
||||
# count no of unique positions
|
||||
length(unique(mcsm_data2$Position))
|
||||
|
||||
# count no of unique mutations
|
||||
length(unique(mcsm_data2$Mutationinformation))
|
||||
|
||||
# count Destabilising and Stabilising
|
||||
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT: so as not to alter the script
|
||||
mcsm_data = mcsm_data2
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(mcsm_data$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
# clear variables
|
||||
rm(mcsm_data2)
|
||||
|
||||
# count na in each column
|
||||
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
|
||||
|
||||
head(mcsm_data$Mutationinformation)
|
||||
mcsm_data[mcsm_data$Mutationinformation=="Q10P",]
|
||||
mcsm_data[mcsm_data$Mutationinformation=="L4S",]
|
||||
|
||||
# sort by Mutationinformation
|
||||
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
|
||||
head(mcsm_data$Mutationinformation)
|
||||
|
||||
# check
|
||||
mcsm_data[grep("Q10P", mcsm_data$Mutationinformation),]
|
||||
mcsm_data[grep("A102T", mcsm_data$Mutationinformation),]
|
||||
|
||||
# get freq count of positions and add to the df
|
||||
setDT(mcsm_data)[, occurrence := .N, by = .(Position)]
|
||||
|
||||
pos_count_check = data.frame(mcsm_data$Position, mcsm_data$occurrence)
|
||||
|
||||
###########################
|
||||
# 2: Read file: meta data with AFandOR
|
||||
###########################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile2 = paste0(inDir, "meta_data_with_AFandOR.csv"); inFile2
|
||||
|
||||
meta_with_afor <- read.csv(inFile2
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
|
||||
str(meta_with_afor)
|
||||
|
||||
# sort by Mutationinformation
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
|
||||
# sanity check: should be True for all the mentioned columns
|
||||
#is.numeric(meta_with_afor$OR)
|
||||
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
|
||||
|
||||
c1 = NULL
|
||||
for (i in na_var){
|
||||
print(i)
|
||||
c0 = is.numeric(meta_with_afor[,i])
|
||||
c1 = c(c0, c1)
|
||||
if ( all(c1) ){
|
||||
print("Sanity check passed: These are all numeric cols")
|
||||
} else{
|
||||
print("Error: Please check your respective data types")
|
||||
}
|
||||
}
|
||||
|
||||
# If OR and P value are not numeric, convert them to numeric before counting NAs,
|
||||
# otherwise the NA count for those columns will report 0
|
||||
|
||||
# NOW count na in each column: if you did it before, then
|
||||
# OR and Pvalue column would say 0 na since these were not numeric
|
||||
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
|
||||
str(na_count)
|
||||
|
||||
# compare if the No of "NA" are the same for all these cols
|
||||
na_len = NULL
|
||||
na_var = c("AF", "OR", "pvalue", "logor", "neglog10pvalue")
|
||||
for (i in na_var){
|
||||
temp = na_count[[i]]
|
||||
na_len = c(na_len, temp)
|
||||
}
|
||||
|
||||
my_nrows = NULL
|
||||
|
||||
for ( i in 1: (length(na_len)-1) ){
|
||||
#print(compare(na_len[i]), na_len[i+1])
|
||||
c = compare(na_len[i], na_len[i+1])
|
||||
if ( c$result ) {
|
||||
my_nrows = na_len[i] }
|
||||
else {
|
||||
print("Error: Please check your numbers")
|
||||
}
|
||||
}
|
||||
|
||||
my_nrows
|
||||
|
||||
#=#=#=#=#=#=#=#=#
|
||||
# COMMENT: AF, OR, pvalue, logor and neglog10pvalue
|
||||
# all have 81 NA, with pyrazinamide with 960
|
||||
# and these are the same 7 ones
|
||||
#=#=#=#=#=#=#=#=#
|
||||
|
||||
# sanity check
|
||||
#which(is.na(meta_with_afor$OR))
|
||||
|
||||
# initialise an empty df with nrows as extracted above
|
||||
na_count_df = data.frame(matrix(vector(mode = 'numeric'
|
||||
# , length = length(na_var)
|
||||
)
|
||||
, nrow = my_nrows
|
||||
# , ncol = length(na_var)
|
||||
))
|
||||
|
||||
# populate the df with the indices of the cols that are NA
|
||||
for (i in na_var){
|
||||
print(i)
|
||||
na_i = which(is.na(meta_with_afor[i]))
|
||||
na_count_df = cbind(na_count_df, na_i)
|
||||
colnames(na_count_df)[which(na_var == i)] <- i
|
||||
}
|
||||
|
||||
# Now compare these indices to ensure these are the same
|
||||
c2 = NULL
|
||||
for ( i in 1: ( length(na_count_df)-1 ) ) {
|
||||
# print(na_count_df[i] == na_count_df[i+1])
|
||||
c1 = identical(na_count_df[[i]], na_count_df[[i+1]])
|
||||
c2 = c(c1, c2)
|
||||
if ( all(c2) ) {
|
||||
print("Sanity check passed: The indices for AF, OR, etc are all the same")
|
||||
} else {
|
||||
print ("Error: Please check indices which are NA")
|
||||
}
|
||||
}
|
||||
|
||||
rm( c, c1, c2, i, my_nrows
|
||||
, na_count, na_i, na_len
|
||||
, na_var, temp
|
||||
, na_count_df
|
||||
, pos_count_check )
|
||||
|
||||
###########################
|
||||
# 3:merging two dfs: with NA
|
||||
###########################
|
||||
|
||||
# link col name = Mutationinformation
|
||||
head(mcsm_data$Mutationinformation)
|
||||
head(meta_with_afor$Mutationinformation)
|
||||
|
||||
#########
|
||||
# merge 1a: meta data with mcsm
|
||||
#########
|
||||
merged_df2 = merge(x = meta_with_afor
|
||||
, y = mcsm_data
|
||||
, by = "Mutationinformation"
|
||||
, all.y = T)
|
||||
|
||||
head(merged_df2$Position)
|
||||
|
||||
# sort by Position
|
||||
head(merged_df2$Position)
|
||||
merged_df2 = merged_df2[order(merged_df2$Position),]
|
||||
head(merged_df2$Position)
|
||||
|
||||
merged_df2v2 = merge(x = meta_with_afor
|
||||
,y = mcsm_data
|
||||
, by = "Mutationinformation"
|
||||
, all.x = T)
|
||||
|
||||
#!=!=!=!=!=!=!=!
|
||||
# COMMENT: used all.y since position 186 is not part of the struc,
|
||||
# hence doesn't have a mcsm value
|
||||
# but 186 is associated with a mutation
|
||||
#!=!=!=!=!=!=!=!
|
||||
|
||||
# should be False
|
||||
identical(merged_df2, merged_df2v2)
|
||||
table(merged_df2$Position%in%merged_df2v2$Position)
|
||||
|
||||
rm(merged_df2v2)
|
||||
|
||||
#########
|
||||
# merge 1b:remove duplicate mutation information
|
||||
#########
|
||||
|
||||
#==#=#=#=#=#=#
|
||||
# Cannot trust lineage, country from this df as the same mutation
|
||||
# can have many different lineages
|
||||
# but this should be good for the numerical corr plots
|
||||
#=#=#=#=#=#=#=
|
||||
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
|
||||
head(merged_df3$Position) ; tail(merged_df3$Position) # should be sorted
|
||||
|
||||
# sanity checks
|
||||
# nrows of merged_df3 should be the same as the nrows of mcsm_data
|
||||
if(nrow(mcsm_data) == nrow(merged_df3)){
|
||||
print("sanity check: Passed")
|
||||
} else {
|
||||
print("Error!: check data, nrows is not as expected")
|
||||
}
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# uncomment as necessary
|
||||
# only need to run this if using merged_df2v2, i.e. non-structural positions included
|
||||
#mcsm = mcsm_data$Mutationinformation
|
||||
#my_merged = merged_df3$Mutationinformation
|
||||
|
||||
# find the index where it differs
|
||||
#diff_n = which(!my_merged%in%mcsm)
|
||||
|
||||
#check if it is indeed pos 186
|
||||
#merged_df3[diff_n,]
|
||||
|
||||
# remove this entry
|
||||
#merged_df3 = merged_df3[-diff_n,]
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
###########################
|
||||
# 3b :merging two dfs: without NA
|
||||
###########################
|
||||
|
||||
#########
|
||||
# merge 2a:same as merge 1 but excluding NA
|
||||
#########
|
||||
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
|
||||
|
||||
#########
|
||||
# merge 2b: remove duplicate mutation information
|
||||
#########
|
||||
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
|
||||
|
||||
# FIXME: add this as a sanity check. I have manually checked!
|
||||
|
||||
# alternate way of deriving merged_df3_comp
|
||||
foo = merged_df3[!is.na(merged_df3$AF),]
|
||||
|
||||
# compare dfs: foo and merged_df3_comp
|
||||
all.equal(foo, merged_df3_comp)
|
||||
|
||||
summary(comparedf(foo, merged_df3_comp))
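# A minimal sketch of the sanity check flagged in the FIXME above (an assumption
# of what that check should be): the two routes to the complete-case df should agree
if (nrow(foo) == nrow(merged_df3_comp) &&
    all(foo$Mutationinformation %in% merged_df3_comp$Mutationinformation)){
  print("sanity check: Passed, both derivations of merged_df3_comp agree")
} else {
  print("Error!: check data, the two derivations of merged_df3_comp differ")
}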
|
||||
|
||||
#=============== end of combining df
|
||||
#clear variables
|
||||
rm(mcsm_data
|
||||
, meta_with_afor
|
||||
, foo)
|
||||
|
||||
#rm(diff_n, my_merged, mcsm)
|
||||
|
||||
#===============end of script
|
||||
|
||||
#=====================
|
||||
# write_output files
|
||||
#=====================
|
||||
|
||||
# Not required as this is a subset of the "combining_two_df.R" script
|
||||
|
|
@ -1,244 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Jun 25 08:46:36 2019
|
||||
|
||||
@author: tanushree
|
||||
"""
|
||||
############################################
|
||||
# load libraries
|
||||
import os
|
||||
import pandas as pd
|
||||
from Bio import SeqIO
|
||||
############################################
|
||||
#********************************************************************
|
||||
# TASK: Read in fasta files and create mutant sequences akin to a MSA,
|
||||
# to allow generation of logo plots
|
||||
|
||||
# Requirements:
|
||||
# input: Fasta file of protein/target for which mut seqs will be created
|
||||
# path: "Data/<drug>/input/original/<filename>"
|
||||
# output: MSA for mutant sequences
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
#***********************************************************************
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
############# specify variables for input and output paths and filenames
|
||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||
basedir = "/git/Data/pyrazinamide/input"
|
||||
|
||||
# input
|
||||
inpath = "/original"
|
||||
in_filename_fasta = "/3pl1.fasta.txt"
|
||||
infile_fasta = homedir + basedir + inpath + in_filename_fasta
|
||||
print("Input file is:", infile_fasta)
|
||||
|
||||
inpath_p = "/processed"
|
||||
in_filename_meta_data = "/meta_data_with_AFandOR.csv"
|
||||
infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data
|
||||
print("Input file is:", infile_meta_data)
|
||||
|
||||
# output: only path specified, filenames in respective sections
|
||||
outpath = "/processed"
|
||||
|
||||
################## end of variable assignment for input and output files
|
||||
#==========
|
||||
#read files
|
||||
#==========
|
||||
#############
|
||||
#fasta file
|
||||
#############
|
||||
#my_file = infile_fasta
|
||||
|
||||
my_fasta = str()
|
||||
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
|
||||
my_seq = seq_record.seq
|
||||
my_fasta = str(my_seq) #convert to a string
|
||||
print(my_fasta)
|
||||
# print( len(my_fasta) )
|
||||
# print( type(my_fasta) )
|
||||
|
||||
len(my_fasta)
|
||||
|
||||
#############
|
||||
# SNP info
|
||||
#############
|
||||
# read mutant_info file and extract cols with positions and mutant_info
|
||||
# This should be all samples with pncA muts
|
||||
#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
|
||||
#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22
|
||||
my_data = pd.read_csv(infile_meta_data) #3093, 22
|
||||
list(my_data.columns)
|
||||
|
||||
#FIXME: You need a better way to identify this
|
||||
# remove positions not in the structure
|
||||
#pos_remove = 186
|
||||
my_data = my_data[my_data.position != 186] #3092, 22
|
||||
|
||||
# if multiple positions, then try the example below;
|
||||
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
|
||||
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
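# a minimal sketch for the multiple-position case (pos_remove_list is hypothetical;
# only position 186 is actually excluded above):
#pos_remove_list = [186, 200]
#my_data = my_data[~my_data.position.isin(pos_remove_list)]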
|
||||
|
||||
#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2
|
||||
mut_info1 = my_data[['position', 'mutant_type']] #3092, 2
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
###############
|
||||
# data cleaning
|
||||
################
|
||||
# extract only those positions that have a frequency count of pos>1
|
||||
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
|
||||
|
||||
# add a column of frequency for each position
|
||||
#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3
|
||||
mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3
|
||||
|
||||
# sort by position
|
||||
mut_info2 = mut_info1.sort_values(by=['position'])
|
||||
|
||||
#FIXME
|
||||
#__main__:1: SettingWithCopyWarning:
|
||||
#A value is trying to be set on a copy of a slice from a DataFrame.
|
||||
#Try using .loc[row_indexer,col_indexer] = value instead
|
||||
|
||||
#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
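# A possible fix for the warning above (a sketch only, not applied here): take an
# explicit copy when selecting the two columns, so the later assignment works on
# an independent frame rather than a view of my_data
#mut_info1 = my_data[['position', 'mutant_type']].copy()
#mut_info1.loc[:, 'freq_pos'] = mut_info1.groupby('position')['position'].transform('count')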
|
||||
|
||||
#sort dataframe by freq values so the row indices are in order!
|
||||
#mut_info2 = mut_info1.sort_values(by = 'freq_pos'
|
||||
# , axis = 0
|
||||
# , ascending = False
|
||||
# , inplace = False
|
||||
# , na_position = 'last')
|
||||
|
||||
#mut_info2 = mut_info2.reset_index( drop = True)
|
||||
|
||||
|
||||
# count how many pos have freq 1 as you will need to exclude those
|
||||
len(mut_info2[mut_info2.freq_pos == 1]) #20
|
||||
|
||||
# extract entries with freq_pos>1
|
||||
# should be 3092-20 = 3072
|
||||
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
|
||||
|
||||
# reset index to allow iteration <<<<<<<< IMPORTANT
|
||||
mut_info = mut_info3.reset_index(drop = True)
|
||||
|
||||
del(mut_info1, mut_info2, mut_info3, my_data)
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
###################
|
||||
# generate mut seqs
|
||||
###################
|
||||
mut_seqsL = [] # one mutant sequence is appended per row of mut_info
|
||||
|
||||
# iterate
|
||||
for i, pos in enumerate(mut_info['position']):
|
||||
print('index:', i, 'position:', pos)
|
||||
mut = mut_info['mutant_type'][i]
|
||||
# print(mut)
|
||||
# print( type(mut) )
|
||||
print('index:', i, 'position:', pos, 'mutant', mut)
|
||||
|
||||
my_fastaL = list(my_fasta)
|
||||
offset_pos = pos-1 #due to counting starting from 0
|
||||
my_fastaL[offset_pos] = mut
|
||||
# print(my_fastaL)
|
||||
mut_seq = "".join(my_fastaL)
|
||||
# print(mut_seq + '\n')
|
||||
mut_seqsL.append(mut_seq)
|
||||
# print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq)
|
||||
|
||||
###############
|
||||
# sanity check
|
||||
################
|
||||
len_orig = len(my_fasta)
|
||||
# checking if all the mutant sequences have the same length as the original fasta file sequence
|
||||
for seqs in mut_seqsL:
|
||||
# print(seqs)
|
||||
# print(len(seqs))
|
||||
if len(seqs) != len_orig:
|
||||
print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig)
|
||||
else:
|
||||
print('**Hooray** Length of mutant and original sequences match')
|
||||
|
||||
del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
|
||||
|
||||
############
|
||||
# write file
|
||||
############
|
||||
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
|
||||
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
|
||||
|
||||
print(outpath)
|
||||
out_filename_gene = "/gene_msa.txt"
|
||||
outfile_gene = homedir + basedir + outpath + out_filename_gene
|
||||
print("Output file is:", outfile_gene)
|
||||
|
||||
with open(outfile_gene, 'w') as file_handler:
|
||||
for item in mut_seqsL:
|
||||
file_handler.write("{}\n".format(item))
|
||||
|
||||
R="\n".join(mut_seqsL)
|
||||
f = open('Columns.csv','w')
|
||||
f.write(R)
|
||||
f.close()
|
||||
|
||||
|
||||
#################################################################################
|
||||
# extracting only positions with SNPs so that when you plot only those positions
|
||||
################################################################################
|
||||
#mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs
|
||||
|
||||
# create a list of unique positions
|
||||
pos = mut_info['position'] #3072
|
||||
posL = sorted(set(pos)) #110 unique positions, sorted so the SNP columns follow residue order
|
||||
del(pos)
|
||||
|
||||
snp_seqsL = [] # one SNP-only sequence is appended per mutant sequence
|
||||
|
||||
for j, mut_seq in enumerate(mut_seqsL):
|
||||
print (j, mut_seq)
|
||||
# print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file)
|
||||
mut_seqsE = list(mut_seq)
|
||||
# extract specific positions (corresponding to SNPs) from the mutant sequence
|
||||
snp_seqL1 = [mut_seqsE[i-1] for i in posL] #should be 110
|
||||
# print(snp_seqL1)
|
||||
# print(len(snp_seqL1))
|
||||
snp_seq_clean = "".join(snp_seqL1)
|
||||
snp_seqsL.append(snp_seq_clean)
|
||||
|
||||
###############
|
||||
# sanity check
|
||||
################
|
||||
no_unique_snps = len(posL)
|
||||
|
||||
# checking if all the SNP-only sequences have the same length as the number of unique SNP positions
|
||||
for seqs in snp_seqsL:
|
||||
# print(seqs)
|
||||
# print(len(seqs))
|
||||
if len(seqs) != no_unique_snps:
|
||||
print('sequence lengths mismatch' +'\n', 'snp seq length:', len(seqs), 'vs no. of unique snps:', no_unique_snps)
|
||||
else:
|
||||
print('**Hooray** Length of SNP sequences matches the no. of unique SNP positions')
|
||||
|
||||
del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean)
|
||||
|
||||
|
||||
############
|
||||
# write file
|
||||
############
|
||||
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
|
||||
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt'
|
||||
|
||||
print(outpath)
|
||||
out_filename_snps = "/snps_msa.txt"
|
||||
outfile_snps = homedir + basedir + outpath + out_filename_snps
|
||||
print("Output file is:", outfile_snps)
|
||||
|
||||
with open(outfile_snps, 'w') as file_handler:
|
||||
for item in snp_seqsL:
|
||||
file_handler.write("{}\n".format(item))
|
||||
|
||||
R="\n".join(snp_seqsL)
|
||||
f = open('Columns.csv','w')
|
||||
f.write(R)
|
||||
f.close()
|
|
@ -1,9 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# run all bash scripts for mcsm
|
||||
|
||||
#./step0_check_duplicate_SNPs.sh
|
||||
#./step1_lig_output_urls.sh
|
||||
./step2_lig_results.sh
|
||||
./step3a_results_format_interim.sh
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#*************************************
|
||||
# need to be in the correct directory
|
||||
#*************************************
|
||||
##: comments for code
|
||||
#: commented out code
|
||||
|
||||
#**********************************************************************
|
||||
# TASK: Text file containing a list of SNPs; SNP in the format(C2E)
|
||||
# per line. Sort by unique, which automatically removes duplicates.
|
||||
# save the output file in the processed directory
|
||||
#**********************************************************************
|
||||
infile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2.csv"
|
||||
outfile="${HOME}/git/Data/pyrazinamide/input/processed/pnca_mis_SNPs_v2_unique.csv"
|
||||
|
||||
# sort unique entries and write them to the output file
|
||||
sort -u ${infile} > ${outfile}
|
||||
|
||||
# count no. of unique snps mCSM will run on
|
||||
count=$(wc -l < ${outfile})
|
||||
|
||||
# print to console no. of unique snps mCSM will run on
|
||||
echo "${count} unique mutations for mCSM to run on"
|
||||
|
|
@ -1,104 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#**********************************************************************
|
||||
# TASK: submit requests using curl: HANDLE redirects and refresh url.
|
||||
# Iterate over mutation file and write/append result urls to a file
|
||||
# Mutation file must have one mutation (format A1B) per line
|
||||
# Requirements
|
||||
# input: mutation list (format: A1B), complex struc: (pdb format)
|
||||
# mutation: outFile from step0, one unique mutation/line, no chain ID
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
# structure: pdb file of drug-target complex
|
||||
# path: "Data/<drug>/input/structure/<filename>"
|
||||
# output: should be n urls (n=no. of unique mutations in file)
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
|
||||
# NOTE: these are just result urls, not actual values for results
|
||||
#**********************************************************************
|
||||
############# specify variables for input and output paths and filenames
|
||||
homedir="${HOME}"
|
||||
#echo Home directory is ${homedir}
|
||||
basedir="/git/Data/pyrazinamide/input"
|
||||
|
||||
# input
|
||||
inpath_mut="/processed"
|
||||
in_filename_mut="/pnca_mis_SNPs_v2_unique.csv"
|
||||
infile_mut="${homedir}${basedir}${inpath_mut}${in_filename_mut}"
|
||||
echo Input Mut filename: ${infile_mut}
|
||||
|
||||
inpath_struc="/structure"
|
||||
in_filename_struc="/complex1_no_water.pdb"
|
||||
infile_struc="${homedir}${basedir}${inpath_struc}${in_filename_struc}"
|
||||
echo Input Struc filename: ${infile_struc}
|
||||
|
||||
# output
|
||||
outpath="/processed"
|
||||
out_filename="/complex1_result_url.txt"
|
||||
outfile="${homedir}${basedir}${outpath}${out_filename}"
|
||||
#echo Output filename: ${outfile}
|
||||
################## end of variable assignment for input and output files
|
||||
|
||||
# iterate over mutation file (infile_mut); line by line and
|
||||
# submit query using curl
|
||||
# some useful messages
|
||||
echo -n -e "Processing $(wc -l < ${infile_mut}) entries from ${infile_mut}\n"
|
||||
COUNT=0
|
||||
while read -r line; do
|
||||
((COUNT++))
|
||||
mutation="${line}"
|
||||
# echo "${mutation}"
|
||||
#pdb='../Data/complex1_no_water.pdb'
|
||||
pdb="${infile_struc}"
|
||||
mutation="${mutation}"
|
||||
chain="A"
|
||||
lig_id="PZA"
|
||||
affin_wt="0.99"
|
||||
host="http://biosig.unimelb.edu.au"
|
||||
call_url="/mcsm_lig/prediction"
|
||||
|
||||
#=========================================
|
||||
##html field_names names required for curl
|
||||
##complex_field:wild=@
|
||||
##mutation_field:mutation=@
|
||||
##chain_field:chain=@
|
||||
##ligand_field:lig_id@
|
||||
##energy_field:affin_wt
|
||||
#=========================================
|
||||
refresh_url=$(curl -L \
|
||||
-sS \
|
||||
-F "wild=@${pdb}" \
|
||||
-F "mutation=${mutation}" \
|
||||
-F "chain=${chain}" \
|
||||
-F "lig_id=${lig_id}" \
|
||||
-F "affin_wt=${affin_wt}" \
|
||||
${host}${call_url} | grep "http-equiv")
|
||||
|
||||
#echo Refresh URL: $refresh_url
|
||||
#echo Host+Refresh: ${host}${refresh_url}
|
||||
|
||||
# use regex to extract the relevant bit from the refresh url
|
||||
# regex:sed -r 's/.*(\/mcsm.*)".*$/\1/g'
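# illustrative (hypothetical) example of what the extraction does: if the grep above
# returned something like
#   <meta http-equiv="Refresh" content="0; url=/mcsm_lig/results_prediction/12345">
# the capture group (\/mcsm.*)" keeps "/mcsm_lig/results_prediction/12345",
# which is then appended to ${host} below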
|
||||
|
||||
# Now build: result url using host and refresh url and write the urls to a file
|
||||
result_url=$(echo $refresh_url | sed -r 's/.*(\/mcsm.*)".*$/\1/g')
|
||||
sleep 10
|
||||
|
||||
echo -e "${mutation} : processing entry ${COUNT}/$(wc -l < ${infile_mut})..."
|
||||
|
||||
# create output file with the added number of muts from file
|
||||
# after much thought, bad idea as less generic!
|
||||
#echo -e "${host}${result_url}" >> ../Results/$(wc -l < ${filename})_complex1_result_url.txt
|
||||
echo -e "${host}${result_url}" >> ${outfile}
|
||||
#echo -n '.'
|
||||
done < "${infile_mut}"
|
||||
|
||||
#FIXME: stop executing if error else these echo statements are misleading!
|
||||
echo
|
||||
echo Output filename: ${outfile}
|
||||
echo
|
||||
echo Number of urls saved: $(wc -l < ${outfile})
|
||||
echo
|
||||
echo "Processing Complete"
|
||||
|
||||
# end of submitting query, receiving result url and storing results url in a file
|
||||
|
|
@ -1,76 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#********************************************************************
|
||||
# TASK: submit result urls and fetch actual results using curl
|
||||
# Iterate over each result url from the output of step1 stored in processed/
|
||||
# Use curl to fetch results and extract relevant sections using hxtools
|
||||
# and store these in another file in processed/
|
||||
|
||||
# Requirements:
|
||||
# input: output of step1, file containing result urls
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
# output: name of the file where extracted results will be stored
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
|
||||
# Optional: can make these command line args you pass when calling script
|
||||
# by uncommenting code as indicated
|
||||
#*********************************************************************
|
||||
############################# uncomment: to make it command line args
|
||||
#if [ "$#" -ne 2 ]; then
|
||||
#if [ -Z $1 ]; then
|
||||
# echo "
|
||||
# Please provide both Input and Output files.
|
||||
|
||||
# Usage: batch_read_urls.sh INFILE OUTFILE
|
||||
# "
|
||||
# exit 1
|
||||
#fi
|
||||
|
||||
# First argument: Input File
|
||||
# Second argument: Output File
|
||||
#infile=$1
|
||||
#outfile=$2
|
||||
############################ end of code block to make command line args
|
||||
|
||||
############# specify variables for input and output paths and filenames
|
||||
homedir="${HOME}"
|
||||
#echo Home directory is ${homedir}
|
||||
basedir="/git/Data/pyrazinamide/input"
|
||||
|
||||
# input
|
||||
inpath="/processed"
|
||||
in_filename="/complex1_result_url.txt"
|
||||
infile="${homedir}${basedir}${inpath}${in_filename}"
|
||||
echo Input Mut filename: ${infile}
|
||||
|
||||
# output
|
||||
outpath="/processed"
|
||||
out_filename="/complex1_output_MASTER.txt"
|
||||
outfile="${homedir}${basedir}${outpath}${out_filename}"
|
||||
echo Output filename: ${outfile}
|
||||
################## end of variable assignment for input and output files
|
||||
|
||||
# Iterate over each result url, and extract results using hxtools
|
||||
# which nicely cleans and formats html
|
||||
echo -n "Processing $(wc -l < ${infile}) entries from ${infile}"
|
||||
echo
|
||||
COUNT=0
|
||||
while read -r line; do
|
||||
#COUNT=$(($COUNT+1))
|
||||
((COUNT++))
|
||||
curl --silent ${line} \
|
||||
| hxnormalize -x \
|
||||
| hxselect -c div.span4 \
|
||||
| hxselect -c div.well \
|
||||
| sed -r -e 's/<[^>]*>//g' \
|
||||
| sed -re 's/ +//g' \
|
||||
>> ${outfile}
|
||||
#| tee -a ${outfile}
|
||||
# echo -n '.'
|
||||
echo -e "Processing entry ${COUNT}/$(wc -l < ${infile})..."
|
||||
|
||||
done < "${infile}"
|
||||
|
||||
echo
|
||||
echo "Processing Complete"
|
||||
|
|
@ -1,74 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#********************************************************************
|
||||
# TASK: Intermediate results processing
|
||||
# output file has a convenient delimiter of ":" that can be used to
|
||||
# format the file into two columns (col1: field_desc and col2: values)
|
||||
# However the section "PredictedAffinityChange:...." and
|
||||
# "DUETstabilitychange:.." are split over multiple lines and
|
||||
# prevent this from happening. Additionally there are other empty lines
|
||||
# that need to be omitted. To ensure these sections are not split
|
||||
# over multiple lines, this script is written.
|
||||
|
||||
# Requirements:
|
||||
# input: output of step2, file containing mcsm results as described above
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
# output: replaces file in place.
|
||||
# Therefore first create a copy of the input file
|
||||
# but rename it to remove the word "MASTER" and add the word "processed"
|
||||
# file format: .txt
|
||||
|
||||
# NOTE: This replaces the file in place!
|
||||
# the output is a txt file with no newlines and formatting
|
||||
# to have the following format: "<colname><:><value>"
|
||||
#***********************************************************************
|
||||
############# specify variables for input and output paths and filenames
|
||||
homedir="${HOME}"
|
||||
basedir="/git/Data/pyrazinamide/input"
|
||||
|
||||
inpath="/processed"
|
||||
|
||||
# Create input file: copy and rename output file of step2
|
||||
oldfile="${homedir}${basedir}${inpath}/complex1_output_MASTER.txt"
|
||||
newfile="${homedir}${basedir}${inpath}/complex1_output_processed.txt"
|
||||
cp $oldfile $newfile
|
||||
|
||||
echo Input filename is ${oldfile}
|
||||
echo
|
||||
echo Output i.e. copied filename is ${newfile}
|
||||
|
||||
# output: no separate output per se
|
||||
# Replacement in place inside the copied file
|
||||
################## end of variable assignment for input and output files
|
||||
|
||||
#sed -i '/PredictedAffinityChange:/ { N; N; N; N; s/\n//g;}' ${newfile} \
|
||||
# | sed -i '/DUETstabilitychange:/ {x; N; N; s/\n//g; p;d;}' ${newfile}
|
||||
|
||||
# Outputs records separated by a newline, that look something like this:
|
||||
# PredictedAffinityChange:-2.2log(affinityfoldchange)-Destabilizing
|
||||
# Mutationinformation:
|
||||
# Wild-type:L
|
||||
# Position:4
|
||||
# Mutant-type:W
|
||||
# Chain:A
|
||||
# LigandID:PZA
|
||||
# Distancetoligand:15.911Å
|
||||
# DUETstabilitychange:-2.169Kcal/mol
|
||||
#
|
||||
# PredictedAffinityChange:-1.538log(affinityfoldchange)-Destabilizing
|
||||
# (...etc)
|
||||
|
||||
# This script brings everything in a convenient format for further processing in python.
|
||||
sed -i '/PredictedAffinityChange/ {
|
||||
N
|
||||
N
|
||||
N
|
||||
N
|
||||
s/\n//g
|
||||
}
|
||||
/DUETstabilitychange:/ {
|
||||
N
|
||||
N
|
||||
s/\n//g
|
||||
}
|
||||
/^$/d' ${newfile}
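# optional quick check (a suggestion, not part of the original pipeline): after the
# in-place edit, each record should start with "PredictedAffinityChange", so this
# count should equal the number of mutations submitted in step1
#grep -c "^PredictedAffinityChange" ${newfile}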
|
|
@ -1,63 +0,0 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
###################
|
||||
# load libraries
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
from collections import defaultdict
|
||||
####################
|
||||
|
||||
#********************************************************************
|
||||
# TASK: Formatting results with nice colnames
|
||||
# step3a processed the mcsm results to remove all newlines and
|
||||
# brought data in a format where the delimiter ":" splits
|
||||
# data into a convenient format of "colname": "value".
|
||||
# this script formats the data and outputs a df with each row
|
||||
# as a mutation and its corresponding mcsm_values
|
||||
|
||||
# Requirements:
|
||||
# input: output of step3a, file containing "..._output_processed.txt"
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
# output: formatted .csv file
|
||||
# path: "Data/<drug>/input/processed/<filename>"
|
||||
#***********************************************************************
|
||||
############# specify variables for input and output paths and filenames
|
||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||
basedir = "/git/Data/pyrazinamide/input"
|
||||
|
||||
# input
|
||||
inpath = "/processed"
|
||||
in_filename = "/complex1_output_processed.txt"
|
||||
infile = homedir + basedir + inpath + in_filename
|
||||
print("Input file is:", infile)
|
||||
|
||||
# output
|
||||
outpath = "/processed"
|
||||
out_filename = "/complex1_formatted_results.csv"
|
||||
outfile = homedir + basedir + outpath + out_filename
|
||||
print("Output file is:", outfile)
|
||||
################## end of variable assignment for input and output files
|
||||
|
||||
outCols=[
|
||||
'PredictedAffinityChange',
|
||||
'Mutationinformation',
|
||||
'Wild-type',
|
||||
'Position',
|
||||
'Mutant-type',
|
||||
'Chain',
|
||||
'LigandID',
|
||||
'Distancetoligand',
|
||||
'DUETstabilitychange'
|
||||
]
|
||||
|
||||
lines = [line.rstrip('\n') for line in open(infile)]
|
||||
|
||||
outputs = defaultdict(list)
|
||||
|
||||
for item in lines:
|
||||
col, val = item.split(':')
|
||||
outputs[col].append(val)
|
||||
|
||||
dfOut=pd.DataFrame(outputs)
|
||||
|
||||
pd.DataFrame.to_csv(dfOut, outfile, columns=outCols)
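# NOTE (assumption): index=True is the pandas default here, so the row index is
# written as an unnamed first column; the downstream R cleaning script appears to
# read this back in as a column named "X", so it is deliberately left in place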
|
|
@ -1,230 +0,0 @@
|
|||
getwd()
|
||||
#setwd("~/git/LSHTM_analysis/mcsm_complex1/Results")
|
||||
getwd()
|
||||
|
||||
#=======================================================
|
||||
# TASK: read formatted_results_df.csv to complete
|
||||
# missing info, adding DUET categories, assigning
|
||||
# meaningful colnames, etc.
|
||||
|
||||
# Requirements:
|
||||
# input: output of step3b, python processing,
|
||||
# path: Data/<drug>/input/processed/<filename>"
|
||||
# output: NO output as the next scripts refers to this
|
||||
# for yet more processing
|
||||
#=======================================================
|
||||
|
||||
# specify variables for input and output paths and filenames
|
||||
homedir = "~"
|
||||
basedir = "/git/Data/pyrazinamide/input"
|
||||
inpath = "/processed"
|
||||
in_filename = "/complex1_formatted_results.csv"
|
||||
infile = paste0(homedir, basedir, inpath, in_filename)
|
||||
print(paste0("Input file is:", infile))
|
||||
|
||||
#======================================================
|
||||
#TASK: To tidy the columns so you can generate figures
|
||||
#=======================================================
|
||||
####################
|
||||
#### read file #####: this will be the output from python script (csv file)
|
||||
####################
|
||||
data = read.csv(infile
|
||||
, header = T
|
||||
, stringsAsFactors = FALSE)
|
||||
dim(data)
|
||||
str(data)
|
||||
|
||||
# clear variables
|
||||
rm(homedir, basedir, inpath, in_filename, infile)
|
||||
|
||||
###########################
|
||||
##### Data processing #####
|
||||
###########################
|
||||
|
||||
# populate mutation information columns as currently it is empty
|
||||
head(data$Mutationinformation)
|
||||
tail(data$Mutationinformation)
|
||||
|
||||
# should not be blank: create mutation information
|
||||
data$Mutationinformation = paste0(data$Wild.type, data$Position, data$Mutant.type)
|
||||
|
||||
head(data$Mutationinformation)
|
||||
tail(data$Mutationinformation)
|
||||
#write.csv(data, 'test.csv')
|
||||
|
||||
##########################################
|
||||
# Remove duplicate SNPs as a sanity check
|
||||
##########################################
|
||||
# very important
|
||||
table(duplicated(data$Mutationinformation))
|
||||
|
||||
# extract duplicated entries
|
||||
dups = data[duplicated(data$Mutationinformation),] #0
|
||||
|
||||
# No of dups should match with the no. of TRUE in the above table
|
||||
#u_dups = unique(dups$Mutationinformation) #10
|
||||
sum( table(dups$Mutationinformation) )
|
||||
|
||||
#***************************************************************
|
||||
# select non-duplicated SNPs and create a new df
|
||||
df = data[!duplicated(data$Mutationinformation),]
|
||||
#***************************************************************
|
||||
# sanity check
|
||||
u = unique(df$Mutationinformation)
|
||||
u2 = unique(data$Mutationinformation)
|
||||
table(u%in%u2)
|
||||
|
||||
# should all be 1
|
||||
sum(table(df$Mutationinformation) == 1)
|
||||
|
||||
# sort df by Position
|
||||
# MANUAL CHECKPOINT:
|
||||
#foo <- df[order(df$Position),]
|
||||
#df <- df[order(df$Position),]
|
||||
|
||||
# clear variables
|
||||
rm(u, u2, dups)
|
||||
|
||||
####################
|
||||
#### give meaningful colnames to reflect units to enable correct data type
|
||||
####################
|
||||
|
||||
#=======
|
||||
#STEP 1
|
||||
#========
|
||||
# make a copy of the PredictedAffinityColumn and call it Lig_outcome
|
||||
df$Lig_outcome = df$PredictedAffinityChange
|
||||
|
||||
#make Predicted...column numeric and outcome column categorical
|
||||
head(df$PredictedAffinityChange)
|
||||
df$PredictedAffinityChange = gsub("log.*"
|
||||
, ""
|
||||
, df$PredictedAffinityChange)
|
||||
|
||||
# sanity checks
|
||||
head(df$PredictedAffinityChange)
|
||||
|
||||
# should be numeric, check and if not make it numeric
|
||||
is.numeric( df$PredictedAffinityChange )
|
||||
|
||||
# change to numeric
|
||||
df$PredictedAffinityChange = as.numeric(df$PredictedAffinityChange)
|
||||
|
||||
# should be TRUE
|
||||
is.numeric( df$PredictedAffinityChange )
|
||||
|
||||
# change the column name to indicate units
|
||||
n = which(colnames(df) == "PredictedAffinityChange"); n
|
||||
colnames(df)[n] = "PredAffLog"
|
||||
colnames(df)[n]
|
||||
|
||||
#========
|
||||
#STEP 2
|
||||
#========
|
||||
# make Lig_outcome column categorical showing effect of mutation
|
||||
head(df$Lig_outcome)
|
||||
df$Lig_outcome = gsub("^.*-"
|
||||
, "",
|
||||
df$Lig_outcome)
|
||||
# sanity checks
|
||||
head(df$Lig_outcome)
|
||||
|
||||
# should be factor, check and if not change it to factor
|
||||
is.factor(df$Lig_outcome)
|
||||
|
||||
# change to factor
|
||||
df$Lig_outcome = as.factor(df$Lig_outcome)
|
||||
|
||||
# should be TRUE
|
||||
is.factor(df$Lig_outcome)
|
||||
|
||||
#========
|
||||
#STEP 3
|
||||
#========
|
||||
# gsub
|
||||
head(df$Distancetoligand)
|
||||
df$Distancetoligand = gsub("Å"
|
||||
, ""
|
||||
, df$Distancetoligand)
|
||||
# sanity checks
|
||||
head(df$Distancetoligand)
|
||||
|
||||
# should be numeric, check if not change it to numeric
|
||||
is.numeric(df$Distancetoligand)
|
||||
|
||||
# change to numeric
|
||||
df$Distancetoligand = as.numeric(df$Distancetoligand)
|
||||
|
||||
# should be TRUE
|
||||
is.numeric(df$Distancetoligand)
|
||||
|
||||
# change the column name to indicate units
|
||||
n = which(colnames(df) == "Distancetoligand")
|
||||
colnames(df)[n] <- "Dis_lig_Ang"
|
||||
colnames(df)[n]
|
||||
|
||||
#========
|
||||
#STEP 4
|
||||
#========
|
||||
#gsub
|
||||
head(df$DUETstabilitychange)
|
||||
df$DUETstabilitychange = gsub("Kcal/mol"
|
||||
, ""
|
||||
, df$DUETstabilitychange)
|
||||
# sanity checks
|
||||
head(df$DUETstabilitychange)
|
||||
|
||||
# should be numeric, check if not change it to numeric
|
||||
is.numeric(df$DUETstabilitychange)
|
||||
|
||||
# change to numeric
|
||||
df$DUETstabilitychange = as.numeric(df$DUETstabilitychange)
|
||||
|
||||
# should be TRUE
|
||||
is.numeric(df$DUETstabilitychange)
|
||||
|
||||
# change the column name to indicate units
|
||||
n = which(colnames(df) == "DUETstabilitychange"); n
|
||||
colnames(df)[n] = "DUETStability_Kcalpermol"
|
||||
colnames(df)[n]
|
||||
|
||||
#========
|
||||
#STEP 5
|
||||
#========
|
||||
# create yet another extra column: classification of DUET stability only
|
||||
df$DUET_outcome = ifelse(df$DUETStability_Kcalpermol >=0
|
||||
, "Stabilizing"
|
||||
, "Destabilizing") # spelling to be consistent with mcsm
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
#==============================
|
||||
#FIXME
|
||||
#Insert a venn diagram
|
||||
#================================
|
||||
|
||||
#========
|
||||
#STEP 6
|
||||
#========
|
||||
# assign wild and mutant colnames correctly
|
||||
|
||||
wt = which(colnames(df) == "Wild.type"); wt
|
||||
colnames(df)[wt] <- "Wild_type"
|
||||
colnames(df[wt])
|
||||
|
||||
mut = which(colnames(df) == "Mutant.type"); mut
|
||||
colnames(df)[mut] <- "Mutant_type"
|
||||
colnames(df[mut])
|
||||
|
||||
#========
|
||||
#STEP 7
|
||||
#========
|
||||
# create an extra column: maybe useful for some plots
|
||||
df$WildPos = paste0(df$Wild_type, df$Position)
|
||||
|
||||
# clear variables
|
||||
rm(n, wt, mut)
|
||||
|
||||
################ end of data cleaning
|
|
@ -1,275 +0,0 @@
|
|||
##################
|
||||
# load libraries
|
||||
library(compare)
|
||||
##################
|
||||
|
||||
getwd()
|
||||
|
||||
#=======================================================
|
||||
# TASK:read cleaned data and perform rescaling
|
||||
# of DUET stability scores
|
||||
# of Pred affinity
|
||||
# compare scaling methods with plots
|
||||
|
||||
# Requirements:
|
||||
# input: R script, step3c_results_cleaning.R
|
||||
# path: Data/<drug>/input/processed/<filename>"
|
||||
# output: NO output as the next scripts refers to this
|
||||
# for yet more processing
|
||||
# output normalised file
|
||||
#=======================================================
|
||||
|
||||
# specify variables for input and output paths and filenames
|
||||
homedir = "~"
|
||||
currdir = "/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/mcsm"
|
||||
in_filename = "/step3c_results_cleaning.R"
|
||||
infile = paste0(homedir, currdir, in_filename)
|
||||
print(paste0("Input file is:", infile))
|
||||
|
||||
# output file
|
||||
basedir = "/git/Data/pyrazinamide/input"
|
||||
outpath = "/processed"
|
||||
out_filename = "/mcsm_complex1_normalised.csv"
|
||||
outfile = paste0(homedir, basedir, outpath, out_filename)
|
||||
print(paste0("Output file is:", outfile))
|
||||
|
||||
####################
|
||||
#### read file #####: this will be the output of my R script that cleans the data columns
|
||||
####################
|
||||
source(infile)
|
||||
|
||||
# This will output two dataframes:
|
||||
# data: unclean data: 10 cols
|
||||
# df : cleaned df: 13 cols
|
||||
# you can remove data if you want as you will not need it
|
||||
rm(data)
|
||||
|
||||
colnames(df)
|
||||
|
||||
#===================
|
||||
#3a: PredAffLog
|
||||
#===================
|
||||
n = which(colnames(df) == "PredAffLog"); n
|
||||
group = which(colnames(df) == "Lig_outcome"); group
|
||||
|
||||
#===================================================
|
||||
# order according to PredAffLog values
|
||||
#===================================================
|
||||
# This makes it easier to see the results of rescaling when debugging
|
||||
head(df$PredAffLog)
|
||||
|
||||
# ORDER BY PredAff scores: negative values at the top and positive at the bottom
|
||||
df = df[order(df$PredAffLog),]
|
||||
head(df$PredAffLog)
|
||||
|
||||
# sanity checks
|
||||
head(df[,n]) # all negatives
|
||||
tail(df[,n]) # all positives
|
||||
|
||||
# sanity checks
|
||||
mean(df[,n])
|
||||
#-0.9526746
|
||||
|
||||
tapply(df[,n], df[,group], mean)
|
||||
|
||||
#===========================
|
||||
# Same as above: in 2 steps
|
||||
#===========================
|
||||
|
||||
# find range of your data
|
||||
my_min = min(df[,n]); my_min #
|
||||
my_max = max(df[,n]); my_max #
|
||||
|
||||
#===============================================
|
||||
# WITHIN GROUP rescaling 2: method "ratio"
|
||||
# create column to store the rescaled values
|
||||
# Rescaling separately (Less dangerous)
|
||||
# =====> chosen one: preserves sign
|
||||
#===============================================
|
||||
df$ratioPredAff = ifelse(df[,n] < 0
|
||||
, df[,n]/abs(my_min)
|
||||
, df[,n]/my_max
|
||||
)# 14 cols
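# worked toy example (illustrative numbers only): with my_min = -4 and my_max = 2,
# a raw value of -2 maps to -2/abs(-4) = -0.5 and a raw value of 1 maps to 1/2 = 0.5,
# so negatives land in [-1, 0), positives in (0, 1] and the sign is preserved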
|
||||
# sanity checks
|
||||
head(df$ratioPredAff)
|
||||
tail(df$ratioPredAff)
|
||||
|
||||
min(df$ratioPredAff); max(df$ratioPredAff)
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
|
||||
# should be the same as below
|
||||
sum(df$ratioPredAff < 0); sum(df$ratioPredAff > 0)
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
#===============================================
|
||||
# Hist and density plots to compare the rescaling
|
||||
# methods: Base R
|
||||
#===============================================
|
||||
# uncomment as necessary
|
||||
my_title = "Ligand_stability"
|
||||
# my_title = colnames(df[n])
|
||||
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(2,2))
|
||||
|
||||
hist(df[,n]
|
||||
, xlab = ""
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
hist(df$ratioPredAff
|
||||
, xlab = ""
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# Plot density plots underneath
|
||||
plot(density( df[,n] )
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
plot(density( df$ratioPredAff )
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
mtext(text = my_title
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
|
||||
#clear variables
|
||||
rm(my_min, my_max, my_title, n, group)
|
||||
|
||||
#===================
|
||||
# 3b: DUET stability
|
||||
#===================
|
||||
dim(df) # 14 cols
|
||||
|
||||
n = which(colnames(df) == "DUETStability_Kcalpermol"); n # 10
|
||||
group = which(colnames(df) == "DUET_outcome"); group #12
|
||||
|
||||
#===================================================
|
||||
# order according to DUET scores
|
||||
#===================================================
|
||||
# This makes it easier to see the results of rescaling when debugging
|
||||
head(df$DUETStability_Kcalpermol)
|
||||
|
||||
# ORDER BY DUET scores: negative values at the top and positive at the bottom
|
||||
df = df[order(df$DUETStability_Kcalpermol),]
|
||||
|
||||
# sanity checks
|
||||
head(df[,n]) # negatives
|
||||
tail(df[,n]) # positives
|
||||
|
||||
# sanity checks
|
||||
mean(df[,n])
|
||||
|
||||
tapply(df[,n], df[,group], mean)
|
||||
|
||||
#===============================================
|
||||
# WITHIN GROUP rescaling 2: method "ratio"
|
||||
# create column to store the rescaled values
|
||||
# Rescaling separately (Less dangerous)
|
||||
# =====> chosen one: preserves sign
|
||||
#===============================================
|
||||
# find range of your data
|
||||
my_min = min(df[,n]); my_min
|
||||
my_max = max(df[,n]); my_max
|
||||
|
||||
df$ratioDUET = ifelse(df[,n] < 0
|
||||
, df[,n]/abs(my_min)
|
||||
, df[,n]/my_max
|
||||
) # 15 cols
|
||||
# sanity check
|
||||
head(df$ratioDUET)
|
||||
tail(df$ratioDUET)
|
||||
|
||||
min(df$ratioDUET); max(df$ratioDUET)
|
||||
|
||||
# sanity checks
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
|
||||
# should be the same as below (267 and 42)
|
||||
sum(df$ratioDUET < 0); sum(df$ratioDUET > 0)
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
#===============================================
|
||||
# Hist and density plots to compare the rescaling
|
||||
# methods: Base R
|
||||
#===============================================
|
||||
# uncomment as necessary
|
||||
my_title = "DUET_stability"
|
||||
#my_title = colnames(df[n])
|
||||
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(2,2))
|
||||
|
||||
hist(df[,n]
|
||||
, xlab = ""
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
hist(df$ratioDUET
|
||||
, xlab = ""
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# Plot density plots underneath
|
||||
plot(density( df[,n] )
|
||||
, main = "Raw values"
|
||||
)
|
||||
|
||||
plot(density( df$ratioDUET )
|
||||
, main = "ratio rescaling"
|
||||
)
|
||||
|
||||
# graph titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
mtext(text = my_title
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
# reorder by column name
|
||||
#data <- data[c("A", "B", "C")]
|
||||
colnames(df)
|
||||
df2 = df[c("X", "Mutationinformation", "WildPos", "Position"
|
||||
, "Wild_type", "Mutant_type"
|
||||
, "DUETStability_Kcalpermol", "DUET_outcome"
|
||||
, "Dis_lig_Ang", "PredAffLog", "Lig_outcome"
|
||||
, "ratioDUET", "ratioPredAff"
|
||||
, "LigandID","Chain")]
|
||||
|
||||
# sanity check
|
||||
# should be True
|
||||
#compare(df, df2, allowAll = T)
|
||||
compare(df, df2, ignoreColOrder = T)
|
||||
#TRUE
|
||||
#reordered columns
|
||||
|
||||
#===================
|
||||
# write output as csv file
|
||||
#===================
|
||||
#write.csv(df, "../Data/mcsm_complex1_normalised.csv", row.names = FALSE)
|
||||
write.csv(df2, outfile, row.names = FALSE)
|
|
@ -1,131 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
require(data.table)
|
||||
require(dplyr)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
###########################
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
###########################
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
###########################
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
###########################
|
||||
# Data for bfactor figure
|
||||
# PS average
|
||||
# Lig average
|
||||
###########################
|
||||
|
||||
head(my_df$Position)
|
||||
head(my_df$ratioDUET)
|
||||
|
||||
# order data frame
|
||||
df = my_df[order(my_df$Position),]
|
||||
|
||||
head(df$Position)
|
||||
head(df$ratioDUET)
|
||||
|
||||
#***********
|
||||
# PS: average by position
|
||||
#***********
|
||||
|
||||
mean_DUET_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.DUET = mean(ratioDUET))
|
||||
|
||||
#***********
|
||||
# Lig: average by position
|
||||
#***********
|
||||
mean_Lig_by_position <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(averaged.Lig = mean(ratioPredAff))
|
||||
|
||||
|
||||
#***********
|
||||
# cbind:mean_DUET_by_position and mean_Lig_by_position
|
||||
#***********
|
||||
|
||||
combined = as.data.frame(cbind(mean_DUET_by_position, mean_Lig_by_position ))
|
||||
|
||||
# sanity check
|
||||
# mean_PS_Lig_Bfactor
|
||||
|
||||
colnames(combined)
|
||||
|
||||
colnames(combined) = c("Position"
|
||||
, "average_DUETR"
|
||||
, "Position2"
|
||||
, "average_PredAffR")
|
||||
|
||||
colnames(combined)
|
||||
|
||||
identical(combined$Position, combined$Position2)
|
||||
|
||||
n = which(colnames(combined) == "Position2"); n
|
||||
|
||||
combined_df = combined[,-n]
|
||||
|
||||
max(combined_df$average_DUETR) ; min(combined_df$average_DUETR)
|
||||
|
||||
max(combined_df$average_PredAffR) ; min(combined_df$average_PredAffR)
|
||||
|
||||
#=============
|
||||
# output csv
|
||||
#============
|
||||
outDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
outFile = paste0(outDir, "mean_PS_Lig_Bfactor.csv")
|
||||
print(paste0("Output file with path will be:","", outFile))
|
||||
|
||||
head(combined_df$Position); tail(combined_df$Position)
|
||||
|
||||
write.csv(combined_df, outFile
|
||||
, row.names = F)
|
Binary file not shown.
|
@ -1,250 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
require(cowplot)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for OR and stability plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#my_df = merged_df3
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# sanity check
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.numeric(my_df$OR)
|
||||
#[1] TRUE
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# FOR PS Plots
|
||||
#<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
PS_df = my_df
|
||||
|
||||
rm(my_df)
|
||||
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
getwd()
|
||||
|
||||
source("combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for OR and stability plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df2 = merged_df3_comp
|
||||
#my_df2 = merged_df3
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df2)
|
||||
str(my_df2)
|
||||
|
||||
# sanity check
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.numeric(my_df2$OR)
|
||||
#[1] TRUE
|
||||
|
||||
# sanity check: should be <10
|
||||
if (max(my_df2$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
#<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
# FOR Lig Plots
|
||||
#<<<<<<<<<<<<<<<<
|
||||
|
||||
Lig_df = my_df2
|
||||
|
||||
rm(my_df2)
|
||||
|
||||
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> end of section 1
|
||||
|
||||
#############
|
||||
# Plots: Bubble plot
|
||||
# x = Position, Y = stability
|
||||
# size of dots = OR
|
||||
# col: stability
|
||||
#############
|
||||
|
||||
#=================
|
||||
# generate plot 1: DUET vs OR by position as geom_points
|
||||
#=================
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
|
||||
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#PS_df$DUET_outcome[PS_df$DUET_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(PS_df$DUET_outcome) ; sum(table(PS_df$DUET_outcome))
|
||||
|
||||
g = ggplot(PS_df, aes(x = factor(Position)
|
||||
, y = ratioDUET))
|
||||
|
||||
p1 = g +
|
||||
geom_point(aes(col = DUET_outcome
|
||||
, size = OR)) +
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "DUET(PS)"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "DUET Outcome") +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p1
|
||||
|
||||
#=================
|
||||
# generate plot 2: Lig vs OR by position as geom_points
|
||||
#=================
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# Spelling Correction: made redundant as already corrected at the source
|
||||
|
||||
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Stabilizing'] <- 'Stabilising'
|
||||
#Lig_df$Lig_outcome[Lig_df$Lig_outcome=='Destabilizing'] <- 'Destabilising'
|
||||
|
||||
table(Lig_df$Lig_outcome)
|
||||
|
||||
g = ggplot(Lig_df, aes(x = factor(Position)
|
||||
, y = ratioPredAff))
|
||||
|
||||
p2 = g +
|
||||
geom_point(aes(col = Lig_outcome
|
||||
, size = OR))+
|
||||
theme(axis.text.x = element_text(size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, legend.text = element_text(size = my_als)
|
||||
, legend.title = element_text(size = my_als) ) +
|
||||
#, legend.key.size = unit(1, "cm")) +
|
||||
labs(title = ""
|
||||
, x = "Position"
|
||||
, y = "Ligand Affinity"
|
||||
, size = "Odds Ratio"
|
||||
, colour = "Ligand Outcome"
|
||||
) +
|
||||
guides(colour = guide_legend(override.aes = list(size=4)))
|
||||
|
||||
p2
|
||||
|
||||
#======================
|
||||
#combine using cowplot
|
||||
#======================
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots"
|
||||
getwd()
|
||||
|
||||
svg('PS_Lig_OR_combined.svg', width = 32, height = 12) #inches
|
||||
#png('PS_Lig_OR_combined.png', width = 2800, height = 1080) #300dpi
|
||||
theme_set(theme_gray()) # to preserve default theme
|
||||
|
||||
printFile = cowplot::plot_grid(plot_grid(p1, p2
|
||||
, ncol = 1
|
||||
, align = 'v'
|
||||
, labels = c("A", "B")
|
||||
, label_size = my_als+5))
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
|
@ -1,154 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Barplot with scores (unordered)
|
||||
# corresponds to Lig_outcome
|
||||
# Stacked Barplot with colours: Lig_outcome @ position coloured by
|
||||
# Lig_outcome. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding Lig_outcome.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
is.factor(df$Lig_outcome)
|
||||
#TRUE
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# should be -1 and 1: may not be in this case because you have filtered the data
|
||||
# FIXME: normalisation before or after filtering?
|
||||
min(df$ratioPredAff) #
|
||||
max(df$ratioPredAff) #
|
||||
|
||||
# sanity checks
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
#******************
# generate plot
#******************

# set output dir for plots
getwd()
setwd("~/git/Data/pyrazinamide/output/plots")
getwd()

my_title = "Ligand affinity"

# axis label size
my_xaxls = 13
my_yaxls = 15

# axes text size
my_xaxts = 15
my_yaxts = 15

# no ordering of x-axis
g = ggplot(df, aes(factor(Position, ordered = T)))
g +
  geom_bar(aes(fill = Lig_outcome), colour = "grey") +
  theme( axis.text.x = element_text(size = my_xaxls
                                    , angle = 90
                                    , hjust = 1
                                    , vjust = 0.4)
         , axis.text.y = element_text(size = my_yaxls
                                      , angle = 0
                                      , hjust = 1
                                      , vjust = 0)
         , axis.title.x = element_text(size = my_xaxts)
         , axis.title.y = element_text(size = my_yaxts ) ) +
  labs(title = my_title
       , x = "Position"
       , y = "Frequency")

# for sanity and good practice
rm(df)
#======================= end of plot
# axis colours labels
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -1,149 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot 2: Barplot with scores (unordered)
|
||||
# corresponds to DUET_outcome
|
||||
# Stacked Barplot with colours: DUET_outcome @ position coloured by
|
||||
# DUET outcome. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding DUET_outcome
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
is.factor(df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
# should be -1 and 1
|
||||
min(df$ratioDUET)
|
||||
max(df$ratioDUET)
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
my_title = "Protein stability (DUET)"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
g = ggplot(df, aes(factor(Position, ordered = T)))
|
||||
g +
|
||||
geom_bar(aes(fill = DUET_outcome), colour = "grey") +
|
||||
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -1,202 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
source("../barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$Lig_outcome)
|
||||
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
|
||||
is.factor(my_df$Lig_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Barplot with scores (unordered)
|
||||
# corresponds to Lig_outcome
|
||||
# Stacked Barplot with colours: Lig_outcome @ position coloured by
|
||||
# stability scores. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding Lig stability value.
|
||||
# Normalised values (range between -1 and 1 ) to aid visualisation
|
||||
# NOTE: since barplot plots discrete values, colour = score, so number of
|
||||
# colours will be equal to the no. of unique normalised scores
|
||||
# rather than a continuous scale
|
||||
# will require generating the colour scale separately.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# should be -1 and 1: may not be in this case because you have filtered the data
|
||||
# FIXME: normalisation before or after filtering?
|
||||
min(df$ratioPredAff) #
|
||||
max(df$ratioPredAff) #
|
||||
|
||||
# sanity checks
|
||||
# very important!!!!
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, min)
|
||||
|
||||
tapply(df$ratioPredAff, df$Lig_outcome, max)
|
||||
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
# in my case;
|
||||
# df = df
|
||||
# group = Lig_outcome
|
||||
# subgroup = normalised score i.e ratioPredAff
|
||||
|
||||
# Prepare data: round off ratioLig scores
|
||||
# round off to 3 significant digits:
|
||||
# 165 if no rounding is performed: used to generate the original graph
|
||||
# 156 if rounded to 3 places
|
||||
# FIXME: check if reducing precision creates any ML prob
|
||||
|
||||
# check unique values in normalised data
|
||||
u = unique(df$ratioPredAff)
|
||||
|
||||
# <<<<< -------------------------------------------
|
||||
# Run this section if rounding is to be used
|
||||
# specify number for rounding
|
||||
n = 3
|
||||
df$ratioLigR = round(df$ratioPredAff, n)
|
||||
u = unique(df$ratioLigR) # 156
|
||||
# create an extra column called group which contains the "gp name and score"
|
||||
# so colours can be generated for each unique values in this column
|
||||
my_grp = df$ratioLigR
|
||||
df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# else
|
||||
# uncomment the below if rounding is not required
|
||||
|
||||
#my_grp = df$ratioPredAff
|
||||
#df$group <- paste0(df$Lig_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# <<<<< -----------------------------------------------
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
colours <- ColourPalleteMulti(df, "Lig_outcome", "my_grp")
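
# NOTE (added): ColourPalleteMulti() used above comes from ../barplot_colour_function.R
# (sourced earlier), which is not shown here. The helper sketched below is only
# an illustration of the kind of function assumed: one hue per group, shaded
# across the subgroup values within it. Its name, arguments and use of the
# scales package are assumptions, not the sourced implementation.
ColourPalleteMulti_sketch = function(df, group, subgroup){
  # no. of distinct subgroup values within each group
  categories = aggregate(as.formula(paste(subgroup, group, sep = "~"))
                         , df
                         , function(x) length(unique(x)))
  # a light and a dark anchor colour per group
  category_start = scales::hue_pal(l = 100)(nrow(categories))
  category_end   = scales::hue_pal(l = 40)(nrow(categories))
  # expand each group's anchors into as many shades as it has subgroup values
  unlist(lapply(seq_len(nrow(categories)), function(i){
    colorRampPalette(c(category_start[i], category_end[i]))(categories[i, 2])
  }))
}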
|
||||
my_title = "Ligand affinity"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
g = ggplot(df, aes(factor(Position, ordered = T)))
|
||||
g +
|
||||
geom_bar(aes(fill = group), colour = "grey") +
|
||||
scale_fill_manual( values = colours
|
||||
, guide = 'none') +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -1,192 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
source("../barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Barplot with scores (unordered)
|
||||
# corresponds to DUET_outcome
|
||||
# Stacked Barplot with colours: DUET_outcome @ position coloured by
|
||||
# stability scores. This is a barplot where each bar corresponds
|
||||
# to a SNP and is coloured by its corresponding DUET stability value.
|
||||
# Normalised values (range between -1 and 1 ) to aid visualisation
|
||||
# NOTE: since barplot plots discrete values, colour = score, so number of
|
||||
# colours will be equal to the no. of unique normalised scores
|
||||
# rather than a continuous scale
|
||||
# will require generating the colour scale separately.
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
upos = unique(df$Position)
|
||||
|
||||
# should be a factor
|
||||
is.factor(df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
# should be -1 and 1
|
||||
min(df$ratioDUET)
|
||||
max(df$ratioDUET)
|
||||
|
||||
tapply(df$ratioDUET, df$DUET_outcome, min)
|
||||
tapply(df$ratioDUET, df$DUET_outcome, max)
|
||||
|
||||
#******************
|
||||
# generate plot
|
||||
#******************
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
# My colour FUNCTION: based on group and subgroup
|
||||
# in my case;
|
||||
# df = df
|
||||
# group = DUET_outcome
|
||||
# subgroup = normalised score i.e ratioDUET
|
||||
|
||||
# Prepare data: round off ratioDUET scores
|
||||
# round off to 3 significant digits:
|
||||
# 323 if no rounding is performed: used to generate the original graph
|
||||
# 287 if rounded to 3 places
|
||||
# FIXME: check if reducing precision creates any ML prob
|
||||
|
||||
# check unique values in normalised data
|
||||
u = unique(df$ratioDUET)
|
||||
|
||||
# <<<<< -------------------------------------------
|
||||
# Run this section if rounding is to be used
|
||||
# specify number for rounding
|
||||
n = 3
|
||||
df$ratioDUETR = round(df$ratioDUET, n)
|
||||
u = unique(df$ratioDUETR)
|
||||
# create an extra column called group which contains the "gp name and score"
|
||||
# so colours can be generated for each unique values in this column
|
||||
my_grp = df$ratioDUETR
|
||||
df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# else
|
||||
# uncomment the below if rounding is not required
|
||||
|
||||
#my_grp = df$ratioDUET
|
||||
#df$group <- paste0(df$DUET_outcome, "_", my_grp, sep = "")
|
||||
|
||||
# <<<<< -----------------------------------------------
|
||||
|
||||
# Call the function to create the palette based on the group defined above
|
||||
colours <- ColourPalleteMulti(df, "DUET_outcome", "my_grp")
|
||||
my_title = "Protein stability (DUET)"
|
||||
|
||||
# axis label size
|
||||
my_xaxls = 13
|
||||
my_yaxls = 15
|
||||
|
||||
# axes text size
|
||||
my_xaxts = 15
|
||||
my_yaxts = 15
|
||||
|
||||
# no ordering of x-axis
|
||||
g = ggplot(df, aes(factor(Position, ordered = T)))
|
||||
g +
|
||||
geom_bar(aes(fill = group), colour = "grey") +
|
||||
scale_fill_manual( values = colours
|
||||
, guide = 'none') +
|
||||
theme( axis.text.x = element_text(size = my_xaxls
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
, axis.text.y = element_text(size = my_yaxls
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(size = my_xaxts)
|
||||
, axis.title.y = element_text(size = my_yaxts ) ) +
|
||||
labs(title = my_title
|
||||
, x = "Position"
|
||||
, y = "Frequency")
|
||||
|
||||
# for sanity and good practice
|
||||
rm(df)
|
||||
#======================= end of plot
|
||||
# axis colours labels
|
||||
# https://stackoverflow.com/questions/38862303/customize-ggplot2-axis-labels-with-different-colors
|
||||
# https://stackoverflow.com/questions/56543485/plot-coloured-boxes-around-axis-label
|
|
@ -1,215 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#require(data.table)
|
||||
#require(dplyr)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$Lig_outcome)
|
||||
my_df$Lig_outcome = as.factor(my_df$Lig_outcome)
|
||||
is.factor(my_df$Lig_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Basic barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
if (identical(df$Position, df$position)){
|
||||
print("Sanity check passed: Columns 'Position' and 'position' are identical")
|
||||
} else{
|
||||
print("Error!: Check column names and info contained")
|
||||
}
|
||||
|
||||
#****************
|
||||
# generate plot: No of stabilising and destabilising muts
|
||||
#****************
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('basic_barplots_LIG.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
# uncomment as necessary for either directly outputting results or
|
||||
# printing on the screen
|
||||
g = ggplot(df, aes(x = Lig_outcome))
|
||||
prinfFile = g + geom_bar(
#g + geom_bar(
|
||||
aes(fill = Lig_outcome)
|
||||
, show.legend = TRUE
|
||||
) + geom_label(
|
||||
stat = "count"
|
||||
, aes(label = ..count..)
|
||||
, color = "black"
|
||||
, show.legend = FALSE
|
||||
, size = 10) + theme(
|
||||
axis.text.x = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size=my_als)
|
||||
, axis.text.y = element_text(size = my_ats)
|
||||
, legend.position = c(0.73,0.8)
|
||||
, legend.text = element_text(size=my_als-2)
|
||||
, legend.title = element_text(size=my_als)
|
||||
, plot.title = element_blank()
|
||||
) + labs(
|
||||
title = ""
|
||||
, y = "Number of SNPs"
|
||||
#, fill='Ligand Outcome'
|
||||
) + scale_fill_discrete(name = "Ligand Outcome"
|
||||
, labels = c("Destabilising", "Stabilising"))
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
|
||||
#****************
|
||||
# generate plot: No of positions
|
||||
#****************
|
||||
#get freq count of positions so you can subset freq<1
|
||||
#require(data.table)
|
||||
setDT(df)[, pos_count := .N, by = .(Position)] #169, 36
|
||||
|
||||
head(df$pos_count)
|
||||
table(df$pos_count)
|
||||
# this is cumulative
|
||||
#1 2 3 4 5 6
|
||||
#5 24 36 56 30 18
|
||||
|
||||
# use group by on this
|
||||
snpsBYpos_df <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
|
||||
table(snpsBYpos_df$snpsBYpos)
|
||||
#1 2 3 4 5 6
|
||||
#5 12 12 14 6 3
|
||||
# this is what will get plotted
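
# NOTE (added): since pos_count is constant within a position, the same summary
# can be had in one step with dplyr::count(); only a sketch, kept here for
# cross-checking rather than replacing the code above.
#snpsBYpos_chk = dplyr::count(df, Position)
#table(snpsBYpos_chk$n) # should match table(snpsBYpos_df$snpsBYpos)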
|
||||
|
||||
svg('position_count_LIG.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes (alpha = 0.5)
|
||||
, show.legend = FALSE
|
||||
) +
|
||||
geom_label(
|
||||
stat = "count", aes(label = ..count..)
|
||||
, color = "black"
|
||||
, size = 10
|
||||
) +
|
||||
theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
)
|
||||
, axis.text.y = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, plot.title = element_blank()
|
||||
) +
|
||||
labs(
|
||||
x = "Number of SNPs"
|
||||
, y = "Number of Sites"
|
||||
)
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
########################################################################
|
||||
# end of Lig barplots #
|
||||
########################################################################
|
||||
|
||||
|
|
@ -1,211 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for DUET plots
|
||||
# you need merged_df3
|
||||
# or
|
||||
# merged_df3_comp
|
||||
# since these have unique SNPs
|
||||
# I prefer to use the merged_df3
|
||||
# because using the _comp dataset means
|
||||
# we lose some muts and at this level, we should use
|
||||
# as much info as available
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
# sanity check
|
||||
is.factor(my_df$DUET_outcome)
|
||||
my_df$DUET_outcome = as.factor(my_df$DUET_outcome)
|
||||
is.factor(my_df$DUET_outcome)
|
||||
#[1] TRUE
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Basic barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
if (identical(df$Position, df$position)){
|
||||
print("Sanity check passed: Columns 'Position' and 'position' are identical")
|
||||
} else{
|
||||
print("Error!: Check column names and info contained")
|
||||
}
|
||||
|
||||
#****************
|
||||
# generate plot: No of stabilising and destabilising muts
|
||||
#****************
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('basic_barplots_DUET.svg')
|
||||
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
theme_set(theme_grey())
|
||||
|
||||
# uncomment as necessary for either directly outputting results or
|
||||
# printing on the screen
|
||||
g = ggplot(df, aes(x = DUET_outcome))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes(fill = DUET_outcome)
|
||||
, show.legend = TRUE
|
||||
) + geom_label(
|
||||
stat = "count"
|
||||
, aes(label = ..count..)
|
||||
, color = "black"
|
||||
, show.legend = FALSE
|
||||
, size = 10) + theme(
|
||||
axis.text.x = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_text(size=my_als)
|
||||
, axis.text.y = element_text(size = my_ats)
|
||||
, legend.position = c(0.73,0.8)
|
||||
, legend.text = element_text(size=my_als-2)
|
||||
, legend.title = element_text(size=my_als)
|
||||
, plot.title = element_blank()
|
||||
) + labs(
|
||||
title = ""
|
||||
, y = "Number of SNPs"
|
||||
#, fill='DUET Outcome'
|
||||
) + scale_fill_discrete(name = "DUET Outcome"
|
||||
, labels = c("Destabilising", "Stabilising"))
|
||||
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
|
||||
#****************
|
||||
# generate plot: No of positions
|
||||
#****************
|
||||
#get freq count of positions so you can subset freq<1
|
||||
#setDT(df)[, .(Freq := .N), by = .(Position)] #189, 36
|
||||
|
||||
setDT(df)[, pos_count := .N, by = .(Position)] #335, 36
|
||||
table(df$pos_count)
|
||||
# this is cumulative
|
||||
#1 2 3 4 5 6
|
||||
#34 76 63 104 40 18
|
||||
|
||||
# use group by on this
|
||||
snpsBYpos_df <- df %>%
|
||||
group_by(Position) %>%
|
||||
summarize(snpsBYpos = mean(pos_count))
|
||||
|
||||
table(snpsBYpos_df$snpsBYpos)
|
||||
#1 2 3 4 5 6
|
||||
#34 38 21 26 8 3
|
||||
|
||||
foo = select(df, Mutationinformation
|
||||
, WildPos
|
||||
, wild_type
|
||||
, mutant_type
|
||||
, mutation_info
|
||||
, position
|
||||
, pos_count) #335, 5
|
||||
|
||||
getwd()
|
||||
write.csv(foo, "../Data/pos_count_freq.csv")
|
||||
|
||||
svg('position_count_DUET.svg')
|
||||
my_ats = 25 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(snpsBYpos_df, aes(x = snpsBYpos))
|
||||
prinfFile = g + geom_bar(
|
||||
#g + geom_bar(
|
||||
aes (alpha = 0.5)
|
||||
, show.legend = FALSE
|
||||
) +
|
||||
geom_label(
|
||||
stat = "count", aes(label = ..count..)
|
||||
, color = "black"
|
||||
, size = 10
|
||||
) +
|
||||
theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
)
|
||||
, axis.text.y = element_text(
|
||||
size = my_ats
|
||||
, angle = 0
|
||||
, hjust = 1
|
||||
)
|
||||
, axis.title.x = element_text(size = my_als)
|
||||
, axis.title.y = element_text(size = my_als)
|
||||
, plot.title = element_blank()
|
||||
) +
|
||||
labs(
|
||||
x = "Number of SNPs"
|
||||
, y = "Number of Sites"
|
||||
)
|
||||
print(prinfFile)
|
||||
dev.off()
|
||||
########################################################################
|
||||
# end of DUET barplots #
|
||||
########################################################################
|
||||
|
|
@ -1,175 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages and functions #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for PS Corr plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Correlation plots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
table(df$DUET_outcome)
|
||||
|
||||
# unique positions
|
||||
length(unique(df$Position)) #{RESULT: unique positions for comp data}
|
||||
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
corr_data = df[, c("ratioDUET"
|
||||
# , "ratioPredAff"
|
||||
# , "DUETStability_Kcalpermol"
|
||||
# , "PredAffLog"
|
||||
# , "OR"
|
||||
, "logor"
|
||||
# , "pvalue"
|
||||
, "neglog10pvalue"
|
||||
, "AF"
|
||||
, "DUET_outcome"
|
||||
# , "Lig_outcome"
|
||||
, "pyrazinamide"
|
||||
)]
|
||||
dim(corr_data)
|
||||
rm(df)
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c("DUET"
|
||||
# , "Ligand Affinity"
|
||||
# , "DUET_raw"
|
||||
# , "Lig_raw"
|
||||
# , "OR"
|
||||
, "Log(Odds Ratio)"
|
||||
# , "P-value"
|
||||
, "-LogP"
|
||||
, "Allele Frequency"
|
||||
, "DUET_outcome"
|
||||
# , "Lig_outcome"
|
||||
, "pyrazinamide")
|
||||
|
||||
# sanity check
|
||||
if (length(my_corr_colnames) == length(corr_data)){
|
||||
print("Sanity check passed: corr_data and corr_names match in length")
|
||||
}else{
|
||||
print("Error: length mismatch!")
|
||||
}
|
||||
|
||||
colnames(corr_data)
|
||||
colnames(corr_data) <- my_corr_colnames
|
||||
colnames(corr_data)
|
||||
|
||||
###############
|
||||
# PLOTS: corr
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
###############
|
||||
#default pairs plot
|
||||
start = 1
|
||||
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr = corr_data[start:(end-offset)]
|
||||
head(my_corr)
|
||||
|
||||
#my_cols = c("#f8766d", "#00bfc4")
|
||||
# deep blue :#007d85
|
||||
# deep red: #ae301e
|
||||
|
||||
#==========
|
||||
# psych: informative since it draws the ellipsoid
|
||||
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
#==========
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots"
|
||||
getwd()
|
||||
|
||||
svg('DUET_corr.svg', width = 15, height = 15)
|
||||
printFile = pairs.panels(my_corr[1:4]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$DUET_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
#, alpha = .05
|
||||
#, points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 3
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
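
# NOTE (added): a quick numeric check of the same Spearman correlations shown by
# pairs.panels(); only a sketch, using pairwise complete observations.
#round(cor(my_corr[1:4], method = "spearman", use = "pairwise.complete.obs"), 2)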
|
|
@ -1,187 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for Lig Corr plots
|
||||
# you need merged_df3_comp
|
||||
# since these are matched
|
||||
# to allow pairwise corr
|
||||
###########################
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#===========================
|
||||
# Plot: Correlation plots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df = my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(my_df)
|
||||
|
||||
# sanity checks
|
||||
str(df)
|
||||
|
||||
table(df$Lig_outcome)
|
||||
|
||||
# unique positions
|
||||
length(unique(df$Position)) #{RESULT: unique positions for comp data}
|
||||
|
||||
# subset data to generate pairwise correlations
|
||||
corr_data = df[, c(#"ratioDUET",
|
||||
"ratioPredAff"
|
||||
# , "DUETStability_Kcalpermol"
|
||||
# , "PredAffLog"
|
||||
# , "OR"
|
||||
, "logor"
|
||||
# , "pvalue"
|
||||
, "neglog10pvalue"
|
||||
, "AF"
|
||||
# , "DUET_outcome"
|
||||
, "Lig_outcome"
|
||||
, "pyrazinamide"
|
||||
)]
|
||||
dim(corr_data)
|
||||
rm(df)
|
||||
|
||||
# assign nice colnames (for display)
|
||||
my_corr_colnames = c(#"DUET",
|
||||
"Ligand Affinity"
|
||||
# ,"DUET_raw"
|
||||
# , "Lig_raw"
|
||||
# , "OR"
|
||||
, "Log(Odds Ratio)"
|
||||
# , "P-value"
|
||||
, "-LogP"
|
||||
, "Allele Frequency"
|
||||
# , "DUET_outcome"
|
||||
, "Lig_outcome"
|
||||
, "pyrazinamide")
|
||||
|
||||
# sanity check
|
||||
if (length(my_corr_colnames) == length(corr_data)){
|
||||
print("Sanity check passed: corr_data and corr_names match in length")
|
||||
}else{
|
||||
print("Error: length mismatch!")
|
||||
}
|
||||
|
||||
colnames(corr_data)
|
||||
colnames(corr_data) <- my_corr_colnames
|
||||
colnames(corr_data)
|
||||
|
||||
###############
|
||||
# PLOTS: corr
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
###############
|
||||
|
||||
# default pairs plot
|
||||
start = 1
|
||||
end = which(colnames(corr_data) == "pyrazinamide"); end # should be the last column
|
||||
offset = 1
|
||||
|
||||
my_corr = corr_data[start:(end-offset)]
|
||||
head(my_corr)
|
||||
|
||||
#my_cols = c("#f8766d", "#00bfc4")
|
||||
# deep blue :#007d85
|
||||
# deep red: #ae301e
|
||||
|
||||
#==========
|
||||
# psych: informative since it draws the ellipsoid
|
||||
# https://jamesmarquezportfolio.com/correlation_matrices_in_r.html
|
||||
# http://www.sthda.com/english/wiki/scatter-plot-matrices-r-base-graphs
|
||||
#==========
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots"
|
||||
getwd()
|
||||
|
||||
svg('Lig_corr.svg', width = 15, height = 15)
|
||||
printFile = pairs.panels(my_corr[1:4]
|
||||
, method = "spearman" # correlation method
|
||||
, hist.col = "grey" ##00AFBB
|
||||
, density = TRUE # show density plots
|
||||
, ellipses = F # show correlation ellipses
|
||||
, stars = T
|
||||
, rug = F
|
||||
, breaks = "Sturges"
|
||||
, show.points = T
|
||||
, bg = c("#f8766d", "#00bfc4")[unclass(factor(my_corr$Lig_outcome))]
|
||||
, pch = 21
|
||||
, jitter = T
|
||||
# , alpha = .05
|
||||
# , points(pch = 19, col = c("#f8766d", "#00bfc4"))
|
||||
, cex = 3
|
||||
, cex.axis = 2.5
|
||||
, cex.labels = 3
|
||||
, cex.cor = 1
|
||||
, smooth = F
|
||||
)
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
|
@ -1,227 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#==========================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2, comprehensive one
|
||||
# since this has one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
#my_df = merged_df2_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage barplot
|
||||
# x = lineage y = No. of samples
|
||||
# col = Lineage
|
||||
# fill = lineage
|
||||
#============================
|
||||
table(my_df$lineage)
|
||||
|
||||
# lineage1 lineage2 lineage3 lineage4 lineage5 lineage6 lineageBOV
|
||||
#3 104 1293 264 1311 6 6 105
|
||||
|
||||
#===========================
|
||||
# Plot: Lineage Barplots
|
||||
#===========================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df <- my_df
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
rm(my_df)
|
||||
|
||||
# get freq count of positions so you can subset freq<1
|
||||
#setDT(df)[, lineage_count := .N, by = .(lineage)]
|
||||
|
||||
#******************
|
||||
# generate plot: barplot of mutation by lineage
|
||||
#******************
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
df_lin = subset(df, subset = lineage %in% sel_lineages )
|
||||
|
||||
#FIXME; add sanity check for numbers.
|
||||
# Done this manually
|
||||
|
||||
############################################################
|
||||
|
||||
#########
|
||||
# Data for barplot: Lineage barplot
|
||||
# to show total samples and number of unique mutations
|
||||
# within each linege
|
||||
##########
|
||||
|
||||
# Create df with lineage inform & no. of unique mutations
|
||||
# per lineage and total samples within lineage
|
||||
# this is essentially barplot with two y axis
|
||||
|
||||
bar = as.data.frame(sel_lineages) #4, 1
|
||||
total_snps_u = NULL
|
||||
total_samples = NULL
|
||||
|
||||
for (i in sel_lineages){
|
||||
#print(i)
|
||||
curr_total = length(unique(df$id[df$lineage==i]))
|
||||
total_samples = c(total_samples, curr_total)
|
||||
print(total_samples)
|
||||
|
||||
foo = df[df$lineage==i,]
|
||||
print(paste0(i, "======="))
|
||||
print(length(unique(foo$Mutationinformation)))
|
||||
curr_count = length(unique(foo$Mutationinformation))
|
||||
|
||||
total_snps_u = c(total_snps_u, curr_count)
|
||||
}
|
||||
|
||||
print(total_snps_u)
|
||||
bar$num_snps_u = total_snps_u
|
||||
bar$total_samples = total_samples
|
||||
bar
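
# NOTE (added): the same summary can be built without the explicit loop; only a
# sketch, assuming dplyr (loaded via Header_TT.R) and the df_lin subset above.
#bar_chk = df_lin %>%
#  group_by(lineage) %>%
#  summarise(num_snps_u = n_distinct(Mutationinformation)
#            , total_samples = n_distinct(id))
#bar_chk # should agree with bar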
|
||||
|
||||
#*****************
|
||||
# generate plot: lineage barplot with two y-axis
|
||||
#https://stackoverflow.com/questions/13035295/overlay-bar-graphs-in-ggplot2
|
||||
#*****************
|
||||
|
||||
y1 = bar$num_snps_u
y2 = bar$total_samples
x = sel_lineages
|
||||
|
||||
to_plot = data.frame(x = x
|
||||
, y1 = y1
|
||||
, y2 = y2)
|
||||
to_plot
|
||||
|
||||
melted = melt(to_plot, id = "x")
|
||||
melted
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('lineage_basic_barplot.svg')
|
||||
|
||||
my_ats = 20 # axis text size
|
||||
my_als = 22 # axis label size
|
||||
|
||||
g = ggplot(melted
|
||||
, aes(x = x
|
||||
, y = value
|
||||
, fill = variable)
|
||||
)
|
||||
|
||||
|
||||
printFile = g + geom_bar(
|
||||
|
||||
#g + geom_bar(
|
||||
stat = "identity"
|
||||
, position = position_stack(reverse = TRUE)
|
||||
, alpha=.75
|
||||
, colour='grey75'
|
||||
) + theme(
|
||||
axis.text.x = element_text(
|
||||
size = my_ats
|
||||
# , angle= 30
|
||||
)
|
||||
, axis.text.y = element_text(size = my_ats
|
||||
#, angle = 30
|
||||
, hjust = 1
|
||||
, vjust = 0)
|
||||
, axis.title.x = element_text(
|
||||
size = my_als
|
||||
, colour = 'black'
|
||||
)
|
||||
, axis.title.y = element_text(
|
||||
size = my_als
|
||||
, colour = 'black'
|
||||
)
|
||||
, legend.position = "top"
|
||||
, legend.text = element_text(size = my_als)
|
||||
|
||||
#) + geom_text(
|
||||
) + geom_label(
|
||||
aes(label = value)
|
||||
, size = 5
|
||||
, hjust = 0.5
|
||||
, vjust = 0.5
|
||||
, colour = 'black'
|
||||
, show.legend = FALSE
|
||||
#, check_overlap = TRUE
|
||||
, position = position_stack(reverse = T)
|
||||
#, position = ('
|
||||
|
||||
) + labs(
|
||||
title = ''
|
||||
, x = ''
|
||||
, y = "Number"
|
||||
, fill = 'Variable'
|
||||
, colour = 'black'
|
||||
) + scale_fill_manual(
|
||||
values = c('grey50', 'gray75')
|
||||
, name=''
|
||||
, labels=c('Mutations', 'Total Samples')
|
||||
) + scale_x_discrete(
|
||||
breaks = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
, labels = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
)
|
||||
print(printFile)
|
||||
dev.off()
|
|
@ -1,233 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for Lig #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df_lig.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
#my_df = merged_df2_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
table(my_df$mutation_info)
|
||||
|
||||
#############################
|
||||
# Extra sanity check:
|
||||
# for mcsm_lig ONLY
|
||||
# Dis_lig_Ang should be <10
|
||||
#############################
|
||||
|
||||
if (max(my_df$Dis_lig_Ang) < 10){
|
||||
print ("Sanity check passed: lig data is <10Ang")
|
||||
}else{
|
||||
print ("Error: data should be filtered to be within 10Ang")
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage Distribution
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
# subset only lineages1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
# uncomment as necessary
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages ) #2037 35
|
||||
|
||||
# refactor
|
||||
df_lin$lineage = factor(df_lin$lineage)
|
||||
|
||||
table(df_lin$lineage) #{RESULT: No of samples within lineage}
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#78 961 195 803
|
||||
|
||||
# when merged_df2_comp is used
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#77 955 194 770
|
||||
|
||||
length(unique(df_lin$Mutationinformation))
|
||||
#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
|
||||
# sanity checks
|
||||
r1 = 2:5 # when merged_df2 used: because there is missing lineages
|
||||
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
|
||||
print ("sanity check passed: numbers match")
|
||||
} else{
|
||||
print("Error!: check your numbers")
|
||||
}
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df <- df_lin
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# basic: could improve this!
|
||||
library(plotly)
|
||||
library(ggridges)
|
||||
|
||||
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
|
||||
g <- ggplot(df, aes(x = ratioPredAff)) +
|
||||
geom_density(aes(fill = Lig_outcome)
|
||||
, alpha = 0.5) +
|
||||
facet_wrap( ~ lineage
|
||||
, scales = "free"
|
||||
, labeller = labeller(lineage = fooNames) ) +
|
||||
coord_cartesian(xlim = c(-1, 1)
|
||||
# , ylim = c(0, 6)
|
||||
# , clip = "off"
|
||||
) +
  ggtitle("Kernel Density estimates of Ligand affinity by lineage")
|
||||
|
||||
ggplotly(g)
|
||||
|
||||
# 2 : ggridges (good!)
|
||||
|
||||
my_ats = 15 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
|
||||
fooNames = c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
names(fooNames) = c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('lineage_dist_LIG.svg')
|
||||
|
||||
printFile = ggplot( df, aes(x = ratioPredAff
|
||||
, y = Lig_outcome) ) +
|
||||
|
||||
geom_density_ridges_gradient( aes(fill = ..x..)
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
facet_wrap( ~lineage
|
||||
, scales = "free"
|
||||
# , switch = 'x'
|
||||
, labeller = labeller(lineage = fooNames) ) +
|
||||
coord_cartesian( xlim = c(-1, 1)
|
||||
# , ylim = c(0, 6)
|
||||
# , clip = "off"
|
||||
) +
|
||||
|
||||
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
|
||||
, name = "Ligand Affinity" ) +
|
||||
theme( axis.text.x = element_text( size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
# , axis.text.y = element_text( size = my_ats
|
||||
# , angle = 0
|
||||
# , hjust = 1
|
||||
# , vjust = 0)
|
||||
, axis.text.y = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_blank()
|
||||
, axis.ticks.y = element_blank()
|
||||
, plot.title = element_blank()
|
||||
, strip.text = element_text(size = my_als)
|
||||
, legend.text = element_text(size = 10)
|
||||
, legend.title = element_text(size = my_als)
|
||||
# , legend.position = c(0.3, 0.8)
|
||||
# , legend.key.height = unit(1, 'mm')
|
||||
)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
||||
#=!=!=!=!=!=!
|
||||
# COMMENT: When you look at all mutations, the lineage differences disappear...
|
||||
# The pattern we are interested in is possibly only for dr_mutations
|
||||
#=!=!=!=!=!=!
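
# NOTE (added): a quick way to probe the comment above; only a sketch, and the
# "dr_mutations_pyrazinamide" label is an assumption about how mutation_info is
# coded (check table(df$mutation_info) first).
#df_dr = df[df$mutation_info == "dr_mutations_pyrazinamide",]
#table(df_dr$lineage)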
|
||||
|
||||
#===================================================
|
||||
|
||||
# COMPARING DISTRIBUTIONS
|
||||
head(df$lineage)
|
||||
df$lineage = as.character(df$lineage)
|
||||
|
||||
lin1 = df[df$lineage == "lineage1",]$ratioPredAff
|
||||
lin2 = df[df$lineage == "lineage2",]$ratioPredAff
|
||||
lin3 = df[df$lineage == "lineage3",]$ratioPredAff
|
||||
lin4 = df[df$lineage == "lineage4",]$ratioPredAff
|
||||
|
||||
# ks test
|
||||
ks.test(lin1,lin2)
|
||||
ks.test(lin1,lin3)
|
||||
ks.test(lin1,lin4)
|
||||
|
||||
ks.test(lin2,lin3)
|
||||
ks.test(lin2,lin4)
|
||||
|
||||
ks.test(lin3,lin4)
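
# NOTE (added): the six pairwise tests above can also be run in one loop to
# avoid copy-paste slips; only a sketch, with no multiple-testing adjustment.
#lin_list = list(lin1 = lin1, lin2 = lin2, lin3 = lin3, lin4 = lin4)
#pair_idx = combn(names(lin_list), 2)
#apply(pair_idx, 2, function(p) ks.test(lin_list[[p[1]]], lin_list[[p[2]]])$p.value)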
|
||||
|
||||
|
||||
|
|
@ -1,212 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting") # thinkpad
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("../Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
#require(data.table)
|
||||
|
||||
########################################################################
|
||||
# Read file: call script for combining df for PS #
|
||||
########################################################################
|
||||
|
||||
source("../combining_two_df.R")
|
||||
|
||||
#---------------------- PAY ATTENTION
|
||||
# the above changes the working dir
|
||||
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
|
||||
#---------------------- PAY ATTENTION
|
||||
|
||||
#==========================
|
||||
# This will return:
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp
|
||||
# merged_df3_comp
|
||||
#===========================
|
||||
|
||||
###########################
|
||||
# Data for plots
|
||||
# you need merged_df2 or merged_df2_comp
|
||||
# since this is one-many relationship
|
||||
# i.e the same SNP can belong to multiple lineages
|
||||
###########################
|
||||
|
||||
# uncomment as necessary
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df2
|
||||
#my_df = merged_df2_comp
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
# delete variables not required
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# quick checks
|
||||
colnames(my_df)
|
||||
str(my_df)
|
||||
|
||||
# Ensure correct data type in columns to plot: need to be factor
|
||||
is.factor(my_df$lineage)
|
||||
my_df$lineage = as.factor(my_df$lineage)
|
||||
is.factor(my_df$lineage)
|
||||
|
||||
table(my_df$mutation_info)
|
||||
|
||||
########################################################################
|
||||
# end of data extraction and cleaning for plots #
|
||||
########################################################################
|
||||
|
||||
#==========================
|
||||
# Plot: Lineage Distribution
|
||||
# x = mcsm_values, y = dist
|
||||
# fill = stability
|
||||
#============================
|
||||
|
||||
#===================
|
||||
# Data for plots
|
||||
#===================
|
||||
|
||||
# subset only lineages1-4
|
||||
sel_lineages = c("lineage1"
|
||||
, "lineage2"
|
||||
, "lineage3"
|
||||
, "lineage4")
|
||||
|
||||
# uncomment as necessary
|
||||
df_lin = subset(my_df, subset = lineage %in% sel_lineages )
|
||||
|
||||
# refactor
|
||||
df_lin$lineage = factor(df_lin$lineage)
|
||||
|
||||
table(df_lin$lineage) #{RESULT: No of samples within lineage}
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#104 1293 264 1311
|
||||
|
||||
# when merged_df2_comp is used
|
||||
#lineage1 lineage2 lineage3 lineage4
|
||||
#99 1275 263 1255
|
||||
|
||||
length(unique(df_lin$Mutationinformation))
|
||||
#{Result: No. of unique mutations the 4 lineages contribute to}
|
||||
|
||||
# sanity checks
|
||||
r1 = 2:5 # when merged_df2 is used: because there are missing lineages
|
||||
if(sum(table(my_df$lineage)[r1]) == nrow(df_lin)) {
|
||||
print ("sanity check passed: numbers match")
|
||||
} else{
|
||||
print("Error!: check your numbers")
|
||||
}
|
||||
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
# REASSIGNMENT
|
||||
df <- df_lin
|
||||
#<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
rm(df_lin)
|
||||
|
||||
#******************
|
||||
# generate distribution plot of lineages
|
||||
#******************
|
||||
# basic: could improve this!
|
||||
library(plotly)
|
||||
library(ggridges)
|
||||
|
||||
g <- ggplot(df, aes(x = ratioDUET)) +
|
||||
geom_density(aes(fill = DUET_outcome)
|
||||
, alpha = 0.5) + facet_wrap(~ lineage,
|
||||
scales = "free") +
|
||||
ggtitle("Kernel Density estimates of Protein stability by lineage")
|
||||
|
||||
ggplotly(g)
|
||||
|
||||
# 2 : ggridges (good!)
|
||||
|
||||
my_ats = 15 # axis text size
|
||||
my_als = 20 # axis label size
|
||||
|
||||
fooNames=c('Lineage 1', 'Lineage 2', 'Lineage 3', 'Lineage 4')
|
||||
names(fooNames)=c('lineage1', 'lineage2', 'lineage3', 'lineage4')
|
||||
|
||||
# set output dir for plots
|
||||
getwd()
|
||||
setwd("~/git/Data/pyrazinamide/output/plots")
|
||||
getwd()
|
||||
|
||||
svg('lineage_dist_PS.svg')
|
||||
|
||||
printFile = ggplot( df, aes(x = ratioDUET
|
||||
, y = DUET_outcome) )+
|
||||
|
||||
#printFile=geom_density_ridges_gradient(
|
||||
geom_density_ridges_gradient( aes(fill = ..x..)
|
||||
, scale = 3
|
||||
, size = 0.3 ) +
|
||||
facet_wrap( ~lineage
|
||||
, scales = "free"
|
||||
# , switch = 'x'
|
||||
, labeller = labeller(lineage = fooNames) ) +
|
||||
coord_cartesian( xlim = c(-1, 1)
|
||||
# , ylim = c(0, 6)
|
||||
# , clip = "off"
|
||||
) +
|
||||
scale_fill_gradientn( colours = c("#f8766d", "white", "#00bfc4")
|
||||
, name = "DUET" ) +
|
||||
theme( axis.text.x = element_text( size = my_ats
|
||||
, angle = 90
|
||||
, hjust = 1
|
||||
, vjust = 0.4)
|
||||
# , axis.text.y = element_text( size = my_ats
|
||||
# , angle = 0
|
||||
# , hjust = 1
|
||||
# , vjust = 0)
|
||||
, axis.text.y = element_blank()
|
||||
, axis.title.x = element_blank()
|
||||
, axis.title.y = element_blank()
|
||||
, axis.ticks.y = element_blank()
|
||||
, plot.title = element_blank()
|
||||
, strip.text = element_text(size=my_als)
|
||||
, legend.text = element_text(size=10)
|
||||
, legend.title = element_text(size=my_als)
|
||||
# , legend.position = c(0.3, 0.8)
|
||||
# , legend.key.height = unit(1, 'mm')
|
||||
)
|
||||
|
||||
print(printFile)
|
||||
dev.off()
|
||||
|
||||
#=!=!=!=!=!=!
|
||||
# COMMENT: When you look at all mutations, the lineage differences disappear...
|
||||
# The pattern we are interested in is possibly only for dr_mutations
|
||||
#=!=!=!=!=!=!
|
||||
#===================================================
|
||||
|
||||
# COMPARING DISTRIBUTIONS
|
||||
head(df$lineage)
|
||||
df$lineage = as.character(df$lineage)
|
||||
|
||||
lin1 = df[df$lineage == "lineage1",]$ratioDUET
|
||||
lin2 = df[df$lineage == "lineage2",]$ratioDUET
|
||||
lin3 = df[df$lineage == "lineage3",]$ratioDUET
|
||||
lin4 = df[df$lineage == "lineage4",]$ratioDUET
|
||||
|
||||
# ks test
|
||||
ks.test(lin1,lin2)
|
||||
ks.test(lin1,lin3)
|
||||
ks.test(lin1,lin4)
|
||||
|
||||
ks.test(lin2,lin3)
|
||||
ks.test(lin2,lin4)
|
||||
|
||||
ks.test(lin3,lin4)
|
||||
|
||||
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
#########################
|
||||
#3: Read complex pdb file
|
||||
##########################
|
||||
source("Header_TT.R")
|
||||
# This script only reads the pdb file of your complex
|
||||
|
||||
# read in pdb file complex1
|
||||
inDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
inFile = paste0(inDir, "complex1_no_water.pdb")
|
||||
complex1 = inFile
|
||||
|
||||
#inFile2 = paste0(inDir, "complex2_no_water.pdb")
|
||||
#complex2 = inFile2
|
||||
|
||||
# list of 8
|
||||
my_pdb = read.pdb(complex1
|
||||
, maxlines = -1
|
||||
, multi = FALSE
|
||||
, rm.insert = FALSE
|
||||
, rm.alt = TRUE
|
||||
, ATOM.only = FALSE
|
||||
, hex = FALSE
|
||||
, verbose = TRUE)
|
||||
|
||||
rm(inDir, inFile, complex1)
|
||||
#====== end of script
|
||||
|
|
@ -1,386 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
|
||||
########################################################################
|
||||
# Installing and loading required packages #
|
||||
########################################################################
|
||||
|
||||
source("Header_TT.R")
|
||||
|
||||
#########################################################
|
||||
# TASK: replace B-factors in the pdb file with normalised values
|
||||
# use the complex file with no water as mCSM lig was
|
||||
# performed on this file. You can check this in the script: read_pdb.R
|
||||
#########################################################
|
||||
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
|
||||
#=========================================================
|
||||
# Processing P1: Replacing B factor with mean ratioDUET scores
|
||||
#=========================================================
|
||||
|
||||
#########################
|
||||
# Read complex pdb file
|
||||
# from the R script
|
||||
##########################
|
||||
|
||||
source("read_pdb.R") # list of 8
|
||||
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
|
||||
#*******************************************
|
||||
# plot histograms for inspection
|
||||
# 1: original B-factors
|
||||
# 2: original DUET Scores
|
||||
# 3: replaced B-factors with DUET Scores
|
||||
#*********************************************
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
|
||||
#1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
# 2: DUET scores
|
||||
hist(my_df$average_DUETR
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
|
||||
plot(density(my_df$average_DUETR)
|
||||
, xlab = ""
|
||||
, main = "Norm_DUET")
|
||||
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
|
||||
#=========
|
||||
# step 0_P1: DONT RUN once you have double checked the matched output
|
||||
#=========
|
||||
# sanity check: match and assign to a separate column to double check
|
||||
# colnames(my_df)
|
||||
# d$ratioDUET = my_df$average_DUETR[match(d$resno, my_df$Position)]
|
||||
|
||||
#=========
|
||||
# step 1_P1
|
||||
#=========
|
||||
# Be brave and replace in place now (don't run sanity check)
|
||||
# this sets the B-factor values at all non-matched positions to NA
|
||||
d$b = my_df$average_DUETR[match(d$resno, my_df$Position)]
|
||||
|
||||
#=========
|
||||
# step 2_P1
|
||||
#=========
|
||||
# count NA in Bfactor
|
||||
b_na = sum(is.na(d$b)) ; b_na
|
||||
|
||||
# count number of 0's in B-factor
|
||||
sum(d$b == 0)
|
||||
#table(d$b)
|
||||
|
||||
# replace all NA in b factor with 0
|
||||
d$b[is.na(d$b)] = 0
|
||||
|
||||
# sanity check: should be 0
|
||||
sum(is.na(d$b))
|
||||
|
||||
# sanity check: should be True
|
||||
if (sum(d$b == 0) == b_na){
|
||||
print ("Sanity check passed: NA's replaced with 0's successfully")
|
||||
} else {
|
||||
print("Error: NA replacement NOT successful, Debug code!")
|
||||
}
|
||||
|
||||
max(d$b); min(d$b)
|
||||
|
||||
# sanity checks: should be True
|
||||
if(max(d$b) == max(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
|
||||
if (min(d$b) == min(my_df$average_DUETR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
|
||||
#=========
|
||||
# step 3_P1
|
||||
#=========
|
||||
# sanity check: dim should be same before reassignment
|
||||
# should be TRUE
|
||||
dim(d) == dim(d2)
|
||||
|
||||
#=========
|
||||
# step 4_P1
|
||||
#=========
|
||||
# assign it back to the pdb file
|
||||
my_pdb[[1]] = d
|
||||
|
||||
max(d$b); min(d$b)
|
||||
|
||||
#=========
|
||||
# step 5_P1
|
||||
#=========
|
||||
# output dir
|
||||
getwd()
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
|
||||
outFile = paste0(outDir, "complex1_BwithNormDUET.pdb"); outFile
|
||||
write.pdb(my_pdb, outFile)
|
||||
|
||||
#********************************
|
||||
# Add the 3rd histogram and density plots for comparisons
|
||||
#********************************
|
||||
# Plots continued...
|
||||
# 3: hist and density of replaced B-factors with DUET Scores
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "repalced-B")
|
||||
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "replaced-B")
|
||||
|
||||
# graph titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
mtext(text = "DUET_stability"
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
#********************************
|
||||
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# NOTE: This replaced B-factor distribution has the same
|
||||
# x-axis as the PredAff normalised values, but the distribution
|
||||
# is affected since 0 is overinflated. This is because all the positions
|
||||
# where there are no SNPs have been assigned 0.
|
||||
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#######################################################################
|
||||
#====================== end of section 1 ==============================
|
||||
#######################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#=========================================================
|
||||
# Processing P2: Replacing B values with PredAff Scores
|
||||
#=========================================================
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
|
||||
###########################
|
||||
# 2: Read file: average stability values
|
||||
# or mcsm_normalised file, output of step 4 mcsm pipeline
|
||||
###########################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mean_PS_Lig_Bfactor.csv"); inFile
|
||||
|
||||
my_df <- read.csv(inFile
|
||||
# , row.names = 1
|
||||
# , stringsAsFactors = F
|
||||
, header = T)
|
||||
str(my_df)
|
||||
#rm(inDir, inFile)
|
||||
|
||||
#########################
|
||||
# 3: Read complex pdb file
|
||||
# from the R script
|
||||
##########################
|
||||
|
||||
source("read_pdb.R") # list of 8
|
||||
|
||||
# extract atom list into a variable
|
||||
# since in the list this corresponds to data frame, variable will be a df
|
||||
d = my_pdb[[1]]
|
||||
|
||||
# make a copy: required for downstream sanity checks
|
||||
d2 = d
|
||||
|
||||
# sanity checks: B factor
|
||||
max(d$b); min(d$b)
|
||||
|
||||
#*******************************************
|
||||
# plot histograms for inspection
|
||||
# 1: original B-factors
|
||||
# 2: original Pred Aff Scores
|
||||
# 3: replaced B-factors with PredAff Scores
|
||||
#********************************************
|
||||
# Set the margin on all sides
|
||||
par(oma = c(3,2,3,0)
|
||||
, mar = c(1,3,5,2)
|
||||
, mfrow = c(3,2))
|
||||
#par(mfrow = c(3,2))
|
||||
|
||||
# 1: Original B-factor
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "B-factor")
|
||||
|
||||
# 2: Pred Aff scores
|
||||
hist(my_df$average_PredAffR
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
|
||||
plot(density(my_df$average_PredAffR)
|
||||
, xlab = ""
|
||||
, main = "Norm_lig_average")
|
||||
|
||||
# 3: After the following replacement
|
||||
#********************************
|
||||
|
||||
#=================================================
|
||||
# Processing P2: Replacing B values with ratioPredAff scores
|
||||
#=================================================
|
||||
# use match to perform this replacement linking with "position no"
|
||||
# in the pdb file, this corresponds to column "resno"
|
||||
# in my_df, this corresponds to column "Position"
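# toy illustration (hypothetical numbers) of the match() lookup used below:
# match(c(3, 5, 9), c(5, 9)) returns c(NA, 1, 2), so
# c(0.2, -0.4)[match(c(3, 5, 9), c(5, 9))] gives NA, 0.2, -0.4
# i.e. any resno without an entry in my_df$Position ends up as NA in d$b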
|
||||
|
||||
#=========
|
||||
# step 0_P2: DONT RUN once you have double checked the matched output
|
||||
#=========
|
||||
# sanity check: match and assign to a separate column to double check
|
||||
# colnames(my_df)
|
||||
# d$ratioPredAff = my_df$average_PredAffR[match(d$resno, my_df$Position)] #1384, 17
|
||||
|
||||
#=========
|
||||
# step 1_P2: BE BRAVE and replace in place now (don't run step 0)
|
||||
#=========
|
||||
# this sets the B-factor values at all non-matched positions to NA
|
||||
d$b = my_df$average_PredAffR[match(d$resno, my_df$Position)]
|
||||
|
||||
#=========
|
||||
# step 2_P2
|
||||
#=========
|
||||
# count NA in Bfactor
|
||||
b_na = sum(is.na(d$b)) ; b_na
|
||||
|
||||
# count number of 0's in B-factor
|
||||
sum(d$b == 0)
|
||||
#table(d$b)
|
||||
|
||||
# replace all NA in b factor with 0
|
||||
d$b[is.na(d$b)] = 0
|
||||
|
||||
# sanity check: should be 0
|
||||
sum(is.na(d$b))
|
||||
|
||||
if (sum(d$b == 0) == b_na){
|
||||
print ("Sanity check passed: NA's replaced with 0's successfully")
|
||||
} else {
|
||||
print("Error: NA replacement NOT successful, Debug code!")
|
||||
}
|
||||
|
||||
max(d$b); min(d$b)
|
||||
|
||||
# sanity checks: should be True
|
||||
if (max(d$b) == max(my_df$average_PredAffR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
|
||||
if (min(d$b) == min(my_df$average_PredAffR)){
|
||||
print("Sanity check passed: B-factors replaced correctly")
|
||||
} else {
|
||||
print ("Error: Debug code please")
|
||||
}
|
||||
|
||||
#=========
|
||||
# step 3_P2
|
||||
#=========
|
||||
# sanity check: dim should be same before reassignment
|
||||
# should be TRUE
|
||||
dim(d) == dim(d2)
|
||||
|
||||
#=========
|
||||
# step 4_P2
|
||||
#=========
|
||||
# assign it back to the pdb file
|
||||
my_pdb[[1]] = d
|
||||
|
||||
max(d$b); min(d$b)
|
||||
|
||||
#=========
|
||||
# step 5_P2
|
||||
#=========
|
||||
|
||||
# output dir
|
||||
outDir = "~/git/Data/pyrazinamide/input/structure/"
|
||||
outFile = paste0(outDir, "complex1_BwithNormLIG.pdb"); outFile
|
||||
write.pdb(my_pdb, outFile)
|
||||
|
||||
#********************************
|
||||
# Add the 3rd histogram and density plots for comparisons
|
||||
#********************************
|
||||
# Plots continued...
|
||||
# 3: hist and density of replaced B-factors with PredAff Scores
|
||||
hist(d$b
|
||||
, xlab = ""
|
||||
, main = "repalced-B")
|
||||
|
||||
plot(density(d$b)
|
||||
, xlab = ""
|
||||
, main = "replaced-B")
|
||||
|
||||
# graph titles
|
||||
mtext(text = "Frequency"
|
||||
, side = 2
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
mtext(text = "Lig_stability"
|
||||
, side = 3
|
||||
, line = 0
|
||||
, outer = TRUE)
|
||||
|
||||
#********************************
|
||||
|
||||
###########
|
||||
# end of output files with Bfactors
|
||||
##########
|
|
@ -1,257 +0,0 @@
|
|||
getwd()
|
||||
setwd("~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts")
|
||||
getwd()
|
||||
|
||||
#########################################################
|
||||
# 1: Installing and loading required packages #
|
||||
#########################################################
|
||||
|
||||
source("Header_TT.R")
|
||||
#source("barplot_colour_function.R")
|
||||
|
||||
##########################################################
|
||||
# Checking: Entire data frame and for PS #
|
||||
##########################################################
|
||||
|
||||
###########################
|
||||
#2) Read file: combined one from the script
|
||||
###########################
|
||||
source("combining_two_df.R")
|
||||
|
||||
# df with NA:
|
||||
# merged_df2
|
||||
# merged_df3:
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp:
|
||||
# merged_df3_comp:
|
||||
|
||||
######################
|
||||
# You need to check it
|
||||
# with the merged_df3
|
||||
########################
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
#clear variables
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# should be true
|
||||
identical(my_df$Position, my_df$position)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
||||
|
||||
mcsm_data <- read.csv(inFile
|
||||
, row.names = 1
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
str(mcsm_data)
|
||||
my_colnames = colnames(mcsm_data)
|
||||
|
||||
#====================================
|
||||
# subset my_df to include only the columns in mcsm data
|
||||
my_df2 = my_df[my_colnames]
|
||||
#====================================
|
||||
# compare the two
|
||||
head(mcsm_data$Mutationinformation)
|
||||
head(mcsm_data$Position)
|
||||
|
||||
head(my_df2$Mutationinformation)
|
||||
head(my_df2$Position)
|
||||
|
||||
# sort mcsm data by Mutationinformation
|
||||
mcsm_data_s = mcsm_data[order(mcsm_data$Mutationinformation),]
|
||||
head(mcsm_data_s$Mutationinformation)
|
||||
head(mcsm_data_s$Position)
|
||||
|
||||
# now compare: should be True, but is false....
|
||||
# possibly due to rownames!?!
|
||||
identical(mcsm_data_s, my_df2)
|
||||
|
||||
# from library dplyr
|
||||
setdiff(mcsm_data_s, my_df2)
|
||||
|
||||
#from lib compare
|
||||
compare(mcsm_data_s, my_df2) # seems rownames are the problem
|
||||
|
||||
# FIXME: automate this
|
||||
# write files: checked using meld and files are indeed identical
|
||||
#write.csv(mcsm_data_s, "mcsm_data_s.csv", row.names = F)
|
||||
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
||||
|
||||
|
||||
#====================================================== end of section 1
|
||||
|
||||
|
||||
|
||||
##########################################################
|
||||
# Checking: LIG(Filtered dataframe) #
|
||||
##########################################################
|
||||
|
||||
# clear workspace
|
||||
rm(list = ls())
|
||||
|
||||
###########################
|
||||
#3) Read file: combined_lig from the script
|
||||
###########################
|
||||
source("combining_two_df_lig.R")
|
||||
|
||||
# df with NA:
|
||||
# merged_df2 :
|
||||
# merged_df3:
|
||||
|
||||
# df without NA:
|
||||
# merged_df2_comp:
|
||||
# merged_df3_comp:
|
||||
|
||||
######################
|
||||
# You need to check it
|
||||
# with the merged_df3
|
||||
########################
|
||||
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
# REASSIGNMENT
|
||||
my_df = merged_df3
|
||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
#clear variables
|
||||
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
|
||||
|
||||
# should be true
|
||||
identical(my_df$Position, my_df$position)
|
||||
|
||||
#################################
|
||||
# Read file: normalised file
|
||||
# output of step 4 mcsm_pipeline
|
||||
#################################
|
||||
|
||||
inDir = "~/git/Data/pyrazinamide/input/processed/"
|
||||
inFile = paste0(inDir, "mcsm_complex1_normalised.csv"); inFile
|
||||
|
||||
mcsm_data <- read.csv(inFile
|
||||
, row.names = 1
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
str(mcsm_data)
|
||||
|
||||
###########################
|
||||
# 4a: Filter/subset data: ONLY for LIGand analysis
|
||||
# Lig plots < 10Ang
|
||||
# Filter the lig plots for Dis_to_lig < 10Ang
|
||||
###########################
|
||||
# sanity checks
|
||||
upos = unique(mcsm_data$Position)
|
||||
|
||||
# check range of distances
|
||||
max(mcsm_data$Dis_lig_Ang)
|
||||
min(mcsm_data$Dis_lig_Ang)
|
||||
|
||||
# Lig filtered: subset data to have only values less than 10 Ang
|
||||
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
|
||||
|
||||
rm(mcsm_data) #to avoid confusion
|
||||
|
||||
table(mcsm_data2$Dis_lig_Ang<10)
|
||||
table(mcsm_data2$Dis_lig_Ang>10)
|
||||
|
||||
max(mcsm_data2$Dis_lig_Ang)
|
||||
min(mcsm_data2$Dis_lig_Ang)
|
||||
|
||||
upos_f = unique(mcsm_data2$Position); upos_f
|
||||
|
||||
# colnames of df that you will need to subset the bigger df from
|
||||
my_colnames = colnames(mcsm_data2)
|
||||
#====================================
|
||||
# subset bigger df i.e my_df to include only the columns in mcsm data2
|
||||
my_df2 = my_df[my_colnames]
|
||||
|
||||
rm(my_df) #to avoid confusion
|
||||
#====================================
|
||||
# compare the two
|
||||
head(mcsm_data2$Mutationinformation)
|
||||
head(mcsm_data2$Position)
|
||||
|
||||
head(my_df2$Mutationinformation)
|
||||
head(my_df2$Position)
|
||||
|
||||
# sort mcsm data by Mutationinformation
|
||||
mcsm_data2_s = mcsm_data2[order(mcsm_data2$Mutationinformation),]
|
||||
head(mcsm_data2_s$Mutationinformation)
|
||||
head(mcsm_data2_s$Position)
|
||||
|
||||
# now compare: should be True, but is false....
|
||||
# possibly due to rownames!?!
|
||||
identical(mcsm_data2_s, my_df2)
|
||||
|
||||
# from library dplyr
|
||||
setdiff(mcsm_data2_s, my_df2)
|
||||
|
||||
# from library compare
|
||||
compare(mcsm_data2_s, my_df2) # seems rownames are the problem
|
||||
|
||||
#FIXME: automate this
|
||||
# write files: checked using meld and files are indeed identical
|
||||
#write.csv(mcsm_data2_s, "mcsm_data2_s.csv", row.names = F)
|
||||
#write.csv(my_df2, "my_df2.csv", row.names = F)
|
||||
|
||||
|
||||
##########################################################
|
||||
# extract and write output file for SNP posn: all #
|
||||
##########################################################
|
||||
|
||||
head(merged_df3$Position)
|
||||
|
||||
foo = merged_df3[order(merged_df3$Position),]
|
||||
head(foo$Position)
|
||||
|
||||
snp_pos_unique = unique(foo$Position); snp_pos_unique
|
||||
|
||||
# sanity check:
|
||||
table(snp_pos_unique == combined_df$Position)
|
||||
|
||||
#=====================
|
||||
# write_output files
|
||||
#=====================
|
||||
outDir = "~/Data/pyrazinamide/input/processed/"
|
||||
|
||||
|
||||
outFile1 = paste0(outDir, "snp_pos_unique.txt"); outFile1
|
||||
print(paste0("Output file name and path will be:","", outFile1))
|
||||
|
||||
write.table(snp_pos_unique
|
||||
, outFile1
|
||||
, row.names = F
|
||||
, col.names = F)
|
||||
|
||||
##############################################################
|
||||
# extract and write output file for SNP posn: complete only #
|
||||
##############################################################
|
||||
head(merged_df3_comp$Position)
|
||||
|
||||
foo = merged_df3_comp[order(merged_df3_comp$Position),]
|
||||
head(foo$Position)
|
||||
|
||||
snp_pos_unique = unique(foo$Position); snp_pos_unique
|
||||
|
||||
# outDir = "~/Data/pyrazinamide/input/processed/" # already set
|
||||
|
||||
outFile2 = paste0(outDir, "snp_pos_unique_comp.txt")
|
||||
print(paste0("Output file name and path will be:", outFile2))
|
||||
|
||||
write.table(snp_pos_unique
|
||||
, outFile2
|
||||
, row.names = F
|
||||
, col.names = F)
|
||||
#============================== end of script
|
||||
|
||||
|
56
mcsm_na/examples.py
Executable file
|
@ -0,0 +1,56 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Feb 12 12:15:26 2021
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
import os
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
|
||||
from submit_mcsm_na import *
|
||||
from get_results_mcsm_na import *
|
||||
#%%#####################################################################
|
||||
#EXAMPLE RUN for different stages
|
||||
#=====================
|
||||
# STAGE: submit_mcsm_na.py
|
||||
#=====================
|
||||
my_host = 'http://biosig.unimelb.edu.au'
|
||||
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
|
||||
print(my_prediction_url)
|
||||
|
||||
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
|
||||
my_nuc_type = 'RNA'
|
||||
my_pdb_file = homedir + '/git/Data/streptomycin/input/gid_complex.pdb'
|
||||
my_mutation_list = homedir + '/git/LSHTM_analysis/mcsm_na/test_snps_b1.csv'
|
||||
my_suffix = 'TEST'
|
||||
|
||||
#----------------------------------------------
|
||||
# example 1: 2 snps in a file
|
||||
#----------------------------------------------
|
||||
submit_mcsm_na(host_url = my_host
|
||||
, pdb_file = my_pdb_file
|
||||
, mutation_list = my_mutation_list
|
||||
, nuc_type = my_nuc_type
|
||||
, prediction_url = my_prediction_url
|
||||
, output_dir = my_outdir
|
||||
, outfile_suffix = my_suffix)
|
||||
#%%###################################################################
|
||||
|
||||
#=====================
|
||||
# STAGE: get_results.py
|
||||
#=====================
|
||||
my_host = 'http://biosig.unimelb.edu.au'
|
||||
my_outdir = homedir + '/git/LSHTM_analysis/mcsm_na'
|
||||
|
||||
#----------------------------------------------
|
||||
# example 1: single url in a single file
|
||||
#----------------------------------------------
|
||||
my_url_file_single = homedir + '/git/LSHTM_analysis/mcsm_na/mcsm_na_temp/mcsm_na_result_url_gid_test_b1.txt'
|
||||
print(my_url_file_single)
|
||||
my_suffix = 'single'
|
||||
|
||||
get_results(url_file = my_url_file_single
|
||||
, host_url = my_host
|
||||
, output_dir = my_outdir
|
||||
, outfile_suffix = my_suffix)
|
135
mcsm_na/format_results_mcsm_na.py
Executable file
|
@ -0,0 +1,135 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Aug 19 14:33:51 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
#%%#####################################################################
|
||||
|
||||
def format_mcsm_na_output(mcsm_na_output_tsv):
|
||||
"""
|
||||
@param mcsm_na_output_tsv: file containing mcsm_na results for all muts
|
||||
which is the result of combining all mcsm_na batch results, and using
|
||||
bash scripts to combine all the batch results into one file.
|
||||
This is post run_get_results_mcsm_na.py
|
||||
Formatting df to a pandas df and output as csv.
|
||||
@type string
|
||||
|
||||
@return formatted mcsm_na output (returned as a pandas df; the calling script writes it to csv)
|
||||
@type pandas df
|
||||
|
||||
"""
|
||||
#############
|
||||
# Read file
|
||||
#############
|
||||
mcsm_na_data_raw = pd.read_csv(mcsm_na_output_tsv, sep = '\t')
|
||||
|
||||
# strip white space from both ends in all columns
|
||||
mcsm_na_data = mcsm_na_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||
|
||||
dforig_shape = mcsm_na_data.shape
|
||||
print('dimensions of input file:', dforig_shape)
|
||||
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase and consistent colnames
|
||||
mcsm_na_data.columns
|
||||
print('Assigning meaningful colnames'
|
||||
, '\n=======================================================')
|
||||
my_colnames_dict = {'PDB_FILE': 'pdb_file' # relevant info from this col will be extracted and the column discarded
|
||||
, 'CHAIN': 'chain' # single letter (caps)
|
||||
, 'WILD_RES': 'wild_type' # one letter amino acid code
|
||||
, 'RES_POS': 'position' # number
|
||||
, 'MUT_RES': 'mutant_type' # one letter amino acid code
|
||||
, 'RSA': 'rsa' # relative solvent accessibility (numeric)
|
||||
, 'PRED_DDG': 'mcsm_na_affinity'} # predicted affinity change (numeric)
|
||||
|
||||
mcsm_na_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
mcsm_na_data.columns
|
||||
|
||||
#%%============================================================================
|
||||
#############
|
||||
# create mutationinformation column
|
||||
#############
|
||||
mcsm_na_data['mutationinformation'] = mcsm_na_data['wild_type'] + mcsm_na_data.position.map(str) + mcsm_na_data['mutant_type']
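# e.g. wild_type 'P', position 3, mutant_type 'S' -> mutationinformation 'P3S'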
|
||||
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# Create col: mcsm_na_outcome
|
||||
#############
|
||||
# classification based on mcsm_na_affinity values
|
||||
print('Assigning col: mcsm_na_outcome based on mcsm_na_affinity')
|
||||
print('Sanity check:')
|
||||
# count positive values in the mcsm_na_affinity column
|
||||
c = mcsm_na_data[mcsm_na_data['mcsm_na_affinity']>=0].count()
|
||||
mcsm_na_pos = c.get(key = 'mcsm_na_affinity')
|
||||
|
||||
# Assign category based on sign (+ve : I_affinity, -ve: R_affinity)
|
||||
mcsm_na_data['mcsm_na_outcome'] = np.where(mcsm_na_data['mcsm_na_affinity']>=0, 'Increased_affinity', 'Reduced_affinity')
|
||||
print('mcsm_na Outcome:', mcsm_na_data['mcsm_na_outcome'].value_counts())
|
||||
|
||||
#if mcsm_na_pos == mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']:
|
||||
# print('PASS: mcsm_na_outcome assigned correctly')
|
||||
#else:
|
||||
# print('FAIL: mcsm_na_outcome assigned incorrectly'
|
||||
# , '\nExpected no. of Increased_affinity mutations:', mcsm_na_pos
|
||||
# , '\nGot no. of Increased affinity mutations', mcsm_na_data['mcsm_na_outcome'].value_counts()['Increased_affinity']
|
||||
# , '\n======================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# scale mcsm_na values
|
||||
#############
|
||||
# Rescale values in mcsm_na_affinity col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
mcsm_na_min = mcsm_na_data['mcsm_na_affinity'].min()
|
||||
mcsm_na_max = mcsm_na_data['mcsm_na_affinity'].max()
|
||||
|
||||
mcsm_na_scale = lambda x : x/abs(mcsm_na_min) if x < 0 else (x/mcsm_na_max if x >= 0 else 'failed')
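# worked example (hypothetical values): if mcsm_na_min = -2.0 and mcsm_na_max = 4.0,
# then mcsm_na_scale(-1.0) = -1.0/2.0 = -0.5 and mcsm_na_scale(2.0) = 2.0/4.0 = 0.5,
# so signs are preserved and all values land in [-1, 1]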
|
||||
|
||||
mcsm_na_data['mcsm_na_scaled'] = mcsm_na_data['mcsm_na_affinity'].apply(mcsm_na_scale)
|
||||
print('Raw mcsm_na scores:\n', mcsm_na_data['mcsm_na_affinity']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled mcsm_na scores:\n', mcsm_na_data['mcsm_na_scaled'])
|
||||
|
||||
c2 = mcsm_na_data[mcsm_na_data['mcsm_na_scaled']>=0].count()
|
||||
mcsm_na_pos2 = c2.get(key = 'mcsm_na_affinity')
|
||||
|
||||
if mcsm_na_pos == mcsm_na_pos2:
|
||||
print('\nPASS: Affinity values scaled correctly')
|
||||
else:
|
||||
print('\nFAIL: Affinity values scaled numbers MISmatch'
|
||||
, '\nExpected number:', mcsm_na_pos
|
||||
, '\nGot:', mcsm_na_pos2
|
||||
, '\n======================================================')
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# reorder columns
|
||||
#############
|
||||
mcsm_na_data.columns
|
||||
mcsm_na_dataf = mcsm_na_data[['mutationinformation'
|
||||
, 'mcsm_na_affinity'
|
||||
, 'mcsm_na_scaled'
|
||||
, 'mcsm_na_outcome'
|
||||
, 'rsa'
|
||||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
, 'chain'
|
||||
, 'pdb_file']]
|
||||
return(mcsm_na_dataf)
|
||||
#%%#####################################################################
|
||||
|
52
mcsm_na/get_results_mcsm_na.py
Executable file
|
@ -0,0 +1,52 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Aug 19 14:33:51 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
#%%#####################################################################
|
||||
|
||||
def get_results(url_file, host_url, output_dir, outfile_suffix):
|
||||
# initialise empty df
|
||||
#mcsm_na_results_out_df = pd.DataFrame()
|
||||
with open(url_file, 'r') as f:
|
||||
for count, line in enumerate(f):
|
||||
line = line.strip()
|
||||
print('URL no.', count+1, '\n', line)
|
||||
|
||||
#============================
|
||||
# Writing results file: csv
|
||||
#============================
|
||||
mcsm_na_results_dir = output_dir + '/mcsm_na_results'
|
||||
if not os.path.exists(mcsm_na_results_dir):
|
||||
print('\nCreating dir: mcsm_na_results within:', output_dir )
|
||||
os.makedirs(mcsm_na_results_dir)
|
||||
|
||||
# Download the .txt
|
||||
prediction_number = re.search(r'([0-9]+\.[0-9]+$)', line).group(0)
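# e.g. for a result url ending in .../results_prediction/1613147445.16
# the regex captures '1613147445.16'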
|
||||
print('CHECK prediction no:', prediction_number)
|
||||
txt_url = f"{host_url}/mcsm_na/static/results/" + prediction_number + '.txt'
|
||||
print('CHECK txt url:', txt_url)
|
||||
|
||||
out_filename = mcsm_na_results_dir + '/' + outfile_suffix + '_output_' + prediction_number + '.txt.gz'
|
||||
response_txt = requests.get(txt_url, stream = True)
|
||||
if response_txt.status_code == 200:
|
||||
print('\nDownloading .txt:', txt_url
|
||||
, '\n\nSaving file as:', out_filename)
|
||||
with open(out_filename, 'wb') as f:
|
||||
f.write(response_txt.raw.read())
|
||||
|
||||
#%%#####################################################################
|
||||
|
BIN
mcsm_na/mcsm_na_results/single_output_1613147445.16.txt
Normal file
Binary file not shown.
1
mcsm_na/mcsm_na_temp/mcsm_na_result_url_TEST.txt
Normal file
|
@ -0,0 +1 @@
|
|||
http://biosig.unimelb.edu.au/mcsm_na/results_prediction/1613147445.16
|
78
mcsm_na/run_format_results_mcsm_na.py
Executable file
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Feb 12 12:15:26 2021
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
|
||||
from format_results_mcsm_na import *
|
||||
########################################################################
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
|
||||
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
|
||||
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
|
||||
|
||||
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#%%============================================================================
|
||||
# variable assignment: input and output paths & filenames
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
#outdir_ppi2 = args.mkdir_name
|
||||
make_dirs = args.make_dirs
|
||||
|
||||
#=======
|
||||
# dirs
|
||||
#=======
|
||||
if not datadir:
|
||||
datadir = homedir + '/git/Data/'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + drug + '/input/'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
||||
#if not mkdir_name:
|
||||
# outdir_na = outdir + 'mcsm_na_results/'
|
||||
|
||||
outdir_na = outdir + 'mcsm_na_results/'
|
||||
|
||||
# Input file
|
||||
infile_mcsm_na = outdir_na + gene.lower() + '_output_combined_clean.tsv'
|
||||
|
||||
# Formatted output file
|
||||
outfile_mcsm_na_f = outdir_na + gene.lower() + '_complex_mcsm_na_norm.csv'
|
||||
|
||||
#===========================================
|
||||
# CALL: format_results_mcsm_na()
|
||||
# Data: gid+streptomycin
|
||||
# Data: rpob+rifampicin, date: 18/11/2021
|
||||
#===========================================
|
||||
print('Formatting results for:', infile_mcsm_na)
|
||||
mcsm_na_df_f = format_mcsm_na_output(mcsm_na_output_tsv = infile_mcsm_na)
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
mcsm_na_df_f.to_csv(outfile_mcsm_na_f, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile:', outfile_mcsm_na_f
|
||||
, '\nExpected no. of rows:', len(mcsm_na_df_f)
|
||||
, '\nExpected no. of cols:', len(mcsm_na_df_f.columns)
|
||||
, '\n=============================================================')
|
||||
|
||||
#%%#####################################################################
|
42
mcsm_na/run_get_results_mcsm_na.py
Executable file
|
@ -0,0 +1,42 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Feb 12 12:15:26 2021
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
|
||||
from get_results_mcsm_na import *
|
||||
########################################################################
|
||||
# variables
|
||||
my_host = 'http://biosig.unimelb.edu.au'
|
||||
|
||||
# TODO: add cmd line args
|
||||
#gene = 'gid'
|
||||
drug = 'streptomycin'
|
||||
datadir = homedir + '/git/Data'
|
||||
indir = datadir + '/' + drug + '/input'
|
||||
outdir = datadir + '/' + drug + '/output'
|
||||
|
||||
#==============================================================================
|
||||
# batch 26: 25.txt, RETRIEVED: 16 Feb:
|
||||
# batch 27: 26.txt, RETRIEVED: 6 Aug:
|
||||
my_url_file = outdir + '/mcsm_na_temp/mcsm_na_result_url_gid_b27.txt'
|
||||
my_suffix = 'gid_b27'
|
||||
|
||||
#==============================================================================
|
||||
|
||||
#==========================
|
||||
# CALL: get_results()
|
||||
# Data: gid+streptomycin
|
||||
#==========================
|
||||
print('Downloading results for:', my_url_file, '\nsuffix:', my_suffix)
|
||||
|
||||
get_results(url_file = my_url_file
|
||||
, host_url = my_host
|
||||
, output_dir = outdir
|
||||
, outfile_suffix = my_suffix)
|
||||
#%%#####################################################################
|
49
mcsm_na/run_submit_mcsm_na.py
Executable file
|
@ -0,0 +1,49 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Feb 12 12:15:26 2021
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os
|
||||
homedir = os.path.expanduser('~')
|
||||
os.chdir (homedir + '/git/LSHTM_analysis/mcsm_na')
|
||||
from submit_mcsm_na import *
|
||||
########################################################################
|
||||
# variables
|
||||
my_host = 'http://biosig.unimelb.edu.au'
|
||||
my_prediction_url = f"{my_host}/mcsm_na/run_prediction_list"
|
||||
print(my_prediction_url)
|
||||
|
||||
# TODO: add cmd line args
|
||||
#gene = 'gid'
|
||||
drug = ''
|
||||
datadir = homedir + '/git/Data/'
|
||||
indir = datadir + drug + 'input/'
|
||||
outdir = datadir + drug + 'output/'
|
||||
outdir_mcsm_na = outdir + 'mcsm_na_results/'
|
||||
|
||||
my_nuc_type = 'RNA'
|
||||
my_pdb_file = indir + gene.lower() + '_complex.pdb'
|
||||
|
||||
#=============================================================================
|
||||
# batch 26: 25.txt # RAN: 16 Feb:
|
||||
# batch 27: 26.txt # RAN: 6 Aug:
|
||||
# off by one
|
||||
my_mutation_list = outdir + '/snp_batches/20/snp_batch_26.txt'
|
||||
my_suffix = 'gid_b27'
|
||||
#==============================================================================
|
||||
|
||||
#==========================
|
||||
# CALL: submit_mcsm_na()
|
||||
# Data: gid+streptomycin
|
||||
#==========================
|
||||
submit_mcsm_na(host_url = my_host
|
||||
, pdb_file = my_pdb_file
|
||||
, mutation_list = my_mutation_list
|
||||
, nuc_type = my_nuc_type
|
||||
, prediction_url = my_prediction_url
|
||||
, output_dir = outdir_mcsm_na
|
||||
, outfile_suffix = my_suffix)
|
||||
#%%#####################################################################
|
27
mcsm_na/split_csv.sh
Executable file
|
@ -0,0 +1,27 @@
|
|||
#!/bin/bash
|
||||
|
||||
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
|
||||
|
||||
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_csv.sh <input file> <output dir> <chunk size in lines>
|
||||
# copy your snp file to split into the mcsm_na dir
|
||||
|
||||
INFILE=$1
|
||||
OUTDIR=$2
|
||||
CHUNK=$3
|
||||
|
||||
mkdir -p ${OUTDIR}/${CHUNK}
|
||||
cd ${OUTDIR}/${CHUNK}
|
||||
|
||||
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
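# e.g. (hypothetical input) a 100-line snp file with CHUNK=20 produces
# snp_batch_00 ... snp_batch_04 inside ${OUTDIR}/${CHUNK} (split -d numbers the suffixes)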
|
||||
|
||||
# use case
|
||||
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh gid_mcsm_formatted_snps.csv snp_batches 50
|
||||
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh embb_mcsm_formatted_snps.csv snp_batches 50
|
||||
|
||||
|
||||
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh rpob_mcsm_formatted_snps_chain.csv snp_batches 20 # date: 17/11/2021
|
||||
|
||||
|
||||
# accidentally replaced the original rpob batches file
|
||||
|
||||
#~/git/LSHTM_analysis/mcsm_na/split_csv.sh 5uhc_mcsm_formatted_snps_chain.csv snp_batches_5uhc 20 # date: 17/11/2021
|
19
mcsm_na/split_format_csv.sh
Executable file
|
@ -0,0 +1,19 @@
|
|||
#!/bin/bash
|
||||
|
||||
# FIXME: This is written for expediency to kickstart running dynamut and mcsm-NA
|
||||
|
||||
# Usage: ~/git/LSHTM_analysis/mcsm_na/split_format_csv.sh <input file> <output dir> <chunk size in lines>
|
||||
# copy your snp file to split into the mcsm_na dir
|
||||
|
||||
INFILE=$1
|
||||
OUTDIR=$2
|
||||
CHUNK=$3
|
||||
|
||||
mkdir -p ${OUTDIR}/${CHUNK}
|
||||
cd ${OUTDIR}/${CHUNK}
|
||||
|
||||
split ../../${INFILE} -l ${CHUNK} -d snp_batch_
|
||||
for i in *; do mv $i $i.txt; done
|
||||
sed -i 's/^/A /g' *.txt
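# the sed call prepends the chain ID so each line matches the mcsm-NA
# mutation-list format, e.g. 'P3S' becomes 'A P3S'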
|
||||
|
||||
|
84
mcsm_na/submit_mcsm_na.py
Executable file
|
@ -0,0 +1,84 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Aug 19 14:33:51 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os,sys
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
#%%#####################################################################
|
||||
def submit_mcsm_na(host_url
|
||||
, pdb_file
|
||||
, mutation_list
|
||||
, nuc_type
|
||||
, prediction_url
|
||||
, output_dir
|
||||
, outfile_suffix
|
||||
):
|
||||
"""
|
||||
Makes a POST request for mcsm_na predictions.
|
||||
|
||||
@param host_url: valid host url for submitting the job
|
||||
@type string
|
||||
|
||||
@param pdb_file: valid path to pdb structure
|
||||
@type string
|
||||
|
||||
@param mutation_list: list of mutations (1 per line) of the format: {chain} {WT}<POS>{Mut}, e.g. 'A X1Z'
|
||||
@type string
|
||||
|
||||
@param nuc_type: Nucleic acid type
|
||||
@type string
|
||||
|
||||
@param prediction_url: mcsm_na url for prediction
|
||||
@type string
|
||||
|
||||
@param output_dir: output dir
|
||||
@type string
|
||||
|
||||
@param outfile_suffix: outfile_suffix
|
||||
@type string
|
||||
|
||||
@return writes a .txt file containing url for the snps processed with user provided suffix in filename
|
||||
@type string
|
||||
"""
|
||||
|
||||
with open(pdb_file, "rb") as pdb_file, open (mutation_list, "rb") as mutation_list:
|
||||
files = {"wild": pdb_file
|
||||
, "mutation_list": mutation_list}
|
||||
body = {"na_type": nuc_type
|
||||
,"pred_type": 'list',
|
||||
"pdb_code": ''} # apparently needs it even though blank!
|
||||
|
||||
response = requests.post(prediction_url, files = files, data = body)
|
||||
print(response.status_code)
|
||||
if response.history:
|
||||
print('\nPASS: valid submission. Fetching result url')
|
||||
url_match = re.search('/mcsm_na/results_prediction/.+(?=")', response.text)
|
||||
url = host_url + url_match.group()
|
||||
print('\nURL for snp batch no ', str(outfile_suffix), ':', url)
|
||||
|
||||
#===============
|
||||
# writing file: result urls
|
||||
#===============
|
||||
mcsm_na_temp_dir = output_dir + '/mcsm_na_temp' # creates a temp dir within output_dir
|
||||
if not os.path.exists(mcsm_na_temp_dir):
|
||||
print('\nCreating mcsm_na_temp in output_dir', output_dir )
|
||||
os.makedirs(mcsm_na_temp_dir)
|
||||
|
||||
out_url_file = mcsm_na_temp_dir + '/mcsm_na_result_url_' + str(outfile_suffix) + '.txt'
|
||||
print('\nWriting output url file:', out_url_file)
|
||||
myfile = open(out_url_file, 'a')
|
||||
myfile.write(url)
|
||||
myfile.close()
|
||||
#%%#####################################################################
|
2
mcsm_na/test_snps_b1.csv
Normal file
|
@ -0,0 +1,2 @@
|
|||
A P3S
|
||||
A I4N
|
|
158
mcsm_ppi2/format_results_mcsm_ppi2.py
Executable file
|
@ -0,0 +1,158 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Aug 19 14:33:51 2020
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import os,sys
|
||||
homedir = os.path.expanduser('~')
|
||||
import subprocess
|
||||
import argparse
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pandas.api.types import is_string_dtype
|
||||
from pandas.api.types import is_numeric_dtype
|
||||
|
||||
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
|
||||
from reference_dict import up_3letter_aa_dict
|
||||
from reference_dict import oneletter_aa_dict
|
||||
#%%============================================================================
|
||||
|
||||
def format_mcsm_ppi2_output(mcsm_ppi2_output_csv):
|
||||
"""
|
||||
@param mcsm_ppi2_output_csv: file containing mcsm_ppi2_results for all mcsm snps
|
||||
which is the result of combining all mcsm_ppi2 batch results, and using
|
||||
bash scripts to combine all the batch results into one file.
|
||||
Formatting df to a pandas df and output as csv.
|
||||
@type string
|
||||
|
||||
@return formatted mcsm_ppi2 output (returned as a pandas df; the calling script writes it to csv)
|
||||
@type pandas df
|
||||
|
||||
"""
|
||||
#############
|
||||
# Read file
|
||||
#############
|
||||
mcsm_ppi2_data_raw = pd.read_csv(mcsm_ppi2_output_csv, sep = ',')
|
||||
|
||||
# strip white space from both ends in all columns
|
||||
mcsm_ppi2_data = mcsm_ppi2_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
|
||||
|
||||
dforig_shape = mcsm_ppi2_data.shape
|
||||
print('dimensions of input file:', dforig_shape)
|
||||
|
||||
#############
|
||||
# Map 3 letter
|
||||
# code to one
|
||||
#############
|
||||
# initialise a sub dict that is lookup dict for
|
||||
# 3-LETTER aa code to 1-LETTER aa code
|
||||
lookup_dict = dict()
|
||||
for k, v in up_3letter_aa_dict.items():
|
||||
lookup_dict[k] = v['one_letter_code']
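# e.g. lookup_dict['PRO'] would then be 'P' (standard 3-letter to 1-letter amino acid mapping)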
|
||||
wt = mcsm_ppi2_data['wild-type'].squeeze() # converts to a series that map works on
|
||||
mcsm_ppi2_data['w_type'] = wt.map(lookup_dict)
|
||||
mut = mcsm_ppi2_data['mutant'].squeeze()
|
||||
mcsm_ppi2_data['m_type'] = mut.map(lookup_dict)
|
||||
|
||||
# #############
|
||||
# # CHECK
|
||||
# # Map 1 letter
|
||||
# # code to 3Upper
|
||||
# #############
|
||||
# # initialise a sub dict that is lookup dict for
|
||||
# # 3-LETTER aa code to 1-LETTER aa code
|
||||
# lookup_dict = dict()
|
||||
# for k, v in oneletter_aa_dict.items():
|
||||
# lookup_dict[k] = v['three_letter_code_upper']
|
||||
# wt = mcsm_ppi2_data['w_type'].squeeze() #converts to a series that map works on
|
||||
# mcsm_ppi2_data['WILD'] = wt.map(lookup_dict)
|
||||
# mut = mcsm_ppi2_data['m_type'].squeeze()
|
||||
# mcsm_ppi2_data['MUT'] = mut.map(lookup_dict)
|
||||
|
||||
# # check
|
||||
# mcsm_ppi2_data['wild-type'].equals(mcsm_ppi2_data['WILD'])
|
||||
# mcsm_ppi2_data['mutant'].equals(mcsm_ppi2_data['MUT'])
|
||||
#%%============================================================================
|
||||
#############
|
||||
# rename cols
|
||||
#############
|
||||
# format colnames: all lowercase and consistent colnames
|
||||
mcsm_ppi2_data.columns
|
||||
print('Assigning meaningful colnames'
|
||||
, '\n=======================================================')
|
||||
|
||||
my_colnames_dict = {'chain': 'chain'
|
||||
, 'wild-type': 'wt_upper'
|
||||
, 'res-number': 'position'
|
||||
, 'mutant': 'mut_upper'
|
||||
, 'distance-to-interface': 'interface_dist'
|
||||
, 'mcsm-ppi2-prediction': 'mcsm_ppi2_affinity'
|
||||
, 'affinity': 'mcsm_ppi2_outcome'
|
||||
, 'w_type': 'wild_type' # one letter amino acid code
|
||||
, 'm_type': 'mutant_type' # one letter amino acid code
|
||||
}
|
||||
|
||||
mcsm_ppi2_data.rename(columns = my_colnames_dict, inplace = True)
|
||||
mcsm_ppi2_data.columns
|
||||
|
||||
#############
|
||||
# create mutationinformation column
|
||||
#############
|
||||
#mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data['wild_type'] + mcsm_ppi2_data.position.map(str) + mcsm_ppi2_data['mutant_type']
|
||||
mcsm_ppi2_data['mutationinformation'] = mcsm_ppi2_data.loc[:,'wild_type'] + mcsm_ppi2_data.loc[:,'position'].astype(int).apply(str) + mcsm_ppi2_data.loc[:,'mutant_type']
|
||||
|
||||
#%%=====================================================================
|
||||
#########################
|
||||
# scale mcsm_ppi2 values
|
||||
#########################
|
||||
# Rescale values in mcsm_ppi2_affinity col b/w -1 and 1 so negative numbers
|
||||
# stay neg and pos numbers stay positive
|
||||
mcsm_ppi2_min = mcsm_ppi2_data['mcsm_ppi2_affinity'].min()
|
||||
mcsm_ppi2_max = mcsm_ppi2_data['mcsm_ppi2_affinity'].max()
|
||||
|
||||
mcsm_ppi2_scale = lambda x : x/abs(mcsm_ppi2_min) if x < 0 else (x/mcsm_ppi2_max if x >= 0 else 'failed')
|
||||
|
||||
mcsm_ppi2_data['mcsm_ppi2_scaled'] = mcsm_ppi2_data['mcsm_ppi2_affinity'].apply(mcsm_ppi2_scale)
|
||||
print('Raw mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_affinity']
|
||||
, '\n---------------------------------------------------------------'
|
||||
, '\nScaled mcsm_ppi2 scores:\n', mcsm_ppi2_data['mcsm_ppi2_scaled'])
|
||||
|
||||
c = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_affinity']>=0].count()
|
||||
mcsm_ppi2_pos = c.get(key = 'mcsm_ppi2_affinity')
|
||||
|
||||
c2 = mcsm_ppi2_data[mcsm_ppi2_data['mcsm_ppi2_scaled']>=0].count()
|
||||
mcsm_ppi2_pos2 = c2.get(key = 'mcsm_ppi2_scaled')
|
||||
|
||||
if mcsm_ppi2_pos == mcsm_ppi2_pos2:
|
||||
print('\nPASS: Affinity values scaled correctly')
|
||||
else:
|
||||
print('\nFAIL: Affinity values scaled numbers MISmatch'
|
||||
, '\nExpected number:', mcsm_ppi2_pos
|
||||
, '\nGot:', mcsm_ppi2_pos2
|
||||
, '\n======================================================')
|
||||
|
||||
#%%=====================================================================
|
||||
#############
|
||||
# reorder columns
|
||||
#############
|
||||
mcsm_ppi2_data.columns
|
||||
mcsm_ppi2_dataf = mcsm_ppi2_data[['mutationinformation'
|
||||
, 'mcsm_ppi2_affinity'
|
||||
, 'mcsm_ppi2_scaled'
|
||||
, 'mcsm_ppi2_outcome'
|
||||
, 'interface_dist'
|
||||
, 'wild_type'
|
||||
, 'position'
|
||||
, 'mutant_type'
|
||||
, 'wt_upper'
|
||||
, 'mut_upper'
|
||||
, 'chain']]
|
||||
return(mcsm_ppi2_dataf)
|
||||
#%%#####################################################################
|
82
mcsm_ppi2/run_format_results_mcsm_ppi2.py
Executable file
|
@ -0,0 +1,82 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Feb 12 12:15:26 2021
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
#%% load packages
|
||||
import sys, os
|
||||
homedir = os.path.expanduser('~')
|
||||
#sys.path.append(homedir + '/git/LSHTM_analysis/mcsm_ppi2')
|
||||
|
||||
from format_results_mcsm_ppi2 import *
|
||||
########################################################################
|
||||
#%% command line args
|
||||
arg_parser = argparse.ArgumentParser()
|
||||
arg_parser.add_argument('-d', '--drug' , help = 'drug name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('-g', '--gene' , help = 'gene name (case sensitive)', default = None)
|
||||
arg_parser.add_argument('--datadir' , help = 'Data Directory. By default, it assumes homedir + git/Data')
|
||||
arg_parser.add_argument('-i', '--input_dir' , help = 'Input dir containing pdb files. By default, it assumes homedir + <drug> + input')
|
||||
arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assumes homedir + <drug> + output')
|
||||
arg_parser.add_argument('--input_file' , help = 'Input file with the combined mcsm_ppi2 results. By default, it assumes <output_dir> + mcsm_ppi2/ + <gene>_output_combined_clean.csv')
|
||||
|
||||
#arg_parser.add_argument('--mkdir_name' , help = 'Output dir for processed results. This will be created if it does not exist')
|
||||
arg_parser.add_argument('-m', '--make_dirs' , help = 'Make dir for input and output', action='store_true')
|
||||
arg_parser.add_argument('--debug' , action = 'store_true' , help = 'Debug Mode')
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
#%%============================================================================
|
||||
# variable assignment: input and output paths & filenames
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
datadir = args.datadir
|
||||
indir = args.input_dir
|
||||
outdir = args.output_dir
|
||||
infile_mcsm_ppi2 = args.input_file
|
||||
|
||||
#outdir_ppi2 = args.mkdir_name
|
||||
make_dirs = args.make_dirs
|
||||
|
||||
#=======
|
||||
# dirs
|
||||
#=======
|
||||
if not datadir:
|
||||
datadir = homedir + '/git/Data/'
|
||||
|
||||
if not indir:
|
||||
indir = datadir + drug + '/input/'
|
||||
|
||||
if not outdir:
|
||||
outdir = datadir + drug + '/output/'
|
||||
|
||||
#if not mkdir_name:
|
||||
# outdir_ppi2 = outdir + 'mcsm_ppi2/'
|
||||
|
||||
outdir_ppi2 = outdir + 'mcsm_ppi2/'
|
||||
|
||||
# Input file
|
||||
if not infile_mcsm_ppi2:
|
||||
infile_mcsm_ppi2 = outdir_ppi2 + gene.lower() + '_output_combined_clean.csv'
|
||||
|
||||
# Formatted output file
|
||||
outfile_mcsm_ppi2_f = outdir_ppi2 + gene.lower() + '_complex_mcsm_ppi2_norm.csv'
|
||||
|
||||
#==========================
|
||||
# CALL: format_mcsm_ppi2_output()
|
||||
# Data: gid+streptomycin
|
||||
#==========================
|
||||
print('Formatting results for:', infile_mcsm_ppi2)
|
||||
mcsm_ppi2_df_f = format_mcsm_ppi2_output(mcsm_ppi2_output_csv = infile_mcsm_ppi2)
|
||||
|
||||
# writing file
|
||||
print('Writing formatted df to csv')
|
||||
mcsm_ppi2_df_f.to_csv(outfile_mcsm_ppi2_f, index = False)
|
||||
|
||||
print('Finished writing file:'
|
||||
, '\nFile:', outfile_mcsm_ppi2_f
|
||||
, '\nExpected no. of rows:', len(mcsm_ppi2_df_f)
|
||||
, '\nExpected no. of cols:', len(mcsm_ppi2_df_f.columns)
|
||||
, '\n=============================================================')
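# Example invocation (hypothetical paths; drug/gene as in the comment above):
#   ./run_format_results_mcsm_ppi2.py -d streptomycin -g gid
# With the default dirs this reads
#   ~/git/Data/streptomycin/output/mcsm_ppi2/gid_output_combined_clean.csv
# and writes gid_complex_mcsm_ppi2_norm.csv to the same mcsm_ppi2/ folder.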
|
||||
|
||||
#%%#####################################################################
|
|
@ -1,512 +0,0 @@
|
|||
, stringsAsFactors = F)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
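# Note (added for clarity): summary(model)$coefficients is a matrix with one row
# per term (row 1 = intercept, row 2 = x, the snp) and columns Estimate,
# Std. Error, z value and Pr(>|z|); the [2,1], [2,2], [2,3] and [2,4] look-ups
# below therefore pull out the snp's log OR, SE, Z-value and p-value.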
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR, i.e. exp(my_logor), from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract the confidence interval of the snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
|
||||
my_logor
|
||||
pnca_snps_or$Mutationinformation == i
|
||||
View(pnca_snps_or)
|
||||
#===============
|
||||
# Step 4: Calculate for one snp
|
||||
# using i here makes it easy when you run the loop later
|
||||
#===============
|
||||
i = "pnca_p.trp68gly"
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
# uncomment as necessary
|
||||
pnca_snps_or = pnca_snps_or[1:5,]
|
||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
||||
#===============
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
pnca_snps_or = pnca_snps_or[1:5,]
|
||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
||||
pnca_snps_or = pnca_snps_or[1:5]
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
pnca_snps_or = pnca_snps_or[1:5]
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
foo = pnca_snps_or[c(1:5,)]
|
||||
foo = pnca_snps_or[c(1:5),]
|
||||
foo = as.data.frame(pnca_snps_or[c(1:5),])
|
||||
View(foo)
|
||||
# create an empty dataframe
|
||||
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR, i.e. exp(my_logor), from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract the confidence interval of the snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR, but only for one snp
|
||||
# this is a test before you apply it to all the others
|
||||
#===============
|
||||
pnca_snps_or$mutation == i
|
||||
View(pnca_snps_or)
|
||||
# create an empty dataframe
|
||||
pnca_snps_or = data.frame(mutation = i)
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
View(pnca_snps_or_copy)
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR, but only for one snp
|
||||
# this is a test before you apply it to all the others
|
||||
#===============
|
||||
#reset original df so you don't make a mistake
|
||||
pnca_snps_or = pnca_snps_or_copy
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
}
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
||||
View(pnca_snps_or)
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR, i.e. exp(my_logor), from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract the confidence interval of the snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
View(pnca_snps_or)
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR, i.e. exp(my_logor), from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract the confidence interval of the snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
warnings()
|
||||
View(pnca_snps_or)
|
||||
View(pnca_snps_or_copy)
|
||||
#sanity check
|
||||
pnca_snps_or$mutation == i1
|
||||
#sanity check
|
||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
||||
pnca_snps_or1 = unique(pnca_snps_or)
|
||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
||||
# you only need it for the unique mutations
|
||||
pnca_snps_or = unique(pnca_snps_or) #2133, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR, i.e. exp(my_logor), from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract the confidence interval of the snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
View(pnca_snps_or)
|
||||
2.290256e+01
|
||||
1.561132e+06
|
||||
3.242285e-04
|
||||
#sanity check
|
||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
View(my_data)
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
snps_all = unique(my_data$mutation)# 337
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or = as.data.frame(snps_all)
|
||||
View(pnca_snps_or)
|
||||
snps_all[-"true_wt"]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
View(pnca_snps_or)
|
||||
snps_all = as.data.frame(snps_all)
|
||||
View(snps_all)
|
||||
#remove true_wt entry
|
||||
w1 = which(rownames(snps_all) == "true_wt")
|
||||
View(snps_all)
|
||||
#remove true_wt entry
|
||||
w1 = which(snps_all$snps_all == "true_wt")
|
||||
rm(pnca_snps_or)
|
||||
pnca_snps_or = snps_all[-w1]
|
||||
pnca_snps_or = snps_all[,-w1]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
#remove true_wt entry
|
||||
w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
snps_all = unique(my_data$mutation)# 337
|
||||
snps_all = as.data.frame(snps_all)
|
||||
snps_all[-c(1,1)]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
|
||||
#remove true_wt entry
|
||||
#w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or = pnca_snps_or_copy
|
||||
#remove true_wt entry
|
||||
#w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or -> pnca_snps_or_copy
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR for each snp
|
||||
# and assign to the pnca_snps_or df that has
|
||||
# each row as a unique snp
|
||||
#===============
|
||||
# reset original df so you don't make a mistake: IMPORTANT
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
# you only need it for the unique mutations
|
||||
pnca_snps_or = unique(pnca_snps_or) #337, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Derive OR, i.e. exp(my_logor), from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract the confidence interval of the snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
getwd()
|
||||
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
|
||||
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
|
||||
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
|
||||
getwd()
|
||||
#===============
|
||||
# Step 1: read raw data
|
||||
#===============
|
||||
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
|
||||
,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
|
||||
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
|
||||
# combine the two mutation columns
|
||||
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
|
||||
head(raw_data$all_mutations_pyrazinamide)
|
||||
# create yet another column that contains all the mutations but in lower case
|
||||
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
|
||||
table(grepl("pnca_p",raw_data$all_muts_pza))
|
||||
#FALSE TRUE
|
||||
#10603 1908
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
# subset a small section to test
|
||||
#pnca_snps_or_copy = pnca_snps_or
|
||||
#pnca_snps_or = pnca_snps_or_copy
|
||||
pnca_snps_unique = unique(pnca_snps_or$mutation) #293
|
||||
i2 = "pnca_p.trp68gly" # Should exist
|
||||
grep(i2, pnca_snps_unique)
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
head(my_data$mutation)
|
||||
my_data = unique(my_data$mutation)
|
||||
my_data[!duplicated(my_data$mutation)]
|
||||
my_data_unique = my_data[!duplicated(my_data$mutation),]
|
||||
my_data[!duplicated('mutation'),]
|
||||
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
|
||||
my_data_unique = my_data[!duplicated(my_data['mutation']),]
|
||||
getwd()
|
||||
setwd("/git/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
getwd()
|
||||
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
|
||||
c = file.choose()
|
||||
c = file.choose(../Data_original)
|
||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F))
|
||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
|
||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
||||
# define output variables
|
||||
drug = 'pyrazinamide'
|
||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
||||
outdir = paste0("../mcsm_analysis/",drug,"/Data/")
|
||||
outFile = "meta_data_with_AFandOR.csv"
|
||||
output_filename = paste0(outdir, outFile)
|
||||
output_filename
|
Binary file not shown.
|
@ -1,7 +0,0 @@
|
|||
#!/usr/bin/python3
|
||||
# Initialise a blank 'Data' directory and drug subdirs etc.
|
||||
# TODO:
|
||||
# - Read base dir from config file
|
||||
# - Create eg: '~/git/Data/{original,processed}
|
||||
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
|
||||
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
|
|
@ -1,241 +0,0 @@
|
|||
getwd()
|
||||
setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
|
||||
raw_data = infile[,c("id"
|
||||
, "pyrazinamide"
|
||||
, "dr_mutations_pyrazinamide"
|
||||
, "other_mutations_pyrazinamide")]
|
||||
|
||||
#####
|
||||
# 1a: exclude na
|
||||
#####
|
||||
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
|
||||
|
||||
total_samples = length(unique(raw_data$id))
|
||||
print(total_samples)
|
||||
|
||||
# sanity check: should be true
|
||||
is.numeric(total_samples)
|
||||
|
||||
#####
|
||||
# 1b: combine the two mutation columns
|
||||
#####
|
||||
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
|
||||
, raw_data$other_mutations_pyrazinamide)
|
||||
head(raw_data$all_mutations_pyrazinamide)
|
||||
|
||||
#####
|
||||
# 1c: create yet another column that contains all the mutations but in lower case
|
||||
#####
|
||||
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
|
||||
|
||||
# sanity checks
|
||||
table(grepl("pnca_p",raw_data$all_muts_pnca))
|
||||
|
||||
# sanity check: should be TRUE
|
||||
sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
|
||||
|
||||
# set up variables: can be used for logistic regression as well
|
||||
i = "pnca_p.ala134gly" # has a NA, should NOT exist
|
||||
table(grepl(i,raw_data$all_muts_pnca))
|
||||
|
||||
i = "pnca_p.trp68gly"
|
||||
table(grepl(i,raw_data$all_muts_pnca))
|
||||
|
||||
mut = grepl(i,raw_data$all_muts_pnca)
|
||||
dst = raw_data$pyrazinamide
|
||||
table(mut, dst)
|
||||
|
||||
#chisq.test(table(mut,dst))
|
||||
#fisher.test(table(mut, dst))
|
||||
#table(mut)
|
||||
|
||||
###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
|
||||
pnca_snps_or = read.csv(file.choose()
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
|
||||
# extract unique snps to iterate over for AF and OR calcs
|
||||
# total no of unique snps
|
||||
# AF and OR calculations
|
||||
|
||||
pnca_snps_unique = unique(pnca_snps_or$mutation)
|
||||
|
||||
# Define OR function
|
||||
x = as.numeric(mut)
|
||||
y = dst
|
||||
or = function(x,y){
|
||||
tab = as.matrix(table(x,y))
|
||||
a = tab[2,2]
|
||||
if (a==0){ a<-0.5}
|
||||
b = tab[2,1]
|
||||
if (b==0){ b<-0.5}
|
||||
c = tab[1,2]
|
||||
if (c==0){ c<-0.5}
|
||||
d = tab[1,1]
|
||||
if (d==0){ d<-0.5}
|
||||
(a/b)/(c/d)
|
||||
|
||||
}
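# Note (added for clarity): or() builds the 2x2 table of mutation presence (x)
# against phenotype (y) and returns the odds ratio (a/b)/(c/d); any zero cell is
# replaced by 0.5 (Haldane-Anscombe correction) so the ratio stays finite.
# Minimal usage sketch with made-up vectors:
#   or(x = c(1,1,0,0,0), y = c(1,0,1,0,0))   # -> 2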
|
||||
|
||||
dst = raw_data$pyrazinamide
|
||||
ors = sapply(pnca_snps_unique,function(m){
|
||||
mut = grepl(m,raw_data$all_muts_pnca)
|
||||
or(mut,dst)
|
||||
})
|
||||
|
||||
ors
|
||||
|
||||
pvals = sapply(pnca_snps_unique,function(m){
|
||||
mut = grepl(m,raw_data$all_muts_pnca)
|
||||
fisher.test(mut,dst)$p.value
|
||||
})
|
||||
|
||||
pvals
|
||||
|
||||
afs = sapply(pnca_snps_unique,function(m){
|
||||
mut = grepl(m,raw_data$all_muts_pnca)
|
||||
mean(mut)
|
||||
})
|
||||
|
||||
afs
|
||||
|
||||
# check ..hmmm
|
||||
afs['pnca_p.trp68gly']
|
||||
afs['pnca_p.gln10pro']
|
||||
afs['pnca_p.leu4ser']
|
||||
|
||||
#plot(density(log(ors)))
|
||||
#plot(-log10(pvals))
|
||||
#hist(log(ors)
|
||||
# ,breaks = 100
|
||||
# )
|
||||
|
||||
# subset df cols to add to the calc param df
|
||||
pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')]
|
||||
pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
|
||||
|
||||
rownames(pnca_snps_cols) = pnca_snps_cols$mutation
|
||||
head(rownames(pnca_snps_cols))
|
||||
#snps_with_AF_and_OR
|
||||
|
||||
# combine
|
||||
comb_AF_and_OR = data.frame(ors, pvals, afs)
|
||||
head(rownames(comb_AF_and_OR))
|
||||
|
||||
# sanity checks: should be the same
|
||||
dim(comb_AF_and_OR); dim(pnca_snps_cols)
|
||||
table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
|
||||
|
||||
table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
|
||||
|
||||
# merge the above two df whose dim you checked
|
||||
snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
|
||||
, by = "row.names"
|
||||
# , all.x = T
|
||||
)
|
||||
|
||||
#rm(pnca_snps_cols, pnca_snps_or, raw_data)
|
||||
|
||||
#===============
|
||||
# Step 3: Read data file where you will add the calculated OR
|
||||
# Note: this is the big file with one-many relationship between snps and lineages
|
||||
# i.e fname4 from 'pnca_extraction.py'
|
||||
#===============
|
||||
my_data = read.csv(file.choose()
|
||||
, row.names = 1
|
||||
, stringsAsFactors = FALSE)
|
||||
|
||||
head(my_data)
|
||||
length(unique(my_data$id))
|
||||
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
|
||||
# sanity check
|
||||
head(my_data$mutation)
|
||||
|
||||
# FILES TO MERGE:
|
||||
# comb_AF_and_OR: file containing OR
|
||||
# my_data = big meta data file
|
||||
# linking column: mutation
|
||||
|
||||
head(my_data)
|
||||
merged_df = merge(my_data # big file
|
||||
, snps_with_AF_and_OR # small (afor file)
|
||||
, by = "mutation"
|
||||
, all.x = T) # because you want all the entries of the meta data
|
||||
|
||||
# sanity checks: should be True
|
||||
# FIXME: I have checked this manually, but make it so it is a pass or a fail!
|
||||
comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors
|
||||
merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
|
||||
|
||||
merged_df[merged_df$Mutationinformation.x == "Q10P",]
|
||||
|
||||
# sanity check: very important!
|
||||
colnames(merged_df)
|
||||
|
||||
table(merged_df$mutation_info.x == merged_df$mutation_info.y)
|
||||
|
||||
#FIXME: what happened to other 7 and FALSE
|
||||
table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
|
||||
|
||||
# problem
|
||||
identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
|
||||
|
||||
#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
|
||||
|
||||
#throw away the y because that is a smaller df
|
||||
d1 = which(colnames(merged_df) == "mutation_info.y") #21
|
||||
d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
|
||||
|
||||
merged_df2 = merged_df[-c(d1, d2)] #3093 20
|
||||
colnames(merged_df2)
|
||||
|
||||
# rename cols
|
||||
colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
|
||||
colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
|
||||
|
||||
colnames(merged_df2)
|
||||
|
||||
# should be 0
|
||||
sum(is.na(merged_df2$Mutationinformation))
|
||||
|
||||
# count na in each column
|
||||
na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
|
||||
# only some or and Af should be NA
|
||||
#Row.names ors pvals afs
|
||||
#81 81 81 81
|
||||
|
||||
|
||||
colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
|
||||
colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
|
||||
colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
|
||||
|
||||
colnames(merged_df2)
|
||||
|
||||
# add log OR and neglog pvalue
|
||||
merged_df2$logor = log(merged_df2$OR)
|
||||
is.numeric(merged_df2$logor)
|
||||
|
||||
merged_df2$neglog10pvalue = -log10(merged_df2$pvalue)
|
||||
is.numeric(merged_df2$neglog10pvalue)
|
||||
|
||||
# write file out
|
||||
#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
|
||||
|
||||
# define output variables
|
||||
drug = 'pyrazinamide'
|
||||
out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
|
||||
outFile = "meta_data_with_AFandOR.csv"
|
||||
output_filename = paste0(outdir, outFile)
|
||||
|
||||
write.csv(merged_df2, output_filename
|
||||
, row.names = F)
|
|
@ -1,626 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Aug 6 12:56:03 2019
|
||||
|
||||
@author: tanu
|
||||
"""
|
||||
|
||||
# FIXME: include error checking to ensure you only
|
||||
# concentrate on positions that have structural info?
|
||||
|
||||
#%% load libraries
|
||||
###################
|
||||
# load libraries
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
#import numpy as np
|
||||
|
||||
#from pandas.api.types import is_string_dtype
|
||||
#from pandas.api.types import is_numeric_dtype
|
||||
|
||||
# to create dir
|
||||
#my_dir = os.path.expanduser('~/some_dir')
|
||||
#make sure mcsm_analysis/ exists
|
||||
#or specify the output directory
|
||||
|
||||
#%%
|
||||
#%%
|
||||
#%%
|
||||
#========================================================
|
||||
# TASK: extract ALL pncA mutations from GWAS data
|
||||
#========================================================
|
||||
#%%
|
||||
####################
|
||||
# my working dir
|
||||
os.getcwd()
|
||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
|
||||
os.getcwd()
|
||||
#%%
|
||||
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
|
||||
#%%
|
||||
#NOTE: Out_dir MUST exist
|
||||
# User defined dir structure
|
||||
drug = 'pyrazinamide'
|
||||
gene = 'pnca'
|
||||
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
|
||||
# = out_dir + drug
|
||||
data_dir = homedir + '/git/Data'
|
||||
|
||||
if not os.path.exists(data_dir):
|
||||
print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
|
||||
os.makedirs(data_dir)
|
||||
sys.exit()
|
||||
|
||||
if not os.path.exists(out_dir):
|
||||
print('Error!', out_dir, 'does not exist. Please create it')
|
||||
exit()
|
||||
|
||||
#if not os.path.exists(work_dir):
|
||||
# print('creating dir that does not exist', 'dir_name:', work_dir)
|
||||
# os.makedirs(work_dir)
|
||||
else:
|
||||
print('Dir exists: Carrying on')
|
||||
|
||||
# now create sub dir structure within work_dir
|
||||
# pyrazinamide/mcsm_analysis
|
||||
|
||||
# we need three dir
|
||||
# Data
|
||||
# Scripts
|
||||
# Plotting
|
||||
# Results
|
||||
# Plots
|
||||
|
||||
# create a list of dir names
|
||||
#dir_names = ['Data', 'Scripts', 'Results']
|
||||
|
||||
|
||||
#for i in dir_names:
|
||||
# this_dir = (work_dir + '/' + i)
|
||||
# if not os.path.exists(this_dir):
|
||||
# print('creating dir that does not exist:', this_dir)
|
||||
# os.makedirs(this_dir)
|
||||
#else:
|
||||
# print('Dir exists: Carrying on')
|
||||
|
||||
# Create sub dirs
|
||||
# 1)
|
||||
# Scripts
|
||||
# Plotting
|
||||
#subdir_plotting = work_dir + '/Scripts/Plotting'
|
||||
#if not os.path.exists(subdir_plotting):
|
||||
# print('creating dir that does not exist:', subdir_plotting)
|
||||
# os.makedirs(subdir_plotting)
|
||||
#else:
|
||||
# print('Dir exists: Carrying on')
|
||||
|
||||
# 2)
|
||||
# Results
|
||||
# Plots
|
||||
#subdir_plots = work_dir + '/Results/Plots'
|
||||
#if not os.path.exists(subdir_plots):
|
||||
# print('creating dir that does not exist:', subdir_plots)
|
||||
# os.makedirs(subdir_plots)
|
||||
#else:
|
||||
# print('Dir exists: Carrying on')
|
||||
|
||||
# clear variables
|
||||
#del(dir_names, drug, i, subdir_plots, subdir_plotting)
|
||||
|
||||
#exit()
|
||||
#%%
|
||||
#==============================================================================
|
||||
############
|
||||
# STEP 1: Read file original_tanushree_data_v2.csv
|
||||
############
|
||||
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
|
||||
meta_data = pd.read_csv(data_file, sep = ',')
|
||||
|
||||
# column names
|
||||
list(meta_data.columns)
|
||||
|
||||
# extract relevant columns from the meta data related to pyrazinamide
|
||||
meta_data = meta_data[['id'
|
||||
,'country'
|
||||
,'lineage'
|
||||
,'sublineage'
|
||||
,'drtype'
|
||||
, 'pyrazinamide'
|
||||
, 'dr_mutations_pyrazinamide'
|
||||
, 'other_mutations_pyrazinamide'
|
||||
]]
|
||||
|
||||
# checks
|
||||
total_samples = meta_data['id'].nunique() # 19265
|
||||
|
||||
# counts NA per column
|
||||
meta_data.isna().sum()
|
||||
|
||||
# glance
|
||||
meta_data.head()
|
||||
|
||||
# equivalent of table in R
|
||||
# pyrazinamide counts
|
||||
meta_data.pyrazinamide.value_counts()
|
||||
|
||||
#%%
|
||||
############
|
||||
# STEP 2: extract entries containing selected genes:
|
||||
# pyrazinamide: pnca_p.
|
||||
# in the dr_mutations and other mutations"
|
||||
# as we are interested in the mutations in the protein coding region only
|
||||
# (corresponding to a structure)
|
||||
# and drop the entries with NA
|
||||
#############
|
||||
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
|
||||
del(meta_pza)
|
||||
|
||||
##########################
|
||||
# pyrazinamide: pnca_p.
|
||||
##########################
|
||||
meta_data_pnca = meta_data[['id'
|
||||
,'country'
|
||||
,'lineage'
|
||||
,'sublineage'
|
||||
,'drtype'
|
||||
, 'pyrazinamide'
|
||||
, 'dr_mutations_pyrazinamide'
|
||||
, 'other_mutations_pyrazinamide'
|
||||
]]
|
||||
|
||||
del(meta_data)
|
||||
|
||||
# sanity checks
|
||||
|
||||
# dr_mutations only
|
||||
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
meta_pnca_dr['id'].nunique()
|
||||
del(meta_pnca_dr)
|
||||
|
||||
# other mutations
|
||||
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
|
||||
meta_pnca_other['id'].nunique()
|
||||
del(meta_pnca_other)
|
||||
|
||||
# Now extract "all" mutations
|
||||
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
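# Note (added for clarity): str.contains() interprets 'pncA_p.*' as a regex, so
# the '.' matches any character; that is harmless here because every matching
# entry literally contains 'pncA_p.' anyway.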
|
||||
|
||||
meta_pnca_all['id'].nunique()
|
||||
pnca_samples = len(meta_pnca_all)
|
||||
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
|
||||
comp_pnca_samples = pnca_samples - pnca_na
|
||||
|
||||
#=#=#=#=#=#=#
|
||||
# COMMENT: use it later to check number of complete samples from LF data
|
||||
#=#=#=#=#=#=#
|
||||
|
||||
# sanity checks
|
||||
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
|
||||
meta_pnca_all.other_mutations_pyrazinamide.value_counts()
|
||||
|
||||
# more sanity checks
|
||||
# !CAUTION!: muts will change depending on your gene
|
||||
|
||||
# dr muts : insert
|
||||
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] #
|
||||
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
|
||||
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
|
||||
|
||||
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
|
||||
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
|
||||
|
||||
# other_muts
|
||||
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
|
||||
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
|
||||
|
||||
#=#=#=#=#=#=#=#=#=#
|
||||
# FIXME
|
||||
# COMMENTS: both mutations columns are separated by ;
|
||||
# CHECK if there are mutations that exist both in dr and other_muts!
|
||||
# doesn't make any sense for the same mut to exist in both, I would have thought!
|
||||
#=#=#=#=#=#=#=#=#=#
|
||||
|
||||
# remove not required variables
|
||||
del(meta_data_pnca)
|
||||
|
||||
############
|
||||
# STEP 3: split the columns of
|
||||
# a) dr_mutation_... (;) as
|
||||
# the column has snps related to multiple genes.
|
||||
# useful links
|
||||
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
|
||||
# this one works beautifully
|
||||
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
|
||||
############
|
||||
|
||||
# sanity check: count NAs per column after subsetting the df, i.e. in meta_pnca_all (with the pncA_p. mutations extracted)
|
||||
meta_pnca_all.isna().sum()
|
||||
|
||||
#=#=#=#=#=#=#=#=#=#
|
||||
# COMMENT: no NA's in dr_mutations/other_mutations_columns
|
||||
#=#=#=#=#=#=#=#=#=#
|
||||
# define the split function
|
||||
def tidy_split(df, column, sep='|', keep=False):
|
||||
"""
|
||||
Split the values of a column and expand so the new DataFrame has one split
|
||||
value per row. Filters rows where the column is missing.
|
||||
|
||||
Params
|
||||
------
|
||||
df : pandas.DataFrame
|
||||
dataframe with the column to split and expand
|
||||
column : str
|
||||
the column to split and expand
|
||||
sep : str
|
||||
the string used to split the column's values
|
||||
keep : bool
|
||||
whether to retain the presplit value as its own row
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.DataFrame
|
||||
Returns a dataframe with the same columns as `df`.
|
||||
"""
|
||||
indexes = list()
|
||||
new_values = list()
|
||||
#df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case
|
||||
for i, presplit in enumerate(df[column].astype(str)):
|
||||
values = presplit.split(sep)
|
||||
if keep and len(values) > 1:
|
||||
indexes.append(i)
|
||||
new_values.append(presplit)
|
||||
for value in values:
|
||||
indexes.append(i)
|
||||
new_values.append(value)
|
||||
new_df = df.iloc[indexes, :].copy()
|
||||
new_df[column] = new_values
|
||||
return new_df
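# Minimal usage sketch for tidy_split() (hypothetical frame, not project data):
#   toy = pd.DataFrame({'id': [1], 'dr_mutations_pyrazinamide': ['pncA_p.A; pncA_p.B']})
#   tidy_split(toy, 'dr_mutations_pyrazinamide', sep = ';')
# returns two rows, one per mutation, with every other column duplicated
# (the leading space on the second value is why whitespace is stripped below).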
|
||||
|
||||
########
|
||||
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
|
||||
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
|
||||
########
|
||||
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';')
|
||||
|
||||
# remove leading white space else these are counted as distinct mutations as well
|
||||
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()
|
||||
|
||||
########
|
||||
# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
|
||||
########
|
||||
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';')
|
||||
|
||||
# remove the leading white spaces in the column
|
||||
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()
|
||||
|
||||
##########
|
||||
# Step 4: Reshape data so that all mutations are in one column and the
|
||||
# annotations for the mutation reflect the column name
|
||||
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
|
||||
|
||||
# data frame “df” is passed to melt() function
|
||||
# id_vars is the variable which need to be left unaltered
|
||||
# var_name are the column names so we named it as 'mutation_info'
|
||||
# value_name are its values so we named it as 'mutation'
|
||||
##########
|
||||
meta_pnca_WF1.columns
|
||||
|
||||
meta_pnca_LF0 = pd.melt(meta_pnca_WF1
|
||||
, id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
|
||||
, var_name = 'mutation_info'
|
||||
, value_name = 'mutation')
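# Illustration (hypothetical sample): one wide row with entries in both mutation
# columns becomes two long rows, one with mutation_info =
# 'dr_mutations_pyrazinamide' and one with 'other_mutations_pyrazinamide', each
# carrying that column's value in the new 'mutation' column -- hence the
# len(meta_pnca_WF1)*2 check below.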
|
||||
|
||||
# sanity check: should be true
|
||||
if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
|
||||
print('sanity check passed: Long format df has the expected length')
|
||||
else:
|
||||
print("Sanity check failed: Debug please!")
|
||||
|
||||
###########
|
||||
# Step 5: This is still dirty data. Filter LF data so that you only have
|
||||
# mutations corresponding to pnca_p.
|
||||
# this will be your list you run OR calcs
|
||||
###########
|
||||
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')]
|
||||
|
||||
# sanity checks
|
||||
# unique samples
|
||||
meta_pnca_LF1['id'].nunique()
|
||||
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
|
||||
print("Sanity check passed: No of samples with pncA mutations match")
|
||||
else:
|
||||
print("Sanity check failed: Debug please!")
|
||||
|
||||
# count if all the mutations are indeed in the protein coding region
|
||||
# i.e begin with pnca_p
|
||||
meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
|
||||
|
||||
# should be true.
|
||||
# and check against the length of the df, which should match
|
||||
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
|
||||
print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
|
||||
else:
|
||||
print("Sanity check failed: Debug please!")
|
||||
|
||||
###########
|
||||
# Step 6: Filter dataframe with "na" in the drug column
|
||||
# This is because for OR, you can't use the snps that have the
|
||||
# NA in the specified drug column
|
||||
# it creates problems when performing calcs in R inside the loop
|
||||
# so best to filter it here
|
||||
###########
|
||||
# NOT NEEDED FOR all snps, only for extracting valid OR snps
|
||||
del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
|
||||
|
||||
###########
|
||||
# Step 7: count unique pncA_p mutations (all and comp cases)
|
||||
###########
|
||||
meta_pnca_LF1['mutation'].nunique()
|
||||
meta_pnca_LF1.groupby('mutation_info').nunique()
|
||||
|
||||
meta_pnca_LF1['id'].nunique()
|
||||
meta_pnca_LF1['mutation'].nunique()
|
||||
meta_pnca_LF1.groupby('id').nunique()
|
||||
|
||||
###########
|
||||
# Step 8: convert all snps only (IN LOWERCASE)
|
||||
# because my integrated mcsm file has lowercase
|
||||
###########
|
||||
# convert mutation to lower case as it needs to exactly match the dict key
|
||||
#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
|
||||
meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
|
||||
|
||||
###########
|
||||
# Step 9 : Split 'mutation' column into three: wild_type, position and
|
||||
# mutant_type separately. Then map three letter code to one from the
|
||||
# reference_dict imported already. First convert the mutation to lowercase
|
||||
# to allow to match entries from dict
|
||||
###########
|
||||
#=======
|
||||
# Step 9a: iterate through the dict, create a lookup dict i.e
|
||||
# lookup_dict = {three_letter_code: one_letter_code}.
|
||||
# lookup dict should be the key and the value (you want to create a column for)
|
||||
# Then use this to perform the mapping separately for wild type and mutant cols.
|
||||
# The three letter code is extracted using a regex match from the dataframe and then converted
|
||||
# to a pandas Series, since map only works on a pandas Series
|
||||
#=======
|
||||
# initialise a sub dict that is a lookup dict for three letter code to one
|
||||
lookup_dict = dict()
|
||||
for k, v in my_aa_dict.items():
|
||||
lookup_dict[k] = v['one_letter_code']
|
||||
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
|
||||
meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)
|
||||
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
|
||||
meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
|
||||
|
||||
# extract position info from the mutation column separately using regex
|
||||
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')
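# Worked example (value taken from this dataset's naming scheme): for
# 'pnca_p.trp68gly' the first regex captures 'trp' (-> 'W' via lookup_dict), the
# second captures 'gly' (-> 'G') and r'(\d+)' pulls out 68, which Step 10 joins
# into Mutationinformation 'W68G'.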
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
#=========
|
||||
# Step 9b: iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_prop_water}
|
||||
# Do this for both wild_type and mutant as above.
|
||||
#=========
|
||||
# initialise a sub dict that is lookup dict for three letter code to aa prop
|
||||
lookup_dict = dict()
|
||||
|
||||
for k, v in my_aa_dict.items():
|
||||
lookup_dict[k] = v['aa_prop_water']
|
||||
#print(lookup_dict)
|
||||
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
|
||||
meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
|
||||
mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
|
||||
meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
|
||||
|
||||
# added two more cols
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
#========
|
||||
# Step 9c: iterate through the dict, create a lookup dict that i.e
|
||||
# lookup_dict = {three_letter_code: aa_prop_polarity}
|
||||
# Do this for both wild_type and mutant as above.
|
||||
#=========
|
||||
# initialise a sub dict that is lookup dict for three letter code to aa prop
|
||||
lookup_dict = dict()
|
||||
|
||||
for k, v in my_aa_dict.items():
|
||||
lookup_dict[k] = v['aa_prop_polarity']
|
||||
#print(lookup_dict)
|
||||
wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
|
||||
meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
|
||||
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
|
||||
meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
|
||||
|
||||
# added two more cols
|
||||
|
||||
# clear variables
|
||||
del(k, v, wt, mut, lookup_dict)
|
||||
|
||||
########
|
||||
# Step 10: combine the wild_type+position+mutant_type columns to generate
|
||||
# Mutationinformation (matches mCSM output field)
|
||||
# Remember to use .map(str) for int col types to allow string concatenation
|
||||
#########
|
||||
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
|
||||
|
||||
#=#=#=#=#=#=#
|
||||
# Step 11:
|
||||
# COMMENT: there is more processing in the older version of this script
|
||||
# consult if necessary
|
||||
# possibly due to the presence of true_wt
|
||||
# since this file doesn't contain any true_wt, we won't need it(hopefully!)
|
||||
#=#=#=#=#=#=#
|
||||
|
||||
#%%
|
||||
###########
|
||||
# Step 12: Output files for only SNPs to run mCSM
|
||||
###########
|
||||
|
||||
#=========
|
||||
# Step 12a: all SNPs to run mCSM
|
||||
#=========
|
||||
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
|
||||
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())
|
||||
|
||||
# assign meaningful colnames
|
||||
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
|
||||
#list(snps_only.columns)
|
||||
snps_only.isna().sum() # should be 0
|
||||
|
||||
# output csv: all SNPS for mCSM analysis
|
||||
# specify variable name for output file
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname1 = '_snps_'
|
||||
nrows = len(snps_only)
|
||||
|
||||
#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
|
||||
#output_file_path = work_dir + '/Data/'
|
||||
output_file_path = data_dir + '/input/processed/' + drug + '/'
|
||||
|
||||
if not os.path.exists(output_file_path):
|
||||
print( output_file_path, 'does not exist. Creating')
|
||||
os.makedirs(output_file_path)
|
||||
exit()
|
||||
|
||||
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
|
||||
print(output_filename) #<<<- check
|
||||
|
||||
# write to csv: without column or row names
|
||||
# Bad practice: numbers at the start of a filename
|
||||
snps_only.to_csv(output_filename, header = False, index = False)
|
||||
|
||||
#=========
|
||||
# Step 12b: all snps with annotation
|
||||
#=========
|
||||
# all snps, selected cols
|
||||
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
|
||||
# , 'drtype', 'pyrazinamide'
|
||||
# , 'mutation_info', 'mutation', 'Mutationinformation']]
|
||||
|
||||
#len(pnca_snps_ALL)
|
||||
|
||||
# sanity check
|
||||
#meta_pnca_LF1['mutation'].nunique()
|
||||
|
||||
# output csv: WITH column but WITHOUT row names(all snps with meta data)
|
||||
# specify variable name for output file
|
||||
#gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
#my_fname2 = '_snps_with_metadata_'
|
||||
#nrows = len(pnca_snps_ALL)
|
||||
|
||||
#output_file_path = work_dir + '/Data/'
|
||||
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
|
||||
#print(output_filename) #<<<- check
|
||||
|
||||
# write out file
|
||||
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
|
||||
|
||||
#=========
|
||||
# Step 12c: comp snps for OR calcs with annotation
|
||||
#=========
|
||||
# remove all NA's from pyrazinamide column from LF1
|
||||
|
||||
# counts NA per column
|
||||
meta_pnca_LF1.isna().sum()
|
||||
|
||||
# remove NA
|
||||
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
|
||||
|
||||
# sanity checks
|
||||
# should be True
|
||||
len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
|
||||
|
||||
# unique counts
|
||||
meta_pnca_LF2['mutation'].nunique()
|
||||
|
||||
meta_pnca_LF2.groupby('mutation_info').nunique()
|
||||
|
||||
# sanity check
|
||||
meta_pnca_LF2['id'].nunique()
|
||||
|
||||
# should be True
|
||||
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
|
||||
print ('sanity check passed: complete numbers match')
|
||||
else:
|
||||
print('Error: Please Debug!')
|
||||
|
||||
# value counts
|
||||
meta_pnca_LF2.mutation.value_counts()
|
||||
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
|
||||
|
||||
# valid/comp snps
|
||||
# uncomment as necessary
|
||||
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
|
||||
len(pnca_snps_COMP)
|
||||
|
||||
# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
|
||||
# specify variable name for output file
|
||||
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname3 = '_comp_snps_with_metadata_'
|
||||
nrows = len(pnca_snps_COMP)
|
||||
|
||||
#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
|
||||
#print(output_filename) #<<<-check
|
||||
|
||||
# write out file
|
||||
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
|
||||
|
||||
|
||||
#=========
|
||||
# Step 12d: comp snps only
|
||||
#=========
|
||||
# output csv: comp SNPS for info (i.e snps for which OR exist)
|
||||
# specify variable name for output file
|
||||
|
||||
snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
|
||||
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname1 = '_comp_snps_'
|
||||
nrows = len(snps_only)
|
||||
|
||||
output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
|
||||
print(output_filename) #<<<- check
|
||||
|
||||
# write to csv: without column or row names
|
||||
snps_only.to_csv(output_filename, header = False, index = False)
|
||||
|
||||
|
||||
#=#=#=#=#=#=#=#
|
||||
# COMMENT: LF1 is the file to extract all unique snps for mcsm
|
||||
# but you have that already in file called pnca_snps...
|
||||
# LF2: is the file for extracting snps tested for DS and hence OR calcs
|
||||
#=#=#=#=#=#=#=#
|
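# Illustrative sketch (not part of the original workflow): the practical difference
# between the two frames described in the comment above, using only variables
# already defined in this script.
snps_for_mcsm = meta_pnca_LF1['Mutationinformation'].unique() # all unique snps -> mcsm input
snps_for_or = meta_pnca_LF2['Mutationinformation'].unique()   # snps from samples with a pyrazinamide phenotype -> OR calcs
len(snps_for_mcsm) >= len(snps_for_or) # should be True: LF2 rows are a subset of LF1 rows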
||||
|
||||
###########
|
||||
# Step 13 : Output the whole df i.e
|
||||
# file for meta_data which is now formatted with
|
||||
# each row as a unique snp rather than the original version where
|
||||
# each row is a unique id
|
||||
# ***** This is the file you will ADD the AF and OR calculations to *****
|
||||
###########
|
||||
|
||||
# output csv: the entire DF
|
||||
# specify variable name for output file
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname4 = '_metadata'
|
||||
#nrows = len(meta_pnca_LF1)
|
||||
output_filename = output_file_path + gene + my_fname4 + '.csv'
|
||||
print(output_filename) #<<<-check
|
||||
|
||||
# write out file
|
||||
meta_pnca_LF1.to_csv(output_filename)
|
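# Illustrative sketch only: one way the AF and OR calculations mentioned above
# could later be added to this per-snp metadata file. The results file name and
# its columns ('mutation', plus the AF/OR columns) are hypothetical placeholders,
# not outputs produced by this script, so the lines are left commented out.
#af_or_results = pd.read_csv(output_file_path + gene + '_af_or.csv') # hypothetical file
#meta_with_af_or = meta_pnca_LF1.merge(af_or_results, on = 'mutation', how = 'left')
#meta_with_af_or.to_csv(output_file_path + gene + my_fname4 + '_with_af_or.csv', index = False)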
|
@ -1,121 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Jun 18 11:32:28 2019
|
||||
|
||||
@author: tanushree
|
||||
"""
|
||||
############################################
|
||||
#load libraries
|
||||
import pandas as pd
|
||||
import os
|
||||
#############################################
|
||||
|
||||
#!#########################!
|
||||
# REQUIREMENTS:
|
||||
# Data_original/ must exist
|
||||
# containing GWAS data
|
||||
#!#########################!
|
||||
|
||||
print(os.getcwd())
|
||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
||||
os.chdir(homedir + '/git/Data/input/original')
|
||||
print(os.getcwd())
|
||||
#==========
|
||||
#read file
|
||||
#==========
|
||||
my_aa = pd.read_csv('aa_codes.csv') #20, 6
|
||||
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
|
||||
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #another way to do it, since it is the first column
|
||||
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5
|
||||
|
||||
#=========================================================
|
||||
#convert file to dict of dicts
|
||||
#=========================================================
|
||||
#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
|
||||
#with your choice of column name that you have assigned to index as the "primary key".
|
||||
#using 'index' creates a dict of dicts
|
||||
#using 'records' creates a list of dicts
|
||||
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
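# Toy illustration (not from aa_codes.csv) of the two orient options mentioned above:
toy = pd.DataFrame({'one_letter_code': ['A', 'C']}, index = ['ala', 'cys'])
toy.to_dict('index')   # {'ala': {'one_letter_code': 'A'}, 'cys': {'one_letter_code': 'C'}} i.e. dict of dicts
toy.to_dict('records') # [{'one_letter_code': 'A'}, {'one_letter_code': 'C'}] i.e. list of dicts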
||||
|
||||
#================================================
|
||||
#dict of aa with their corresponding properties
|
||||
#This is defined twice
|
||||
#================================================
|
||||
#7 categories: no overlap
|
||||
qualities1 = { ('R', 'H', 'K'): 'Basic'
|
||||
, ('D', 'E'): 'Acidic'
|
||||
, ('N', 'Q'): 'Amidic'
|
||||
, ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
|
||||
, ('S', 'T'): 'Hydroxylic'
|
||||
, ('F', 'W', 'Y'): 'Aromatic'
|
||||
, ('C', 'M'): 'Sulphur'
|
||||
}
|
||||
|
||||
#9 categories: allowing for overlap
|
||||
qualities2 = { ('R', 'H', 'K'): 'Basic'
|
||||
, ('D', 'E'): 'Acidic'
|
||||
, ('S', 'T', 'N', 'Q'): 'Polar'
|
||||
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
|
||||
, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
|
||||
, ('S', 'G', 'A', 'P'): 'Small'
|
||||
, ('F', 'W', 'Y', 'H'): 'Aromatic'
|
||||
, ('V', 'I', 'L', 'M'): 'Aliphatic'
|
||||
, ('C', 'G', 'P'): 'Special'
|
||||
}
|
||||
|
||||
qualities_taylor = { ('R', 'H', 'K'): 'Basic'
|
||||
, ('D', 'E'): 'Acidic'
|
||||
, ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
|
||||
, ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
|
||||
#, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, Y missing from Polar!
|
||||
, ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small'
|
||||
, ('F', 'W', 'Y', 'H'): 'Aromatic'
|
||||
, ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
|
||||
, ('C', 'G', 'P'): 'Special'
|
||||
}
|
||||
|
||||
qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
|
||||
, ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
|
||||
}
|
||||
|
||||
qualities_polarity = { ('D', 'E'): 'acidic'
|
||||
, ('H', 'K', 'R'): 'basic'
|
||||
, ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
|
||||
, ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
|
||||
}
|
||||
|
||||
#==============================================================================
|
||||
#adding amino acid properties to my dict of dicts
|
||||
for k, v in my_aa_dict.items():
|
||||
#print (k,v)
|
||||
v['aa_prop1'] = str() #initialise keys
|
||||
v['aa_prop2'] = list() #initialise keys (allows for overlapping properties)
|
||||
v['aa_taylor'] = list() #initialise keys (allows for overlapping properties)
|
||||
v['aa_prop_water'] = str() #initialise keys
|
||||
v['aa_prop_polarity'] = str() #initialise keys
|
||||
|
||||
for group in qualities1:
|
||||
if v['one_letter_code'] in group:
|
||||
v['aa_prop1']+= qualities1[group] # += for str concat
|
||||
|
||||
for group in qualities2:
|
||||
if v['one_letter_code'] in group:
|
||||
v['aa_prop2'].append(qualities2[group]) # append to list
|
||||
|
||||
for group in qualities_taylor:
|
||||
if v['one_letter_code'] in group:
|
||||
v['aa_taylor'].append(qualities_taylor[group]) # append to list
|
||||
|
||||
for group in qualities_water:
|
||||
if v['one_letter_code'] in group:
|
||||
v['aa_prop_water']+= qualities_water[group] # += for str concat
|
||||
|
||||
for group in qualities_polarity:
|
||||
if v['one_letter_code'] in group:
|
||||
v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat
|
||||
|
||||
#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
|
||||
#==============================================================================
|
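# Example lookup (assuming aa_codes.csv keys cysteine as 'cys'; left commented
# since the exact row labels come from that file):
#my_aa_dict['cys']['aa_prop1']         # 'Sulphur'
#my_aa_dict['cys']['aa_prop2']         # ['Special']
#my_aa_dict['cys']['aa_taylor']        # ['Polar', 'Hydrophobic', 'Small', 'Special']
#my_aa_dict['cys']['aa_prop_water']    # 'hydrophobic'
#my_aa_dict['cys']['aa_prop_polarity'] # 'neutral'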
||||
|
||||
|
|
@ -4,9 +4,6 @@
|
|||
## Structure:
|
||||
#
|
||||
# $DATA_DIR/$DRUG/input
|
||||
# |- original
|
||||
# |- processed
|
||||
# |- structure
|
||||
#
|
||||
# $DATA_DIR/$DRUG/output
|
||||
# |- plots
|
||||
|
@ -15,18 +12,17 @@
|
|||
DATA_DIR=~/git/Data
|
||||
|
||||
if [[ $1 == '' ]]; then
|
||||
echo "Error"
|
||||
echo "usage: mk-drug-dirs.sh <drug name>";
|
||||
exit;
|
||||
else
|
||||
DRUG=$1
|
||||
echo Creating structure for: $DRUG
|
||||
echo Creating directory structure for: $DRUG
|
||||
|
||||
if [ -d $DATA_DIR ]
|
||||
then
|
||||
echo Doing creation in $DATA_DIR
|
||||
mkdir -p $DATA_DIR/$DRUG/input/original
|
||||
mkdir -p $DATA_DIR/$DRUG/input/processed
|
||||
mkdir -p $DATA_DIR/$DRUG/input/structure
|
||||
mkdir -p $DATA_DIR/$DRUG/input
|
||||
mkdir -p $DATA_DIR/$DRUG/output/plots
|
||||
mkdir -p $DATA_DIR/$DRUG/output/structure
|
||||
|
||||
|
|
|
@ -1,25 +1,36 @@
|
|||
#########################################################
|
||||
### A) Installing and loading required packages
|
||||
#########################################################
|
||||
#lib_loc = "/usr/local/lib/R/site-library")
|
||||
|
||||
#if (!require("gplots")) {
|
||||
# install.packages("gplots", dependencies = TRUE)
|
||||
# library(gplots)
|
||||
#}
|
||||
|
||||
if (!require("tidyverse")) {
|
||||
install.packages("tidyverse", dependencies = TRUE)
|
||||
library(tidyverse)
|
||||
}
|
||||
#if (!require("tidyverse")) {
|
||||
# install.packages("tidyverse", dependencies = TRUE)
|
||||
# library(tidyverse)
|
||||
#}
|
||||
|
||||
if (!require("ggplot2")) {
|
||||
install.packages("ggplot2", dependencies = TRUE)
|
||||
library(ggplot2)
|
||||
}
|
||||
|
||||
if (!require("ggridges")) {
|
||||
install.packages("ggridges", dependencies = TRUE)
|
||||
library(ggridges)
|
||||
}
|
||||
|
||||
if (!require("plotly")) {
|
||||
install.packages("plotly", dependencies = TRUE)
|
||||
library(plotly)
|
||||
}
|
||||
|
||||
if (!require("cowplot")) {
|
||||
install.packages("copwplot", dependencies = TRUE)
|
||||
library(ggplot2)
|
||||
library(cowplot)
|
||||
}
|
||||
|
||||
if (!require("ggcorrplot")) {
|
||||
|
@ -43,37 +54,33 @@ if (!require ("GOplot")) {
|
|||
}
|
||||
|
||||
if(!require("VennDiagram")) {
|
||||
|
||||
install.packages("VennDiagram", dependencies = T)
|
||||
library(VennDiagram)
|
||||
}
|
||||
|
||||
if(!require("scales")) {
|
||||
|
||||
install.packages("scales", dependencies = T)
|
||||
library(scales)
|
||||
}
|
||||
|
||||
if(!require("plotrix")) {
|
||||
|
||||
install.packages("plotrix", dependencies = T)
|
||||
library(plotrix)
|
||||
}
|
||||
|
||||
if(!require("stats")) {
|
||||
|
||||
install.packages("stats", dependencies = T)
|
||||
library(stats)
|
||||
}
|
||||
|
||||
if(!require("stats4")) {
|
||||
|
||||
install.packages("stats4", dependencies = T)
|
||||
library(stats4)
|
||||
}
|
||||
|
||||
if(!require("data.table")) {
|
||||
|
||||
install.packages("data.table")
|
||||
library(data.table)
|
||||
}
|
||||
|
||||
if (!require("PerformanceAnalytics")){
|
||||
|
@ -98,18 +105,17 @@ if (!require ("psych")){
|
|||
|
||||
if (!require ("dplyr")){
|
||||
install.packages("dplyr")
|
||||
|
||||
library(dplyr)
|
||||
}
|
||||
|
||||
|
||||
if (!require ("compare")){
|
||||
install.packages("compare")
|
||||
|
||||
library(compare)
|
||||
}
|
||||
|
||||
if (!require ("arsenal")){
|
||||
install.packages("arsenal")
|
||||
|
||||
library(arsenal)
|
||||
}
|
||||
|
||||
|
||||
|
@ -118,7 +124,7 @@ if (!require ("arsenal")){
|
|||
#if(!require(devtools)) install.packages("devtools")
|
||||
#devtools::install_github("kassambara/ggcorrplot")
|
||||
|
||||
library(ggcorrplot)
|
||||
#library(ggcorrplot)
|
||||
|
||||
|
||||
###for PDB files
|
13
scripts/DOCS
Normal file
|
@ -0,0 +1,13 @@
|
|||
dir structure
|
||||
|
||||
~/git/Data
|
||||
aa_codes.csv
|
||||
|
||||
~/git/Data/<drug>/input
|
||||
~/git/Data/<drug>/output
|
||||
|
||||
data_extraction.py
|
||||
expects the dirs above to exist, otherwise it creates them
|
||||
needs the following scripts in the current dir:
|
||||
reference_dict.py
|
||||
tidy_split.py
|
178
scripts/aa_code.py
Normal file
|
@ -0,0 +1,178 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
'''
|
||||
Created on Mon June 14 2021
|
||||
|
||||
@author: tanu
|
||||
'''
|
||||
# FIXME: import dirs.py to get the basic dir paths available
|
||||
#=======================================================================
|
||||
# TASK
|
||||
|
||||
# Input:
|
||||
|
||||
# Output:
|
||||
#=======================================================================
|
||||
#%% load libraries
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
import re
|
||||
#import numpy as np
|
||||
import argparse
|
||||
DEBUG = False
|
||||
#=======================================================================
|
||||
#%% specify input and curr dir
|
||||
homedir = os.path.expanduser('~')
|
||||
|
||||
# set working dir
|
||||
os.getcwd()
|
||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
os.getcwd()
|
||||
|
||||
from reference_dict import oneletter_aa_dict
|
||||
from reference_dict import low_3letter_dict
|
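# The functions below only rely on the following (assumed) shape of the two
# imported dicts; any extra keys in reference_dict.py are ignored:
# oneletter_aa_dict ~ {'A': {'three_letter_code_lower': 'ala', ...}, ...}
# low_3letter_dict  ~ {'ala': {'one_letter_code': 'A', ...}, ...}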
||||
#=======================================================================
|
||||
#%%###########################################################################
|
||||
# FUNCTION: using mcsm mutation format to split mutation info into
|
||||
# 2 separate columns for wt 3 letter lowercase and mut 3 letter lowercase
|
||||
###############################################################################
|
||||
|
||||
def get_aa_3lower(df, wt_colname = 'wild_type', mut_colname = 'mutant_type', col_wt = 'wt_aa_3lower', col_mut = 'mut_aa_3lower'):
|
||||
|
||||
""" Add 3 letter lowercase aa code for wt and mutant residues specified as 1 letter uppercase aa code
|
||||
|
||||
@df: df containing one letter aa code for wt and mutant respectively
|
||||
@type: pandas df
|
||||
|
||||
@wt_colname: column containing one letter wild type aa
|
||||
@type: str
|
||||
|
||||
@mut_colname: column containing one letter mutant type aa
|
||||
@type: str
|
||||
|
||||
@col_wt: column with 3 letter aa code lower for wild type aa
|
||||
@type: str
|
||||
|
||||
@col_mut: column with 3 letter aa code lower for mutant type aa
|
||||
@type: str
|
||||
|
||||
returns df: with 2 added columns. If column names clash, the function column
|
||||
name will override original column
|
||||
@rtype: pandas df
|
||||
"""
|
||||
|
||||
lookup_dict_aa_3lower = dict()
|
||||
|
||||
for k, v in oneletter_aa_dict.items():
|
||||
|
||||
lookup_dict_aa_3lower[k] = v['three_letter_code_lower']
|
||||
#if DEBUG:
|
||||
# print('Key:', k
|
||||
# , 'Value:', v
|
||||
# , '\n=====================================================\n'
|
||||
# , '\nDICT:', lookup_dict_aa_3lower, ':\n')
|
||||
|
||||
df[col_wt] = df[wt_colname].map(lookup_dict_aa_3lower)
|
||||
df[col_mut] = df[mut_colname].map(lookup_dict_aa_3lower)
|
||||
|
||||
return df
|
||||
#%%
|
||||
#==================================
|
||||
# example: get_aa_3lower()
|
||||
#==================================
|
||||
# test_filename = '/home/tanu/git/Data/streptomycin/output/gid_complex_mcsm_norm_SAM.csv'
|
||||
# test_df = pd.read_csv(test_filename , sep = ',')
|
||||
|
||||
# my_wt_colname = 'wild_type'
|
||||
# my_mut_colname = 'mutant_type'
|
||||
# my_col1 = 'wt_aa_3lower'
|
||||
# my_col2 = 'mut_aa_3lower'
|
||||
|
||||
# get_aa_3lower(df = test_df
|
||||
# , wt_colname = my_wt_colname
|
||||
# , mut_colname = my_mut_colname
|
||||
# , col_wt = my_col1
|
||||
# , col_mut = my_col2)
|
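# Minimal self-contained sketch with toy data (the streptomycin file above is
# not needed); expected codes assume the standard mapping in reference_dict.py.
# toy_df = pd.DataFrame({'wild_type': ['A', 'C'], 'mutant_type': ['G', 'W']})
# get_aa_3lower(df = toy_df)
# # adds wt_aa_3lower = ['ala', 'cys'] and mut_aa_3lower = ['gly', 'trp']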
||||
#%%###########################################################################
|
||||
# FUNCTION: using gwas mutation format to split mutation info into
|
||||
# 3 separate columns for wild type, position and mutation
|
||||
###############################################################################
|
||||
def get_aa_1upper(df
|
||||
, gwas_mut_colname = 'mutation'
|
||||
, wt_colname = 'wt_aa_1upper'
|
||||
, pos_colname = 'position'
|
||||
, mut_colname = 'mut_aa_1upper'):
|
||||
|
||||
"""Add 1 letter aa uppercase aa code for wt and mutant residues specified as 3 letter lowercase aa code
|
||||
|
||||
@df: df containing a gwas-style mutation column (<prefix>_p.<wt 3-letter><position><mut 3-letter>)
|
||||
@type: pandas df
|
||||
|
||||
@gwas_mut_colname: column containing the gwas-style mutation string
|
||||
@type: str
|
||||
|
||||
(wt aa, position and mutant aa are extracted from this column with a hard-coded regex; the function takes no regex arguments)
|
||||
|
||||
@wt_colname: name of the output column for the one letter uppercase wild type aa
|
||||
@type: str
|
||||
|
||||
@pos_colname: name of the output column for the aa position (kept as a string)
|
||||
@type: str
|
||||
|
||||
@mut_colname: name of the output column for the one letter uppercase mutant type aa
|
||||
@type: str
|
||||
|
||||
returns df: with 3 added columns. If column names clash, the function column
|
||||
name will override original column
|
||||
@rtype: pandas df
|
||||
"""
|
||||
|
||||
# static regex
|
||||
gwas_regex = r'^.*_p\.([A-Za-z]{3})([0-9]+)([A-Za-z]{3})$'
|
||||
|
||||
gwas_wt = df[gwas_mut_colname].str.extract(gwas_regex)[0]
|
||||
gwas_pos = df[gwas_mut_colname].str.extract(gwas_regex)[1]
|
||||
gwas_mut = df[gwas_mut_colname].str.extract(gwas_regex)[2]
|
||||
|
||||
lookup_dict_aa_1upper = dict()
|
||||
for k, v in low_3letter_dict.items():
|
||||
|
||||
lookup_dict_aa_1upper[k] = v['one_letter_code']
|
||||
#if DEBUG:
|
||||
# print('Key:', k
|
||||
# , 'Value:', v
|
||||
# , '\n======================================================\n'
|
||||
# , '\nDICT:', lookup_dict_aa_1upper, ':\n')
|
||||
|
||||
# wild type
|
||||
df[wt_colname] = gwas_wt.map(lookup_dict_aa_1upper)
|
||||
|
||||
# position
|
||||
df[pos_colname] = gwas_pos
|
||||
|
||||
# mutant type
|
||||
df[mut_colname] = gwas_mut.map(lookup_dict_aa_1upper)
|
||||
|
||||
return df
|
||||
#%%
|
||||
#==================================
|
||||
# example: get_aa_1upper()
|
||||
#==================================
|
||||
# test_filename2 = '/home/tanu/git/Data/streptomycin/output/gid_af_or.csv'
|
||||
# test_df2 = pd.read_csv(test_filename2 , sep = ',')
|
||||
|
||||
# get_aa_1upper(df = test_df2
|
||||
# , gwas_mut_colname = 'mutation'
|
||||
# , wt_colname = 'wild_type'
|
||||
# , pos_colname = 'position'
|
||||
# , mut_colname = 'mutant_type')
|
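# Minimal self-contained sketch with toy data; the 'pnca_p.' prefix is only
# illustrative -- the hard-coded regex just needs '<anything>_p.' followed by a
# 3-letter aa code, a position and another 3-letter aa code (lowercase codes
# assumed, to match the keys of low_3letter_dict).
# toy_df2 = pd.DataFrame({'mutation': ['pnca_p.ala134gly', 'pnca_p.his57asp']})
# get_aa_1upper(df = toy_df2)
# # adds wt_aa_1upper = ['A', 'H'], position = ['134', '57'], mut_aa_1upper = ['G', 'D']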
85
scripts/aa_index/aa_index.R
Normal file
|
@ -0,0 +1,85 @@
|
|||
library(bio3d)
|
||||
library(seqinr)
|
||||
library(bios2mds)
|
||||
library(protr)
|
||||
library(stringr) # provides str_match(), used in the substitution matrix loop below
|
||||
#############################################################
|
||||
#%% TASK
|
||||
# use this to return df for AA index and mutation properties
|
||||
|
||||
#source() # FIXME: script to source not specified; source() without a file argument is an error
|
||||
|
||||
##############################################################
|
||||
my_fasta_file = "~/git/Data/streptomycin/input/gid_complex.fasta"
|
||||
my_mcsmf_snps = "~/git/Data/streptomycin/output/gid_mcsm_formatted_snps.csv"
|
||||
###############################################################
|
||||
#%% fasta as vector
|
||||
gid_aa_seq_v= read.fasta(my_fasta_file
|
||||
, seqtype = "AA"
|
||||
, as.string = F)
|
||||
|
||||
gid_aa_v = as.character(gid_aa_seq_v[[1]]); gid_aa_v
|
||||
|
||||
#%% fasta as string
|
||||
gid_aa_seq_s = read.fasta(my_fasta_file
|
||||
, seqtype = "AA"
|
||||
, as.string = T)
|
||||
|
||||
gid_aa_s = as.character(gid_aa_seq_s[[1]]); gid_aa_s
|
||||
###############################################################
|
||||
#===================
|
||||
# AA indices
|
||||
# https://www.genome.jp/aaindex/AAindex/list_of_indices
|
||||
#===================
|
||||
data(aa.index)
|
||||
|
||||
# default
|
||||
aai_kd = aa2index(gid_aa_v, index = "KYTJ820101") # Hydropathy, KD
|
||||
|
||||
aai_rv = aa2index(gid_aa_v, index = "BIGC670101") # Residue volume, Bigelow, 1967
|
||||
aai_rv2 = aa2index(gid_aa_v, index = "GOLD730102") # Residue volume (Goldsack-Chalifoux, 1973)
|
||||
aai_b = aa2index(gid_aa_v, index = "VENT840101") # Bitterness (Venanzi, 1984)
|
||||
|
||||
par(mfrow = c(1,1))
|
||||
barplot(aai_kd)
|
||||
barplot(aai_rv)
|
||||
barplot(aai_rv2)
|
||||
#barplot(aai_b, col = c("black", "yellow"))
|
||||
|
||||
##########################################################
|
||||
#===================
|
||||
# mutation matrices
|
||||
#===================
|
||||
data(sub.mat)
|
||||
snps = read.csv(my_mcsmf_snps
|
||||
, header = FALSE)
|
||||
snps
|
||||
colnames(snps) <- "mutationinformation"
|
||||
|
||||
# run using all matrices
|
||||
sub_mat_names = as.character(unlist(attributes(sub.mat)))
|
||||
#sub_mat_names = "BLOSUM80"
|
||||
|
||||
for (j in sub_mat_names){
|
||||
print(j)
|
||||
snps[[j]] <- NA
|
||||
for (i in 1:nrow(snps)) {
|
||||
curr_snp = snps$mutationinformation[i]
|
||||
m1 = str_match(curr_snp, "^([A-Z]{1})[0-9]*([A-Z]{1})")
|
||||
aa1 = m1[,2]
|
||||
aa2 = m1[,3]
|
||||
#snps$blosum_80[i]
|
||||
snps[[j]][i] = sub.mat[[j]][aa1,aa2]
|
||||
}
|
||||
|
||||
}
|
||||
snps
|
||||
##########################################################
|
||||
gid_aac = extractAAC(gid_aa_s)
|
||||
gid_dc = extractDC(gid_aa_s)
|
||||
gid_tc = extractTC(gid_aa_s)
|
||||
|
||||
par(mfrow = c(1, 3))
|
||||
barplot(gid_aac)
|
||||
barplot(gid_dc)
|
||||
barplot(gid_tc)
|
||||
###########################################################
|
Some files were not shown because too many files have changed in this diff.